mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 21:39:57 +00:00
Compare commits
9 Commits
v0.1.2-dev
...
v0.1.4-pyt
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2b26775ed1 | ||
|
|
306ada5cb8 | ||
|
|
d3aa8bfbc5 | ||
|
|
04d97347d7 | ||
|
|
22aa8a93c2 | ||
|
|
f485378ea4 | ||
|
|
f923cfe47f | ||
|
|
06cb7b6458 | ||
|
|
bdef634954 |
4
.github/workflows/python.yml
vendored
4
.github/workflows/python.yml
vendored
@@ -31,6 +31,7 @@ jobs:
|
|||||||
- name: Install lancedb
|
- name: Install lancedb
|
||||||
run: |
|
run: |
|
||||||
pip install -e .
|
pip install -e .
|
||||||
|
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
|
||||||
pip install pytest
|
pip install pytest
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: pytest -x -v --durations=30 tests
|
run: pytest -x -v --durations=30 tests
|
||||||
@@ -49,10 +50,11 @@ jobs:
|
|||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v4
|
uses: actions/setup-python@v4
|
||||||
with:
|
with:
|
||||||
python-version: "3.10"
|
python-version: "3.11"
|
||||||
- name: Install lancedb
|
- name: Install lancedb
|
||||||
run: |
|
run: |
|
||||||
pip install -e .
|
pip install -e .
|
||||||
|
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
|
||||||
pip install pytest
|
pip install pytest
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: pytest -x -v --durations=30 tests
|
run: pytest -x -v --durations=30 tests
|
||||||
9
Cargo.lock
generated
9
Cargo.lock
generated
@@ -1052,6 +1052,7 @@ dependencies = [
|
|||||||
"paste",
|
"paste",
|
||||||
"petgraph",
|
"petgraph",
|
||||||
"rand",
|
"rand",
|
||||||
|
"regex",
|
||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -1645,9 +1646,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lance"
|
name = "lance"
|
||||||
version = "0.4.12"
|
version = "0.4.17"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "fc96cf89139af6f439a0e28ccd04ddf81be795b79fda3105b7a8952fadeb778e"
|
checksum = "86dda8185bd1ffae7b910c1f68035af23be9b717c52e9cc4de176cd30b47f772"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"accelerate-src",
|
"accelerate-src",
|
||||||
"arrow",
|
"arrow",
|
||||||
@@ -1684,6 +1685,7 @@ dependencies = [
|
|||||||
"rand",
|
"rand",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"shellexpand",
|
"shellexpand",
|
||||||
|
"snafu",
|
||||||
"sqlparser-lance",
|
"sqlparser-lance",
|
||||||
"tokio",
|
"tokio",
|
||||||
"url",
|
"url",
|
||||||
@@ -3359,8 +3361,11 @@ name = "vectordb"
|
|||||||
version = "0.0.1"
|
version = "0.0.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow-array",
|
"arrow-array",
|
||||||
|
"arrow-data",
|
||||||
"arrow-schema",
|
"arrow-schema",
|
||||||
"lance",
|
"lance",
|
||||||
|
"object_store",
|
||||||
|
"rand",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ nav:
|
|||||||
- Basics: basic.md
|
- Basics: basic.md
|
||||||
- Embeddings: embedding.md
|
- Embeddings: embedding.md
|
||||||
- Indexing: ann_indexes.md
|
- Indexing: ann_indexes.md
|
||||||
|
- Full-text search: fts.md
|
||||||
- Integrations: integrations.md
|
- Integrations: integrations.md
|
||||||
- Python API: python.md
|
- Python API: python.md
|
||||||
|
|
||||||
|
|||||||
51
docs/src/fts.md
Normal file
51
docs/src/fts.md
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
# [EXPERIMENTAL] Full text search
|
||||||
|
|
||||||
|
LanceDB now provides experimental support for full text search.
|
||||||
|
This is currently Python only. We plan to push the integration down to Rust in the future
|
||||||
|
to make this available for JS as well.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
To use full text search, you must install optional dependency tantivy-py:
|
||||||
|
|
||||||
|
# tantivy 0.19.2
|
||||||
|
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
|
||||||
|
|
||||||
|
|
||||||
|
## Quickstart
|
||||||
|
|
||||||
|
Assume:
|
||||||
|
1. `table` is a LanceDB Table
|
||||||
|
2. `text` is the name of the Table column that we want to index
|
||||||
|
|
||||||
|
To create the index:
|
||||||
|
|
||||||
|
```python
|
||||||
|
table.create_fts_index("text")
|
||||||
|
```
|
||||||
|
|
||||||
|
To search:
|
||||||
|
|
||||||
|
```python
|
||||||
|
df = table.search("puppy").limit(10).select(["text"]).to_df()
|
||||||
|
```
|
||||||
|
|
||||||
|
LanceDB automatically looks for an FTS index if the input is str.
|
||||||
|
|
||||||
|
## Multiple text columns
|
||||||
|
|
||||||
|
If you have multiple columns to index, pass them all as a list to `create_fts_index`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
table.create_fts_index(["text1", "text2"])
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that the search API call does not change - you can search over all indexed columns at once.
|
||||||
|
|
||||||
|
## Current limitations
|
||||||
|
|
||||||
|
1. Currently we do not yet support incremental writes.
|
||||||
|
If you add data after fts index creation, it won't be reflected
|
||||||
|
in search results until you do a full reindex.
|
||||||
|
|
||||||
|
2. We currently only support local filesystem paths for the fts index.
|
||||||
@@ -45,5 +45,6 @@ We will be adding completed demo apps built using LanceDB.
|
|||||||
* [`Basic Operations`](basic.md) - basic functionality of LanceDB.
|
* [`Basic Operations`](basic.md) - basic functionality of LanceDB.
|
||||||
* [`Embedding Functions`](embedding.md) - functions for working with embeddings.
|
* [`Embedding Functions`](embedding.md) - functions for working with embeddings.
|
||||||
* [`Indexing`](ann_indexes.md) - create vector indexes to speed up queries.
|
* [`Indexing`](ann_indexes.md) - create vector indexes to speed up queries.
|
||||||
|
* [`Full text search`](fts.md) - [EXPERIMENTAL] full-text search API
|
||||||
* [`Ecosystem Integrations`](integrations.md) - integrating LanceDB with python data tooling ecosystem.
|
* [`Ecosystem Integrations`](integrations.md) - integrating LanceDB with python data tooling ecosystem.
|
||||||
* [`API Reference`](python.md) - detailed documentation for the LanceDB Python SDK.
|
* [`API Reference`](python.md) - detailed documentation for the LanceDB Python SDK.
|
||||||
|
|||||||
@@ -15,15 +15,16 @@
|
|||||||
import {
|
import {
|
||||||
Field,
|
Field,
|
||||||
Float32,
|
Float32,
|
||||||
List,
|
List, type ListBuilder,
|
||||||
makeBuilder,
|
makeBuilder,
|
||||||
RecordBatchFileWriter,
|
RecordBatchFileWriter,
|
||||||
Table,
|
Table, Utf8,
|
||||||
type Vector,
|
type Vector,
|
||||||
vectorFromArray
|
vectorFromArray
|
||||||
} from 'apache-arrow'
|
} from 'apache-arrow'
|
||||||
|
import { type EmbeddingFunction } from './index'
|
||||||
|
|
||||||
export function convertToTable (data: Array<Record<string, unknown>>): Table {
|
export function convertToTable<T> (data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>): Table {
|
||||||
if (data.length === 0) {
|
if (data.length === 0) {
|
||||||
throw new Error('At least one record needs to be provided')
|
throw new Error('At least one record needs to be provided')
|
||||||
}
|
}
|
||||||
@@ -33,11 +34,7 @@ export function convertToTable (data: Array<Record<string, unknown>>): Table {
|
|||||||
|
|
||||||
for (const columnsKey of columns) {
|
for (const columnsKey of columns) {
|
||||||
if (columnsKey === 'vector') {
|
if (columnsKey === 'vector') {
|
||||||
const children = new Field<Float32>('item', new Float32())
|
const listBuilder = newVectorListBuilder()
|
||||||
const list = new List(children)
|
|
||||||
const listBuilder = makeBuilder({
|
|
||||||
type: list
|
|
||||||
})
|
|
||||||
const vectorSize = (data[0].vector as any[]).length
|
const vectorSize = (data[0].vector as any[]).length
|
||||||
for (const datum of data) {
|
for (const datum of data) {
|
||||||
if ((datum[columnsKey] as any[]).length !== vectorSize) {
|
if ((datum[columnsKey] as any[]).length !== vectorSize) {
|
||||||
@@ -52,15 +49,37 @@ export function convertToTable (data: Array<Record<string, unknown>>): Table {
|
|||||||
for (const datum of data) {
|
for (const datum of data) {
|
||||||
values.push(datum[columnsKey])
|
values.push(datum[columnsKey])
|
||||||
}
|
}
|
||||||
records[columnsKey] = vectorFromArray(values)
|
|
||||||
|
if (columnsKey === embeddings?.sourceColumn) {
|
||||||
|
const vectors = embeddings.embed(values as T[])
|
||||||
|
const listBuilder = newVectorListBuilder()
|
||||||
|
vectors.map(v => listBuilder.append(v))
|
||||||
|
records.vector = listBuilder.finish().toVector()
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof values[0] === 'string') {
|
||||||
|
// `vectorFromArray` converts strings into dictionary vectors, forcing it back to a string column
|
||||||
|
records[columnsKey] = vectorFromArray(values, new Utf8())
|
||||||
|
} else {
|
||||||
|
records[columnsKey] = vectorFromArray(values)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return new Table(records)
|
return new Table(records)
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function fromRecordsToBuffer (data: Array<Record<string, unknown>>): Promise<Buffer> {
|
// Creates a new Arrow ListBuilder that stores a Vector column
|
||||||
const table = convertToTable(data)
|
function newVectorListBuilder (): ListBuilder<Float32, any> {
|
||||||
|
const children = new Field<Float32>('item', new Float32())
|
||||||
|
const list = new List(children)
|
||||||
|
return makeBuilder({
|
||||||
|
type: list
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function fromRecordsToBuffer<T> (data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>): Promise<Buffer> {
|
||||||
|
const table = convertToTable(data, embeddings)
|
||||||
const writer = RecordBatchFileWriter.writeAll(table)
|
const writer = RecordBatchFileWriter.writeAll(table)
|
||||||
return Buffer.from(await writer.toUint8Array())
|
return Buffer.from(await writer.toUint8Array())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -21,14 +21,15 @@ import {
|
|||||||
import { fromRecordsToBuffer } from './arrow'
|
import { fromRecordsToBuffer } from './arrow'
|
||||||
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-var-requires
|
// eslint-disable-next-line @typescript-eslint/no-var-requires
|
||||||
const { databaseNew, databaseTableNames, databaseOpenTable, tableCreate, tableSearch, tableAdd } = require('../native.js')
|
const { databaseNew, databaseTableNames, databaseOpenTable, tableCreate, tableSearch, tableAdd, tableCreateVectorIndex } = require('../native.js')
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Connect to a LanceDB instance at the given URI
|
* Connect to a LanceDB instance at the given URI
|
||||||
* @param uri The uri of the database.
|
* @param uri The uri of the database.
|
||||||
*/
|
*/
|
||||||
export async function connect (uri: string): Promise<Connection> {
|
export async function connect (uri: string): Promise<Connection> {
|
||||||
return new Connection(uri)
|
const db = await databaseNew(uri)
|
||||||
|
return new Connection(db, uri)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -38,9 +39,9 @@ export class Connection {
|
|||||||
private readonly _uri: string
|
private readonly _uri: string
|
||||||
private readonly _db: any
|
private readonly _db: any
|
||||||
|
|
||||||
constructor (uri: string) {
|
constructor (db: any, uri: string) {
|
||||||
this._uri = uri
|
this._uri = uri
|
||||||
this._db = databaseNew(uri)
|
this._db = db
|
||||||
}
|
}
|
||||||
|
|
||||||
get uri (): string {
|
get uri (): string {
|
||||||
@@ -55,17 +56,50 @@ export class Connection {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Open a table in the database.
|
* Open a table in the database.
|
||||||
* @param name The name of the table.
|
*
|
||||||
*/
|
* @param name The name of the table.
|
||||||
async openTable (name: string): Promise<Table> {
|
*/
|
||||||
|
async openTable (name: string): Promise<Table>
|
||||||
|
/**
|
||||||
|
* Open a table in the database.
|
||||||
|
*
|
||||||
|
* @param name The name of the table.
|
||||||
|
* @param embeddings An embedding function to use on this Table
|
||||||
|
*/
|
||||||
|
async openTable<T> (name: string, embeddings: EmbeddingFunction<T>): Promise<Table<T>>
|
||||||
|
async openTable<T> (name: string, embeddings?: EmbeddingFunction<T>): Promise<Table<T>> {
|
||||||
const tbl = await databaseOpenTable.call(this._db, name)
|
const tbl = await databaseOpenTable.call(this._db, name)
|
||||||
return new Table(tbl, name)
|
if (embeddings !== undefined) {
|
||||||
|
return new Table(tbl, name, embeddings)
|
||||||
|
} else {
|
||||||
|
return new Table(tbl, name)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async createTable (name: string, data: Array<Record<string, unknown>>): Promise<Table> {
|
/**
|
||||||
await tableCreate.call(this._db, name, await fromRecordsToBuffer(data))
|
* Creates a new Table and initialize it with new data.
|
||||||
return await this.openTable(name)
|
*
|
||||||
|
* @param name The name of the table.
|
||||||
|
* @param data Non-empty Array of Records to be inserted into the Table
|
||||||
|
*/
|
||||||
|
|
||||||
|
async createTable (name: string, data: Array<Record<string, unknown>>): Promise<Table>
|
||||||
|
/**
|
||||||
|
* Creates a new Table and initialize it with new data.
|
||||||
|
*
|
||||||
|
* @param name The name of the table.
|
||||||
|
* @param data Non-empty Array of Records to be inserted into the Table
|
||||||
|
* @param embeddings An embedding function to use on this Table
|
||||||
|
*/
|
||||||
|
async createTable<T> (name: string, data: Array<Record<string, unknown>>, embeddings: EmbeddingFunction<T>): Promise<Table<T>>
|
||||||
|
async createTable<T> (name: string, data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>): Promise<Table<T>> {
|
||||||
|
const tbl = await tableCreate.call(this._db, name, await fromRecordsToBuffer(data, embeddings))
|
||||||
|
if (embeddings !== undefined) {
|
||||||
|
return new Table(tbl, name, embeddings)
|
||||||
|
} else {
|
||||||
|
return new Table(tbl, name)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async createTableArrow (name: string, table: ArrowTable): Promise<Table> {
|
async createTableArrow (name: string, table: ArrowTable): Promise<Table> {
|
||||||
@@ -75,16 +109,22 @@ export class Connection {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
export class Table<T = number[]> {
|
||||||
* A table in a LanceDB database.
|
|
||||||
*/
|
|
||||||
export class Table {
|
|
||||||
private readonly _tbl: any
|
private readonly _tbl: any
|
||||||
private readonly _name: string
|
private readonly _name: string
|
||||||
|
private readonly _embeddings?: EmbeddingFunction<T>
|
||||||
|
|
||||||
constructor (tbl: any, name: string) {
|
constructor (tbl: any, name: string)
|
||||||
|
/**
|
||||||
|
* @param tbl
|
||||||
|
* @param name
|
||||||
|
* @param embeddings An embedding function to use when interacting with this table
|
||||||
|
*/
|
||||||
|
constructor (tbl: any, name: string, embeddings: EmbeddingFunction<T>)
|
||||||
|
constructor (tbl: any, name: string, embeddings?: EmbeddingFunction<T>) {
|
||||||
this._tbl = tbl
|
this._tbl = tbl
|
||||||
this._name = name
|
this._name = name
|
||||||
|
this._embeddings = embeddings
|
||||||
}
|
}
|
||||||
|
|
||||||
get name (): string {
|
get name (): string {
|
||||||
@@ -92,72 +132,173 @@ export class Table {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a search query to find the nearest neighbors of the given query vector.
|
* Creates a search query to find the nearest neighbors of the given search term
|
||||||
* @param queryVector The query vector.
|
* @param query The query search term
|
||||||
*/
|
*/
|
||||||
search (queryVector: number[]): Query {
|
search (query: T): Query {
|
||||||
|
let queryVector: number[]
|
||||||
|
if (this._embeddings !== undefined) {
|
||||||
|
queryVector = this._embeddings.embed([query])[0]
|
||||||
|
} else {
|
||||||
|
queryVector = query as number[]
|
||||||
|
}
|
||||||
return new Query(this._tbl, queryVector)
|
return new Query(this._tbl, queryVector)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Insert records into this Table
|
* Insert records into this Table.
|
||||||
* @param data Records to be inserted into the Table
|
|
||||||
*
|
*
|
||||||
* @param mode Append / Overwrite existing records. Default: Append
|
* @param data Records to be inserted into the Table
|
||||||
* @return The number of rows added to the table
|
* @return The number of rows added to the table
|
||||||
*/
|
*/
|
||||||
async add (data: Array<Record<string, unknown>>): Promise<number> {
|
async add (data: Array<Record<string, unknown>>): Promise<number> {
|
||||||
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Append.toString())
|
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data, this._embeddings), WriteMode.Append.toString())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Insert records into this Table, replacing its contents.
|
||||||
|
*
|
||||||
|
* @param data Records to be inserted into the Table
|
||||||
|
* @return The number of rows added to the table
|
||||||
|
*/
|
||||||
async overwrite (data: Array<Record<string, unknown>>): Promise<number> {
|
async overwrite (data: Array<Record<string, unknown>>): Promise<number> {
|
||||||
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Overwrite.toString())
|
return tableAdd.call(this._tbl, await fromRecordsToBuffer(data, this._embeddings), WriteMode.Overwrite.toString())
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create an ANN index on this Table vector index.
|
||||||
|
*
|
||||||
|
* @param indexParams The parameters of this Index, @see VectorIndexParams.
|
||||||
|
*/
|
||||||
|
async create_index (indexParams: VectorIndexParams): Promise<any> {
|
||||||
|
return tableCreateVectorIndex.call(this._tbl, indexParams)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
interface IvfPQIndexConfig {
|
||||||
|
/**
|
||||||
|
* The column to be indexed
|
||||||
|
*/
|
||||||
|
column?: string
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A unique name for the index
|
||||||
|
*/
|
||||||
|
index_name?: string
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Metric type, L2 or Cosine
|
||||||
|
*/
|
||||||
|
metric_type?: MetricType
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The number of partitions this index
|
||||||
|
*/
|
||||||
|
num_partitions?: number
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The max number of iterations for kmeans training.
|
||||||
|
*/
|
||||||
|
max_iters?: number
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Train as optimized product quantization.
|
||||||
|
*/
|
||||||
|
use_opq?: boolean
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of subvectors to build PQ code
|
||||||
|
*/
|
||||||
|
num_sub_vectors?: number
|
||||||
|
/**
|
||||||
|
* The number of bits to present one PQ centroid.
|
||||||
|
*/
|
||||||
|
num_bits?: number
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Max number of iterations to train OPQ, if `use_opq` is true.
|
||||||
|
*/
|
||||||
|
max_opq_iters?: number
|
||||||
|
|
||||||
|
type: 'ivf_pq'
|
||||||
|
}
|
||||||
|
|
||||||
|
export type VectorIndexParams = IvfPQIndexConfig
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A builder for nearest neighbor queries for LanceDB.
|
* A builder for nearest neighbor queries for LanceDB.
|
||||||
*/
|
*/
|
||||||
export class Query {
|
export class Query {
|
||||||
private readonly _tbl: any
|
private readonly _tbl: any
|
||||||
private readonly _query_vector: number[]
|
private readonly _queryVector: number[]
|
||||||
private _limit: number
|
private _limit: number
|
||||||
private readonly _refine_factor?: number
|
private _refineFactor?: number
|
||||||
private readonly _nprobes: number
|
private _nprobes: number
|
||||||
private readonly _columns?: string[]
|
private readonly _columns?: string[]
|
||||||
private _filter?: string
|
private _filter?: string
|
||||||
private readonly _metric = 'L2'
|
private _metricType?: MetricType
|
||||||
|
|
||||||
constructor (tbl: any, queryVector: number[]) {
|
constructor (tbl: any, queryVector: number[]) {
|
||||||
this._tbl = tbl
|
this._tbl = tbl
|
||||||
this._query_vector = queryVector
|
this._queryVector = queryVector
|
||||||
this._limit = 10
|
this._limit = 10
|
||||||
this._nprobes = 20
|
this._nprobes = 20
|
||||||
this._refine_factor = undefined
|
this._refineFactor = undefined
|
||||||
this._columns = undefined
|
this._columns = undefined
|
||||||
this._filter = undefined
|
this._filter = undefined
|
||||||
|
this._metricType = undefined
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/***
|
||||||
|
* Sets the number of results that will be returned
|
||||||
|
* @param value number of results
|
||||||
|
*/
|
||||||
limit (value: number): Query {
|
limit (value: number): Query {
|
||||||
this._limit = value
|
this._limit = value
|
||||||
return this
|
return this
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Refine the results by reading extra elements and re-ranking them in memory.
|
||||||
|
* @param value refine factor to use in this query.
|
||||||
|
*/
|
||||||
|
refineFactor (value: number): Query {
|
||||||
|
this._refineFactor = value
|
||||||
|
return this
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The number of probes used. A higher number makes search more accurate but also slower.
|
||||||
|
* @param value The number of probes used.
|
||||||
|
*/
|
||||||
|
nprobes (value: number): Query {
|
||||||
|
this._nprobes = value
|
||||||
|
return this
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A filter statement to be applied to this query.
|
||||||
|
* @param value A filter in the same format used by a sql WHERE clause.
|
||||||
|
*/
|
||||||
filter (value: string): Query {
|
filter (value: string): Query {
|
||||||
this._filter = value
|
this._filter = value
|
||||||
return this
|
return this
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Execute the query and return the results as an Array of Objects
|
* The MetricType used for this Query.
|
||||||
*/
|
* @param value The metric to the. @see MetricType for the different options
|
||||||
|
*/
|
||||||
|
metricType (value: MetricType): Query {
|
||||||
|
this._metricType = value
|
||||||
|
return this
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Execute the query and return the results as an Array of Objects
|
||||||
|
*/
|
||||||
async execute<T = Record<string, unknown>> (): Promise<T[]> {
|
async execute<T = Record<string, unknown>> (): Promise<T[]> {
|
||||||
let buffer
|
const buffer = await tableSearch.call(this._tbl, this)
|
||||||
if (this._filter != null) {
|
|
||||||
buffer = await tableSearch.call(this._tbl, this._query_vector, this._limit, this._filter)
|
|
||||||
} else {
|
|
||||||
buffer = await tableSearch.call(this._tbl, this._query_vector, this._limit)
|
|
||||||
}
|
|
||||||
const data = tableFromIPC(buffer)
|
const data = tableFromIPC(buffer)
|
||||||
return data.toArray().map((entry: Record<string, unknown>) => {
|
return data.toArray().map((entry: Record<string, unknown>) => {
|
||||||
const newObject: Record<string, unknown> = {}
|
const newObject: Record<string, unknown> = {}
|
||||||
@@ -177,3 +318,33 @@ export enum WriteMode {
|
|||||||
Overwrite = 'overwrite',
|
Overwrite = 'overwrite',
|
||||||
Append = 'append'
|
Append = 'append'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An embedding function that automatically creates vector representation for a given column.
|
||||||
|
*/
|
||||||
|
export interface EmbeddingFunction<T> {
|
||||||
|
/**
|
||||||
|
* The name of the column that will be used as input for the Embedding Function.
|
||||||
|
*/
|
||||||
|
sourceColumn: string
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a vector representation for the given values.
|
||||||
|
*/
|
||||||
|
embed: (data: T[]) => number[][]
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Distance metrics type.
|
||||||
|
*/
|
||||||
|
export enum MetricType {
|
||||||
|
/**
|
||||||
|
* Euclidean distance
|
||||||
|
*/
|
||||||
|
L2 = 'l2',
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cosine distance
|
||||||
|
*/
|
||||||
|
Cosine = 'cosine'
|
||||||
|
}
|
||||||
|
|||||||
52
node/src/test/io.ts
Normal file
52
node/src/test/io.ts
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
// Copyright 2023 Lance Developers.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
// IO tests
|
||||||
|
|
||||||
|
import { describe } from 'mocha'
|
||||||
|
import { assert } from 'chai'
|
||||||
|
|
||||||
|
import * as lancedb from '../index'
|
||||||
|
|
||||||
|
describe('LanceDB S3 client', function () {
|
||||||
|
if (process.env.TEST_S3_BASE_URL != null) {
|
||||||
|
const baseUri = process.env.TEST_S3_BASE_URL
|
||||||
|
it('should have a valid url', async function () {
|
||||||
|
const uri = `${baseUri}/valid_url`
|
||||||
|
const table = await createTestDB(uri, 2, 20)
|
||||||
|
const con = await lancedb.connect(uri)
|
||||||
|
assert.equal(con.uri, uri)
|
||||||
|
|
||||||
|
const results = await table.search([0.1, 0.3]).limit(5).execute()
|
||||||
|
assert.equal(results.length, 5)
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
describe.skip('Skip S3 test', function () {})
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
async function createTestDB (uri: string, numDimensions: number = 2, numRows: number = 2): Promise<lancedb.Table> {
|
||||||
|
const con = await lancedb.connect(uri)
|
||||||
|
|
||||||
|
const data = []
|
||||||
|
for (let i = 0; i < numRows; i++) {
|
||||||
|
const vector = []
|
||||||
|
for (let j = 0; j < numDimensions; j++) {
|
||||||
|
vector.push(i + (j * 0.1))
|
||||||
|
}
|
||||||
|
data.push({ id: i + 1, name: `name_${i}`, price: i + 10, is_active: (i % 2 === 0), vector })
|
||||||
|
}
|
||||||
|
|
||||||
|
return await con.createTable('vectors', data)
|
||||||
|
}
|
||||||
@@ -17,6 +17,7 @@ import { assert } from 'chai'
|
|||||||
import { track } from 'temp'
|
import { track } from 'temp'
|
||||||
|
|
||||||
import * as lancedb from '../index'
|
import * as lancedb from '../index'
|
||||||
|
import { type EmbeddingFunction, MetricType, Query } from '../index'
|
||||||
|
|
||||||
describe('LanceDB client', function () {
|
describe('LanceDB client', function () {
|
||||||
describe('when creating a connection to lancedb', function () {
|
describe('when creating a connection to lancedb', function () {
|
||||||
@@ -67,7 +68,7 @@ describe('LanceDB client', function () {
|
|||||||
const uri = await createTestDB()
|
const uri = await createTestDB()
|
||||||
const con = await lancedb.connect(uri)
|
const con = await lancedb.connect(uri)
|
||||||
const table = await con.openTable('vectors')
|
const table = await con.openTable('vectors')
|
||||||
const results = await table.search([0.1, 0.3]).filter('id == 2').execute()
|
const results = await table.search([0.1, 0.1]).filter('id == 2').execute()
|
||||||
assert.equal(results.length, 1)
|
assert.equal(results.length, 1)
|
||||||
assert.equal(results[0].id, 2)
|
assert.equal(results[0].id, 2)
|
||||||
})
|
})
|
||||||
@@ -96,8 +97,8 @@ describe('LanceDB client', function () {
|
|||||||
const con = await lancedb.connect(dir)
|
const con = await lancedb.connect(dir)
|
||||||
|
|
||||||
const data = [
|
const data = [
|
||||||
{ id: 1, vector: [0.1, 0.2], price: 10 },
|
{ id: 1, vector: [0.1, 0.2], price: 10, name: 'a' },
|
||||||
{ id: 2, vector: [1.1, 1.2], price: 50 }
|
{ id: 2, vector: [1.1, 1.2], price: 50, name: 'b' }
|
||||||
]
|
]
|
||||||
|
|
||||||
const table = await con.createTable('vectors', data)
|
const table = await con.createTable('vectors', data)
|
||||||
@@ -105,8 +106,8 @@ describe('LanceDB client', function () {
|
|||||||
assert.equal(results.length, 2)
|
assert.equal(results.length, 2)
|
||||||
|
|
||||||
const dataAdd = [
|
const dataAdd = [
|
||||||
{ id: 3, vector: [2.1, 2.2], price: 10 },
|
{ id: 3, vector: [2.1, 2.2], price: 10, name: 'c' },
|
||||||
{ id: 4, vector: [3.1, 3.2], price: 50 }
|
{ id: 4, vector: [3.1, 3.2], price: 50, name: 'd' }
|
||||||
]
|
]
|
||||||
await table.add(dataAdd)
|
await table.add(dataAdd)
|
||||||
const resultsAdd = await table.search([0.1, 0.3]).execute()
|
const resultsAdd = await table.search([0.1, 0.3]).execute()
|
||||||
@@ -130,16 +131,76 @@ describe('LanceDB client', function () {
|
|||||||
assert.equal(resultsAdd.length, 2)
|
assert.equal(resultsAdd.length, 2)
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
describe('when creating a vector index', function () {
|
||||||
|
it('overwrite all records in a table', async function () {
|
||||||
|
const uri = await createTestDB(32, 300)
|
||||||
|
const con = await lancedb.connect(uri)
|
||||||
|
const table = await con.openTable('vectors')
|
||||||
|
await table.create_index({ type: 'ivf_pq', column: 'vector', num_partitions: 2, max_iters: 2 })
|
||||||
|
}).timeout(10_000) // Timeout is high partially because GH macos runner is pretty slow
|
||||||
|
})
|
||||||
|
|
||||||
|
describe('when using a custom embedding function', function () {
|
||||||
|
class TextEmbedding implements EmbeddingFunction<string> {
|
||||||
|
sourceColumn: string
|
||||||
|
|
||||||
|
constructor (targetColumn: string) {
|
||||||
|
this.sourceColumn = targetColumn
|
||||||
|
}
|
||||||
|
|
||||||
|
_embedding_map = new Map<string, number[]>([
|
||||||
|
['foo', [2.1, 2.2]],
|
||||||
|
['bar', [3.1, 3.2]]
|
||||||
|
])
|
||||||
|
|
||||||
|
embed (data: string[]): number[][] {
|
||||||
|
return data.map(datum => this._embedding_map.get(datum) ?? [0.0, 0.0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
it('should encode the original data into embeddings', async function () {
|
||||||
|
const dir = await track().mkdir('lancejs')
|
||||||
|
const con = await lancedb.connect(dir)
|
||||||
|
const embeddings = new TextEmbedding('name')
|
||||||
|
|
||||||
|
const data = [
|
||||||
|
{ price: 10, name: 'foo' },
|
||||||
|
{ price: 50, name: 'bar' }
|
||||||
|
]
|
||||||
|
const table = await con.createTable('vectors', data, embeddings)
|
||||||
|
const results = await table.search('foo').execute()
|
||||||
|
assert.equal(results.length, 2)
|
||||||
|
})
|
||||||
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
async function createTestDB (): Promise<string> {
|
describe('Query object', function () {
|
||||||
|
it('sets custom parameters', async function () {
|
||||||
|
const query = new Query(undefined, [0.1, 0.3])
|
||||||
|
.limit(1)
|
||||||
|
.metricType(MetricType.Cosine)
|
||||||
|
.refineFactor(100)
|
||||||
|
.nprobes(20) as Record<string, any>
|
||||||
|
assert.equal(query._limit, 1)
|
||||||
|
assert.equal(query._metricType, MetricType.Cosine)
|
||||||
|
assert.equal(query._refineFactor, 100)
|
||||||
|
assert.equal(query._nprobes, 20)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
async function createTestDB (numDimensions: number = 2, numRows: number = 2): Promise<string> {
|
||||||
const dir = await track().mkdir('lancejs')
|
const dir = await track().mkdir('lancejs')
|
||||||
const con = await lancedb.connect(dir)
|
const con = await lancedb.connect(dir)
|
||||||
|
|
||||||
const data = [
|
const data = []
|
||||||
{ id: 1, vector: [0.1, 0.2], name: 'foo', price: 10, is_active: true },
|
for (let i = 0; i < numRows; i++) {
|
||||||
{ id: 2, vector: [1.1, 1.2], name: 'bar', price: 50, is_active: false }
|
const vector = []
|
||||||
]
|
for (let j = 0; j < numDimensions; j++) {
|
||||||
|
vector.push(i + (j * 0.1))
|
||||||
|
}
|
||||||
|
data.push({ id: i + 1, name: `name_${i}`, price: i + 10, is_active: (i % 2 === 0), vector })
|
||||||
|
}
|
||||||
|
|
||||||
await con.createTable('vectors', data)
|
await con.createTable('vectors', data)
|
||||||
return dir
|
return dir
|
||||||
|
|||||||
128
python/lancedb/fts.py
Normal file
128
python/lancedb/fts.py
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
# Copyright 2023 LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Full text search index using tantivy-py"""
|
||||||
|
import os
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
import pyarrow as pa
|
||||||
|
|
||||||
|
try:
|
||||||
|
import tantivy
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"Please install tantivy-py `pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985` to use the full text search feature."
|
||||||
|
)
|
||||||
|
|
||||||
|
from .table import LanceTable
|
||||||
|
|
||||||
|
|
||||||
|
def create_index(index_path: str, text_fields: List[str]) -> tantivy.Index:
|
||||||
|
"""
|
||||||
|
Create a new Index (not populated)
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
index_path : str
|
||||||
|
Path to the index directory
|
||||||
|
text_fields : List[str]
|
||||||
|
List of text fields to index
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
index : tantivy.Index
|
||||||
|
The index object (not yet populated)
|
||||||
|
"""
|
||||||
|
# Declaring our schema.
|
||||||
|
schema_builder = tantivy.SchemaBuilder()
|
||||||
|
# special field that we'll populate with row_id
|
||||||
|
schema_builder.add_integer_field("doc_id", stored=True)
|
||||||
|
# data fields
|
||||||
|
for name in text_fields:
|
||||||
|
schema_builder.add_text_field(name, stored=True)
|
||||||
|
schema = schema_builder.build()
|
||||||
|
os.makedirs(index_path, exist_ok=True)
|
||||||
|
index = tantivy.Index(schema, path=index_path)
|
||||||
|
return index
|
||||||
|
|
||||||
|
|
||||||
|
def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -> int:
|
||||||
|
"""
|
||||||
|
Populate an index with data from a LanceTable
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
index : tantivy.Index
|
||||||
|
The index object
|
||||||
|
table : LanceTable
|
||||||
|
The table to index
|
||||||
|
fields : List[str]
|
||||||
|
List of fields to index
|
||||||
|
"""
|
||||||
|
# first check the fields exist and are string or large string type
|
||||||
|
for name in fields:
|
||||||
|
f = table.schema.field(name) # raises KeyError if not found
|
||||||
|
if not pa.types.is_string(f.type) and not pa.types.is_large_string(f.type):
|
||||||
|
raise TypeError(f"Field {name} is not a string type")
|
||||||
|
|
||||||
|
# create a tantivy writer
|
||||||
|
writer = index.writer()
|
||||||
|
# write data into index
|
||||||
|
dataset = table.to_lance()
|
||||||
|
row_id = 0
|
||||||
|
for b in dataset.to_batches(columns=fields):
|
||||||
|
for i in range(b.num_rows):
|
||||||
|
doc = tantivy.Document()
|
||||||
|
doc.add_integer("doc_id", row_id)
|
||||||
|
for name in fields:
|
||||||
|
doc.add_text(name, b[name][i].as_py())
|
||||||
|
writer.add_document(doc)
|
||||||
|
row_id += 1
|
||||||
|
# commit changes
|
||||||
|
writer.commit()
|
||||||
|
return row_id
|
||||||
|
|
||||||
|
|
||||||
|
def search_index(
|
||||||
|
index: tantivy.Index, query: str, limit: int = 10
|
||||||
|
) -> Tuple[Tuple[int], Tuple[float]]:
|
||||||
|
"""
|
||||||
|
Search an index for a query
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
index : tantivy.Index
|
||||||
|
The index object
|
||||||
|
query : str
|
||||||
|
The query string
|
||||||
|
limit : int
|
||||||
|
The maximum number of results to return
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
ids_and_score: list[tuple[int], tuple[float]]
|
||||||
|
A tuple of two tuples, the first containing the document ids
|
||||||
|
and the second containing the scores
|
||||||
|
"""
|
||||||
|
searcher = index.searcher()
|
||||||
|
query = index.parse_query(query)
|
||||||
|
# get top results
|
||||||
|
results = searcher.search(query, limit)
|
||||||
|
return tuple(
|
||||||
|
zip(
|
||||||
|
*[
|
||||||
|
(searcher.doc(doc_address)["doc_id"][0], score)
|
||||||
|
for score, doc_address in results.hits
|
||||||
|
]
|
||||||
|
)
|
||||||
|
)
|
||||||
@@ -14,6 +14,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import pyarrow as pa
|
||||||
|
|
||||||
from .common import VECTOR_COLUMN_NAME
|
from .common import VECTOR_COLUMN_NAME
|
||||||
|
|
||||||
@@ -131,7 +132,6 @@ class LanceQueryBuilder:
|
|||||||
vector and the returned vector.
|
vector and the returned vector.
|
||||||
"""
|
"""
|
||||||
ds = self._table.to_lance()
|
ds = self._table.to_lance()
|
||||||
# TODO indexed search
|
|
||||||
tbl = ds.to_table(
|
tbl = ds.to_table(
|
||||||
columns=self._columns,
|
columns=self._columns,
|
||||||
filter=self._where,
|
filter=self._where,
|
||||||
@@ -145,3 +145,26 @@ class LanceQueryBuilder:
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
return tbl.to_pandas()
|
return tbl.to_pandas()
|
||||||
|
|
||||||
|
|
||||||
|
class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||||
|
def to_df(self) -> pd.DataFrame:
|
||||||
|
try:
|
||||||
|
import tantivy
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"Please install tantivy-py `pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985` to use the full text search feature."
|
||||||
|
)
|
||||||
|
|
||||||
|
from .fts import search_index
|
||||||
|
|
||||||
|
# get the index path
|
||||||
|
index_path = self._table._get_fts_index_path()
|
||||||
|
# open the index
|
||||||
|
index = tantivy.Index.open(index_path)
|
||||||
|
# get the scores and doc ids
|
||||||
|
row_ids, scores = search_index(index, self._query, self._limit)
|
||||||
|
scores = pa.array(scores)
|
||||||
|
output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
|
||||||
|
output_tbl = output_tbl.append_column("score", scores)
|
||||||
|
return output_tbl.to_pandas()
|
||||||
|
|||||||
@@ -14,7 +14,9 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import shutil
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
import lance
|
import lance
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -24,7 +26,8 @@ from lance import LanceDataset
|
|||||||
from lance.vector import vec_to_table
|
from lance.vector import vec_to_table
|
||||||
|
|
||||||
from .common import DATA, VEC, VECTOR_COLUMN_NAME
|
from .common import DATA, VEC, VECTOR_COLUMN_NAME
|
||||||
from .query import LanceQueryBuilder
|
from .query import LanceFtsQueryBuilder, LanceQueryBuilder
|
||||||
|
from .util import get_uri_scheme
|
||||||
|
|
||||||
|
|
||||||
def _sanitize_data(data, schema):
|
def _sanitize_data(data, schema):
|
||||||
@@ -130,6 +133,27 @@ class LanceTable:
|
|||||||
)
|
)
|
||||||
self._reset_dataset()
|
self._reset_dataset()
|
||||||
|
|
||||||
|
def create_fts_index(self, field_names: Union[str, List[str]]):
|
||||||
|
"""Create a full-text search index on the table.
|
||||||
|
|
||||||
|
Warning - this API is highly experimental and is highly likely to change
|
||||||
|
in the future.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
field_names: str or list of str
|
||||||
|
The name(s) of the field to index.
|
||||||
|
"""
|
||||||
|
from .fts import create_index, populate_index
|
||||||
|
|
||||||
|
if isinstance(field_names, str):
|
||||||
|
field_names = [field_names]
|
||||||
|
index = create_index(self._get_fts_index_path(), field_names)
|
||||||
|
populate_index(index, self, field_names)
|
||||||
|
|
||||||
|
def _get_fts_index_path(self):
|
||||||
|
return os.path.join(self._dataset_uri, "_indices", "tantivy")
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def _dataset(self) -> LanceDataset:
|
def _dataset(self) -> LanceDataset:
|
||||||
return lance.dataset(self._dataset_uri, version=self._version)
|
return lance.dataset(self._dataset_uri, version=self._version)
|
||||||
@@ -158,7 +182,7 @@ class LanceTable:
|
|||||||
self._reset_dataset()
|
self._reset_dataset()
|
||||||
return len(self)
|
return len(self)
|
||||||
|
|
||||||
def search(self, query: VEC) -> LanceQueryBuilder:
|
def search(self, query: Union[VEC, str]) -> LanceQueryBuilder:
|
||||||
"""Create a search query to find the nearest neighbors
|
"""Create a search query to find the nearest neighbors
|
||||||
of the given query vector.
|
of the given query vector.
|
||||||
|
|
||||||
@@ -174,6 +198,10 @@ class LanceTable:
|
|||||||
and also the "score" column which is the distance between the query
|
and also the "score" column which is the distance between the query
|
||||||
vector and the returned vector.
|
vector and the returned vector.
|
||||||
"""
|
"""
|
||||||
|
if isinstance(query, str):
|
||||||
|
# fts
|
||||||
|
return LanceFtsQueryBuilder(self, query)
|
||||||
|
|
||||||
if isinstance(query, list):
|
if isinstance(query, list):
|
||||||
query = np.array(query)
|
query = np.array(query)
|
||||||
if isinstance(query, np.ndarray):
|
if isinstance(query, np.ndarray):
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "lancedb"
|
name = "lancedb"
|
||||||
version = "0.1.2"
|
version = "0.1.4"
|
||||||
dependencies = ["pylance>=0.4.6", "ratelimiter", "retry", "tqdm"]
|
dependencies = ["pylance>=0.4.17", "ratelimiter", "retry", "tqdm"]
|
||||||
description = "lancedb"
|
description = "lancedb"
|
||||||
authors = [
|
authors = [
|
||||||
{ name = "LanceDB Devs", email = "dev@lancedb.com" },
|
{ name = "LanceDB Devs", email = "dev@lancedb.com" },
|
||||||
|
|||||||
@@ -14,7 +14,6 @@ import sys
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
|
|
||||||
from lancedb.embeddings import with_embeddings
|
from lancedb.embeddings import with_embeddings
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
84
python/tests/test_fts.py
Normal file
84
python/tests/test_fts.py
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
# Copyright 2023 LanceDB Developers
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
|
||||||
|
import lancedb.fts
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
import tantivy
|
||||||
|
|
||||||
|
import lancedb as ldb
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def table(tmp_path) -> ldb.table.LanceTable:
|
||||||
|
db = ldb.connect(tmp_path)
|
||||||
|
vectors = [np.random.randn(128) for _ in range(100)]
|
||||||
|
|
||||||
|
nouns = ("puppy", "car", "rabbit", "girl", "monkey")
|
||||||
|
verbs = ("runs", "hits", "jumps", "drives", "barfs")
|
||||||
|
adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
|
||||||
|
adj = ("adorable", "clueless", "dirty", "odd", "stupid")
|
||||||
|
text = [
|
||||||
|
" ".join(
|
||||||
|
[
|
||||||
|
nouns[random.randrange(0, 5)],
|
||||||
|
verbs[random.randrange(0, 5)],
|
||||||
|
adv[random.randrange(0, 5)],
|
||||||
|
adj[random.randrange(0, 5)],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
for _ in range(100)
|
||||||
|
]
|
||||||
|
table = db.create_table(
|
||||||
|
"test", data=pd.DataFrame({"vector": vectors, "text": text, "text2": text})
|
||||||
|
)
|
||||||
|
return table
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_index(tmp_path):
|
||||||
|
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||||
|
assert isinstance(index, tantivy.Index)
|
||||||
|
assert os.path.exists(str(tmp_path / "index"))
|
||||||
|
|
||||||
|
|
||||||
|
def test_populate_index(tmp_path, table):
|
||||||
|
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||||
|
assert ldb.fts.populate_index(index, table, ["text"]) == len(table)
|
||||||
|
|
||||||
|
|
||||||
|
def test_search_index(tmp_path, table):
|
||||||
|
index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
|
||||||
|
ldb.fts.populate_index(index, table, ["text"])
|
||||||
|
index.reload()
|
||||||
|
results = ldb.fts.search_index(index, query="puppy", limit=10)
|
||||||
|
assert len(results) == 2
|
||||||
|
assert len(results[0]) == 10 # row_ids
|
||||||
|
assert len(results[1]) == 10 # scores
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_index_from_table(tmp_path, table):
|
||||||
|
table.create_fts_index("text")
|
||||||
|
df = table.search("puppy").limit(10).select(["text"]).to_df()
|
||||||
|
assert len(df) == 10
|
||||||
|
assert "text" in df.columns
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_index_multiple_columns(tmp_path, table):
|
||||||
|
table.create_fts_index(["text", "text2"])
|
||||||
|
df = table.search("puppy").limit(10).to_df()
|
||||||
|
assert len(df) == 10
|
||||||
|
assert "text" in df.columns
|
||||||
|
assert "text2" in df.columns
|
||||||
@@ -17,7 +17,6 @@ import pandas as pd
|
|||||||
import pandas.testing as tm
|
import pandas.testing as tm
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from lancedb.query import LanceQueryBuilder
|
from lancedb.query import LanceQueryBuilder
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ from pathlib import Path
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from lancedb.table import LanceTable
|
from lancedb.table import LanceTable
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ arrow-ipc = "37.0"
|
|||||||
arrow-schema = "37.0"
|
arrow-schema = "37.0"
|
||||||
once_cell = "1"
|
once_cell = "1"
|
||||||
futures = "0.3"
|
futures = "0.3"
|
||||||
lance = "0.4.3"
|
lance = "0.4.17"
|
||||||
vectordb = { path = "../../vectordb" }
|
vectordb = { path = "../../vectordb" }
|
||||||
tokio = { version = "1.23", features = ["rt-multi-thread"] }
|
tokio = { version = "1.23", features = ["rt-multi-thread"] }
|
||||||
neon = {version = "0.10.1", default-features = false, features = ["channel-api", "napi-6", "promise-api", "task-api"] }
|
neon = {version = "0.10.1", default-features = false, features = ["channel-api", "napi-6", "promise-api", "task-api"] }
|
||||||
|
|||||||
15
rust/ffi/node/src/index.rs
Normal file
15
rust/ffi/node/src/index.rs
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
// Copyright 2023 Lance Developers.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
pub mod vector;
|
||||||
128
rust/ffi/node/src/index/vector.rs
Normal file
128
rust/ffi/node/src/index/vector.rs
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
// Copyright 2023 Lance Developers.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
use std::convert::TryFrom;
|
||||||
|
|
||||||
|
use lance::index::vector::ivf::IvfBuildParams;
|
||||||
|
use lance::index::vector::pq::PQBuildParams;
|
||||||
|
use lance::index::vector::MetricType;
|
||||||
|
use neon::context::FunctionContext;
|
||||||
|
use neon::prelude::*;
|
||||||
|
|
||||||
|
use vectordb::index::vector::{IvfPQIndexBuilder, VectorIndexBuilder};
|
||||||
|
|
||||||
|
use crate::{runtime, JsTable};
|
||||||
|
|
||||||
|
pub(crate) fn table_create_vector_index(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||||
|
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
|
||||||
|
let index_params = cx.argument::<JsObject>(0)?;
|
||||||
|
let index_params_builder = get_index_params_builder(&mut cx, index_params).unwrap();
|
||||||
|
|
||||||
|
let rt = runtime(&mut cx)?;
|
||||||
|
let channel = cx.channel();
|
||||||
|
|
||||||
|
let (deferred, promise) = cx.promise();
|
||||||
|
let table = js_table.table.clone();
|
||||||
|
|
||||||
|
rt.block_on(async move {
|
||||||
|
let add_result = table
|
||||||
|
.lock()
|
||||||
|
.unwrap()
|
||||||
|
.create_index(&index_params_builder)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
deferred.settle_with(&channel, move |mut cx| {
|
||||||
|
add_result
|
||||||
|
.map(|_| cx.undefined())
|
||||||
|
.or_else(|err| cx.throw_error(err.to_string()))
|
||||||
|
});
|
||||||
|
});
|
||||||
|
Ok(promise)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_index_params_builder(
|
||||||
|
cx: &mut FunctionContext,
|
||||||
|
obj: Handle<JsObject>,
|
||||||
|
) -> Result<impl VectorIndexBuilder, String> {
|
||||||
|
let idx_type = obj
|
||||||
|
.get::<JsString, _, _>(cx, "type")
|
||||||
|
.map_err(|t| t.to_string())?
|
||||||
|
.value(cx);
|
||||||
|
|
||||||
|
match idx_type.as_str() {
|
||||||
|
"ivf_pq" => {
|
||||||
|
let mut index_builder: IvfPQIndexBuilder = IvfPQIndexBuilder::new();
|
||||||
|
let mut pq_params = PQBuildParams::default();
|
||||||
|
|
||||||
|
obj.get_opt::<JsString, _, _>(cx, "column")
|
||||||
|
.map_err(|t| t.to_string())?
|
||||||
|
.map(|s| index_builder.column(s.value(cx)));
|
||||||
|
|
||||||
|
obj.get_opt::<JsString, _, _>(cx, "index_name")
|
||||||
|
.map_err(|t| t.to_string())?
|
||||||
|
.map(|s| index_builder.index_name(s.value(cx)));
|
||||||
|
|
||||||
|
obj.get_opt::<JsString, _, _>(cx, "metric_type")
|
||||||
|
.map_err(|t| t.to_string())?
|
||||||
|
.map(|s| MetricType::try_from(s.value(cx).as_str()))
|
||||||
|
.map(|mt| {
|
||||||
|
let metric_type = mt.unwrap();
|
||||||
|
index_builder.metric_type(metric_type);
|
||||||
|
pq_params.metric_type = metric_type;
|
||||||
|
});
|
||||||
|
|
||||||
|
let num_partitions = obj
|
||||||
|
.get_opt::<JsNumber, _, _>(cx, "num_partitions")
|
||||||
|
.map_err(|t| t.to_string())?
|
||||||
|
.map(|s| s.value(cx) as usize);
|
||||||
|
|
||||||
|
let max_iters = obj
|
||||||
|
.get_opt::<JsNumber, _, _>(cx, "max_iters")
|
||||||
|
.map_err(|t| t.to_string())?
|
||||||
|
.map(|s| s.value(cx) as usize);
|
||||||
|
|
||||||
|
num_partitions.map(|np| {
|
||||||
|
let max_iters = max_iters.unwrap_or(50);
|
||||||
|
let ivf_params = IvfBuildParams {
|
||||||
|
num_partitions: np,
|
||||||
|
max_iters,
|
||||||
|
};
|
||||||
|
index_builder.ivf_params(ivf_params)
|
||||||
|
});
|
||||||
|
|
||||||
|
obj.get_opt::<JsBoolean, _, _>(cx, "use_opq")
|
||||||
|
.map_err(|t| t.to_string())?
|
||||||
|
.map(|s| pq_params.use_opq = s.value(cx));
|
||||||
|
|
||||||
|
obj.get_opt::<JsNumber, _, _>(cx, "num_sub_vectors")
|
||||||
|
.map_err(|t| t.to_string())?
|
||||||
|
.map(|s| pq_params.num_sub_vectors = s.value(cx) as usize);
|
||||||
|
|
||||||
|
obj.get_opt::<JsNumber, _, _>(cx, "num_bits")
|
||||||
|
.map_err(|t| t.to_string())?
|
||||||
|
.map(|s| pq_params.num_bits = s.value(cx) as usize);
|
||||||
|
|
||||||
|
obj.get_opt::<JsNumber, _, _>(cx, "max_iters")
|
||||||
|
.map_err(|t| t.to_string())?
|
||||||
|
.map(|s| pq_params.max_iters = s.value(cx) as usize);
|
||||||
|
|
||||||
|
obj.get_opt::<JsNumber, _, _>(cx, "max_opq_iters")
|
||||||
|
.map_err(|t| t.to_string())?
|
||||||
|
.map(|s| pq_params.max_opq_iters = s.value(cx) as usize);
|
||||||
|
|
||||||
|
Ok(index_builder)
|
||||||
|
}
|
||||||
|
t => Err(format!("{} is not a valid index type", t).to_string()),
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -13,6 +13,7 @@
|
|||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
use std::convert::TryFrom;
|
||||||
use std::ops::Deref;
|
use std::ops::Deref;
|
||||||
use std::sync::{Arc, Mutex};
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
@@ -21,6 +22,7 @@ use arrow_ipc::writer::FileWriter;
|
|||||||
use futures::{TryFutureExt, TryStreamExt};
|
use futures::{TryFutureExt, TryStreamExt};
|
||||||
use lance::arrow::RecordBatchBuffer;
|
use lance::arrow::RecordBatchBuffer;
|
||||||
use lance::dataset::WriteMode;
|
use lance::dataset::WriteMode;
|
||||||
|
use lance::index::vector::MetricType;
|
||||||
use neon::prelude::*;
|
use neon::prelude::*;
|
||||||
use neon::types::buffer::TypedArray;
|
use neon::types::buffer::TypedArray;
|
||||||
use once_cell::sync::OnceCell;
|
use once_cell::sync::OnceCell;
|
||||||
@@ -34,17 +36,18 @@ use crate::arrow::arrow_buffer_to_record_batch;
|
|||||||
|
|
||||||
mod arrow;
|
mod arrow;
|
||||||
mod convert;
|
mod convert;
|
||||||
|
mod index;
|
||||||
|
|
||||||
struct JsDatabase {
|
struct JsDatabase {
|
||||||
database: Arc<Database>,
|
database: Arc<Database>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Finalize for JsDatabase {}
|
||||||
|
|
||||||
struct JsTable {
|
struct JsTable {
|
||||||
table: Arc<Mutex<Table>>,
|
table: Arc<Mutex<Table>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Finalize for JsDatabase {}
|
|
||||||
|
|
||||||
impl Finalize for JsTable {}
|
impl Finalize for JsTable {}
|
||||||
|
|
||||||
fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
|
fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
|
||||||
@@ -53,23 +56,46 @@ fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
|
|||||||
RUNTIME.get_or_try_init(|| Runtime::new().or_else(|err| cx.throw_error(err.to_string())))
|
RUNTIME.get_or_try_init(|| Runtime::new().or_else(|err| cx.throw_error(err.to_string())))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn database_new(mut cx: FunctionContext) -> JsResult<JsBox<JsDatabase>> {
|
fn database_new(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||||
let path = cx.argument::<JsString>(0)?.value(&mut cx);
|
let path = cx.argument::<JsString>(0)?.value(&mut cx);
|
||||||
let db = JsDatabase {
|
|
||||||
database: Arc::new(Database::connect(path).or_else(|err| cx.throw_error(err.to_string()))?),
|
let rt = runtime(&mut cx)?;
|
||||||
};
|
let channel = cx.channel();
|
||||||
Ok(cx.boxed(db))
|
let (deferred, promise) = cx.promise();
|
||||||
|
|
||||||
|
rt.spawn(async move {
|
||||||
|
let database = Database::connect(&path).await;
|
||||||
|
|
||||||
|
deferred.settle_with(&channel, move |mut cx| {
|
||||||
|
let db = JsDatabase {
|
||||||
|
database: Arc::new(database.or_else(|err| cx.throw_error(err.to_string()))?),
|
||||||
|
};
|
||||||
|
Ok(cx.boxed(db))
|
||||||
|
});
|
||||||
|
});
|
||||||
|
Ok(promise)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn database_table_names(mut cx: FunctionContext) -> JsResult<JsArray> {
|
fn database_table_names(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||||
let db = cx
|
let db = cx
|
||||||
.this()
|
.this()
|
||||||
.downcast_or_throw::<JsBox<JsDatabase>, _>(&mut cx)?;
|
.downcast_or_throw::<JsBox<JsDatabase>, _>(&mut cx)?;
|
||||||
let tables = db
|
|
||||||
.database
|
let rt = runtime(&mut cx)?;
|
||||||
.table_names()
|
let (deferred, promise) = cx.promise();
|
||||||
.or_else(|err| cx.throw_error(err.to_string()))?;
|
let channel = cx.channel();
|
||||||
convert::vec_str_to_array(&tables, &mut cx)
|
let database = db.database.clone();
|
||||||
|
|
||||||
|
rt.spawn(async move {
|
||||||
|
let tables_rst = database.table_names().await;
|
||||||
|
|
||||||
|
deferred.settle_with(&channel, move |mut cx| {
|
||||||
|
let tables = tables_rst.or_else(|err| cx.throw_error(err.to_string()))?;
|
||||||
|
let table_names = convert::vec_str_to_array(&tables, &mut cx);
|
||||||
|
table_names
|
||||||
|
});
|
||||||
|
});
|
||||||
|
Ok(promise)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||||
@@ -84,10 +110,12 @@ fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
|||||||
|
|
||||||
let (deferred, promise) = cx.promise();
|
let (deferred, promise) = cx.promise();
|
||||||
rt.spawn(async move {
|
rt.spawn(async move {
|
||||||
let table_rst = database.open_table(table_name).await;
|
let table_rst = database.open_table(&table_name).await;
|
||||||
|
|
||||||
deferred.settle_with(&channel, move |mut cx| {
|
deferred.settle_with(&channel, move |mut cx| {
|
||||||
let table = Arc::new(Mutex::new(table_rst.or_else(|err| cx.throw_error(err.to_string()))?));
|
let table = Arc::new(Mutex::new(
|
||||||
|
table_rst.or_else(|err| cx.throw_error(err.to_string()))?,
|
||||||
|
));
|
||||||
Ok(cx.boxed(JsTable { table }))
|
Ok(cx.boxed(JsTable { table }))
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@@ -96,15 +124,32 @@ fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
|||||||
|
|
||||||
fn table_search(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
fn table_search(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||||
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
|
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
|
||||||
let query_vector = cx.argument::<JsArray>(0)?; //. .as_value(&mut cx);
|
let query_obj = cx.argument::<JsObject>(0)?;
|
||||||
let limit = cx.argument::<JsNumber>(1)?.value(&mut cx);
|
|
||||||
let filter = cx.argument_opt(2).map(|f| f.downcast_or_throw::<JsString, _>(&mut cx).unwrap().value(&mut cx));
|
let limit = query_obj
|
||||||
|
.get::<JsNumber, _, _>(&mut cx, "_limit")?
|
||||||
|
.value(&mut cx);
|
||||||
|
let filter = query_obj
|
||||||
|
.get_opt::<JsString, _, _>(&mut cx, "_filter")?
|
||||||
|
.map(|s| s.value(&mut cx));
|
||||||
|
let refine_factor = query_obj
|
||||||
|
.get_opt::<JsNumber, _, _>(&mut cx, "_refineFactor")?
|
||||||
|
.map(|s| s.value(&mut cx))
|
||||||
|
.map(|i| i as u32);
|
||||||
|
let nprobes = query_obj
|
||||||
|
.get::<JsNumber, _, _>(&mut cx, "_nprobes")?
|
||||||
|
.value(&mut cx) as usize;
|
||||||
|
let metric_type = query_obj
|
||||||
|
.get_opt::<JsString, _, _>(&mut cx, "_metricType")?
|
||||||
|
.map(|s| s.value(&mut cx))
|
||||||
|
.map(|s| MetricType::try_from(s.as_str()).unwrap());
|
||||||
|
|
||||||
let rt = runtime(&mut cx)?;
|
let rt = runtime(&mut cx)?;
|
||||||
let channel = cx.channel();
|
let channel = cx.channel();
|
||||||
|
|
||||||
let (deferred, promise) = cx.promise();
|
let (deferred, promise) = cx.promise();
|
||||||
let table = js_table.table.clone();
|
let table = js_table.table.clone();
|
||||||
|
let query_vector = query_obj.get::<JsArray, _, _>(&mut cx, "_queryVector")?;
|
||||||
let query = convert::js_array_to_vec(query_vector.deref(), &mut cx);
|
let query = convert::js_array_to_vec(query_vector.deref(), &mut cx);
|
||||||
|
|
||||||
rt.spawn(async move {
|
rt.spawn(async move {
|
||||||
@@ -113,7 +158,10 @@ fn table_search(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
.search(Float32Array::from(query))
|
.search(Float32Array::from(query))
|
||||||
.limit(limit as usize)
|
.limit(limit as usize)
|
||||||
.filter(filter);
|
.refine_factor(refine_factor)
|
||||||
|
.nprobes(nprobes)
|
||||||
|
.filter(filter)
|
||||||
|
.metric_type(metric_type);
|
||||||
let record_batch_stream = builder.execute();
|
let record_batch_stream = builder.execute();
|
||||||
let results = record_batch_stream
|
let results = record_batch_stream
|
||||||
.and_then(|stream| stream.try_collect::<Vec<_>>().map_err(Error::from))
|
.and_then(|stream| stream.try_collect::<Vec<_>>().map_err(Error::from))
|
||||||
@@ -161,10 +209,12 @@ fn table_create(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
|||||||
|
|
||||||
rt.block_on(async move {
|
rt.block_on(async move {
|
||||||
let batch_reader: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(batches));
|
let batch_reader: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(batches));
|
||||||
let table_rst = database.create_table(table_name, batch_reader).await;
|
let table_rst = database.create_table(&table_name, batch_reader).await;
|
||||||
|
|
||||||
deferred.settle_with(&channel, move |mut cx| {
|
deferred.settle_with(&channel, move |mut cx| {
|
||||||
let table = Arc::new(Mutex::new(table_rst.or_else(|err| cx.throw_error(err.to_string()))?));
|
let table = Arc::new(Mutex::new(
|
||||||
|
table_rst.or_else(|err| cx.throw_error(err.to_string()))?,
|
||||||
|
));
|
||||||
Ok(cx.boxed(JsTable { table }))
|
Ok(cx.boxed(JsTable { table }))
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@@ -178,9 +228,7 @@ fn table_add(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
|||||||
("overwrite", WriteMode::Overwrite),
|
("overwrite", WriteMode::Overwrite),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let js_table = cx
|
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
|
||||||
.this()
|
|
||||||
.downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
|
|
||||||
let buffer = cx.argument::<JsBuffer>(0)?;
|
let buffer = cx.argument::<JsBuffer>(0)?;
|
||||||
let write_mode = cx.argument::<JsString>(1)?.value(&mut cx);
|
let write_mode = cx.argument::<JsString>(1)?.value(&mut cx);
|
||||||
let batches = arrow_buffer_to_record_batch(buffer.as_slice(&mut cx));
|
let batches = arrow_buffer_to_record_batch(buffer.as_slice(&mut cx));
|
||||||
@@ -204,7 +252,6 @@ fn table_add(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
|||||||
Ok(promise)
|
Ok(promise)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#[neon::main]
|
#[neon::main]
|
||||||
fn main(mut cx: ModuleContext) -> NeonResult<()> {
|
fn main(mut cx: ModuleContext) -> NeonResult<()> {
|
||||||
cx.export_function("databaseNew", database_new)?;
|
cx.export_function("databaseNew", database_new)?;
|
||||||
@@ -213,5 +260,9 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
|
|||||||
cx.export_function("tableSearch", table_search)?;
|
cx.export_function("tableSearch", table_search)?;
|
||||||
cx.export_function("tableCreate", table_create)?;
|
cx.export_function("tableCreate", table_create)?;
|
||||||
cx.export_function("tableAdd", table_add)?;
|
cx.export_function("tableAdd", table_add)?;
|
||||||
|
cx.export_function(
|
||||||
|
"tableCreateVectorIndex",
|
||||||
|
index::vector::table_create_vector_index,
|
||||||
|
)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,9 +10,13 @@ repository = "https://github.com/lancedb/lancedb"
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
arrow-array = "37.0"
|
arrow-array = "37.0"
|
||||||
|
arrow-data = "37.0"
|
||||||
arrow-schema = "37.0"
|
arrow-schema = "37.0"
|
||||||
lance = "0.4.3"
|
object_store = "0.5.6"
|
||||||
|
|
||||||
|
lance = "0.4.17"
|
||||||
tokio = { version = "1.23", features = ["rt-multi-thread"] }
|
tokio = { version = "1.23", features = ["rt-multi-thread"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
tempfile = "3.5.0"
|
tempfile = "3.5.0"
|
||||||
|
rand = { version = "0.8.3", features = ["small_rng"] }
|
||||||
|
|||||||
@@ -12,16 +12,19 @@
|
|||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
use arrow_array::RecordBatchReader;
|
|
||||||
use std::fs::create_dir_all;
|
use std::fs::create_dir_all;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::Path;
|
||||||
use std::sync::Arc;
|
|
||||||
|
use arrow_array::RecordBatchReader;
|
||||||
|
use lance::io::object_store::ObjectStore;
|
||||||
|
|
||||||
use crate::error::Result;
|
use crate::error::Result;
|
||||||
use crate::table::Table;
|
use crate::table::Table;
|
||||||
|
|
||||||
pub struct Database {
|
pub struct Database {
|
||||||
pub(crate) path: Arc<PathBuf>,
|
object_store: ObjectStore,
|
||||||
|
|
||||||
|
pub(crate) uri: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
const LANCE_EXTENSION: &str = "lance";
|
const LANCE_EXTENSION: &str = "lance";
|
||||||
@@ -37,12 +40,17 @@ impl Database {
|
|||||||
/// # Returns
|
/// # Returns
|
||||||
///
|
///
|
||||||
/// * A [Database] object.
|
/// * A [Database] object.
|
||||||
pub fn connect<P: AsRef<Path>>(path: P) -> Result<Database> {
|
pub async fn connect(uri: &str) -> Result<Database> {
|
||||||
if !path.as_ref().try_exists()? {
|
let object_store = ObjectStore::new(uri).await?;
|
||||||
create_dir_all(&path)?;
|
if object_store.is_local() {
|
||||||
|
let path = Path::new(uri);
|
||||||
|
if !path.try_exists()? {
|
||||||
|
create_dir_all(&path)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Ok(Database {
|
Ok(Database {
|
||||||
path: Arc::new(path.as_ref().to_path_buf()),
|
uri: uri.to_string(),
|
||||||
|
object_store,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -51,12 +59,13 @@ impl Database {
|
|||||||
/// # Returns
|
/// # Returns
|
||||||
///
|
///
|
||||||
/// * A [Vec<String>] with all table names.
|
/// * A [Vec<String>] with all table names.
|
||||||
pub fn table_names(&self) -> Result<Vec<String>> {
|
pub async fn table_names(&self) -> Result<Vec<String>> {
|
||||||
let f = self
|
let f = self
|
||||||
.path
|
.object_store
|
||||||
.read_dir()?
|
.read_dir("/")
|
||||||
.flatten()
|
.await?
|
||||||
.map(|dir_entry| dir_entry.path())
|
.iter()
|
||||||
|
.map(|fname| Path::new(fname))
|
||||||
.filter(|path| {
|
.filter(|path| {
|
||||||
let is_lance = path
|
let is_lance = path
|
||||||
.extension()
|
.extension()
|
||||||
@@ -76,10 +85,10 @@ impl Database {
|
|||||||
|
|
||||||
pub async fn create_table(
|
pub async fn create_table(
|
||||||
&self,
|
&self,
|
||||||
name: String,
|
name: &str,
|
||||||
batches: Box<dyn RecordBatchReader>,
|
batches: Box<dyn RecordBatchReader>,
|
||||||
) -> Result<Table> {
|
) -> Result<Table> {
|
||||||
Table::create(self.path.clone(), name, batches).await
|
Table::create(&self.uri, name, batches).await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Open a table in the database.
|
/// Open a table in the database.
|
||||||
@@ -90,8 +99,8 @@ impl Database {
|
|||||||
/// # Returns
|
/// # Returns
|
||||||
///
|
///
|
||||||
/// * A [Table] object.
|
/// * A [Table] object.
|
||||||
pub async fn open_table(&self, name: String) -> Result<Table> {
|
pub async fn open_table(&self, name: &str) -> Result<Table> {
|
||||||
Table::open(self.path.clone(), name).await
|
Table::open(&self.uri, name).await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -105,10 +114,10 @@ mod tests {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_connect() {
|
async fn test_connect() {
|
||||||
let tmp_dir = tempdir().unwrap();
|
let tmp_dir = tempdir().unwrap();
|
||||||
let path_buf = tmp_dir.into_path();
|
let uri = tmp_dir.path().to_str().unwrap();
|
||||||
let db = Database::connect(&path_buf);
|
let db = Database::connect(uri).await.unwrap();
|
||||||
|
|
||||||
assert_eq!(db.unwrap().path.as_path(), path_buf.as_path())
|
assert_eq!(db.uri, uri);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
@@ -118,10 +127,16 @@ mod tests {
|
|||||||
create_dir_all(tmp_dir.path().join("table2.lance")).unwrap();
|
create_dir_all(tmp_dir.path().join("table2.lance")).unwrap();
|
||||||
create_dir_all(tmp_dir.path().join("invalidlance")).unwrap();
|
create_dir_all(tmp_dir.path().join("invalidlance")).unwrap();
|
||||||
|
|
||||||
let db = Database::connect(&tmp_dir.into_path()).unwrap();
|
let uri = tmp_dir.path().to_str().unwrap();
|
||||||
let tables = db.table_names().unwrap();
|
let db = Database::connect(uri).await.unwrap();
|
||||||
|
let tables = db.table_names().await.unwrap();
|
||||||
assert_eq!(tables.len(), 2);
|
assert_eq!(tables.len(), 2);
|
||||||
assert!(tables.contains(&String::from("table1")));
|
assert!(tables.contains(&String::from("table1")));
|
||||||
assert!(tables.contains(&String::from("table2")));
|
assert!(tables.contains(&String::from("table2")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_connect_s3() {
|
||||||
|
// let db = Database::connect("s3://bucket/path/to/database").await.unwrap();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -41,3 +41,15 @@ impl From<lance::Error> for Error {
|
|||||||
Self::Lance(e.to_string())
|
Self::Lance(e.to_string())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<object_store::Error> for Error {
|
||||||
|
fn from(e: object_store::Error) -> Self {
|
||||||
|
Self::IO(e.to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<object_store::path::Error> for Error {
|
||||||
|
fn from(e: object_store::path::Error) -> Self {
|
||||||
|
Self::IO(e.to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
15
rust/vectordb/src/index.rs
Normal file
15
rust/vectordb/src/index.rs
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
// Copyright 2023 Lance Developers.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
pub mod vector;
|
||||||
163
rust/vectordb/src/index/vector.rs
Normal file
163
rust/vectordb/src/index/vector.rs
Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
// Copyright 2023 Lance Developers.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
use lance::index::vector::ivf::IvfBuildParams;
|
||||||
|
use lance::index::vector::pq::PQBuildParams;
|
||||||
|
use lance::index::vector::{MetricType, VectorIndexParams};
|
||||||
|
|
||||||
|
pub trait VectorIndexBuilder {
|
||||||
|
fn get_column(&self) -> Option<String>;
|
||||||
|
fn get_index_name(&self) -> Option<String>;
|
||||||
|
fn build(&self) -> VectorIndexParams;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct IvfPQIndexBuilder {
|
||||||
|
column: Option<String>,
|
||||||
|
index_name: Option<String>,
|
||||||
|
metric_type: Option<MetricType>,
|
||||||
|
ivf_params: Option<IvfBuildParams>,
|
||||||
|
pq_params: Option<PQBuildParams>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl IvfPQIndexBuilder {
|
||||||
|
pub fn new() -> IvfPQIndexBuilder {
|
||||||
|
IvfPQIndexBuilder {
|
||||||
|
column: None,
|
||||||
|
index_name: None,
|
||||||
|
metric_type: None,
|
||||||
|
ivf_params: None,
|
||||||
|
pq_params: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl IvfPQIndexBuilder {
|
||||||
|
pub fn column(&mut self, column: String) -> &mut IvfPQIndexBuilder {
|
||||||
|
self.column = Some(column);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn index_name(&mut self, index_name: String) -> &mut IvfPQIndexBuilder {
|
||||||
|
self.index_name = Some(index_name);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn metric_type(&mut self, metric_type: MetricType) -> &mut IvfPQIndexBuilder {
|
||||||
|
self.metric_type = Some(metric_type);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn ivf_params(&mut self, ivf_params: IvfBuildParams) -> &mut IvfPQIndexBuilder {
|
||||||
|
self.ivf_params = Some(ivf_params);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn pq_params(&mut self, pq_params: PQBuildParams) -> &mut IvfPQIndexBuilder {
|
||||||
|
self.pq_params = Some(pq_params);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VectorIndexBuilder for IvfPQIndexBuilder {
|
||||||
|
fn get_column(&self) -> Option<String> {
|
||||||
|
self.column.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_index_name(&self) -> Option<String> {
|
||||||
|
self.index_name.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build(&self) -> VectorIndexParams {
|
||||||
|
let ivf_params = self.ivf_params.clone().unwrap_or(IvfBuildParams::default());
|
||||||
|
let pq_params = self.pq_params.clone().unwrap_or(PQBuildParams::default());
|
||||||
|
|
||||||
|
VectorIndexParams::with_ivf_pq_params(pq_params.metric_type, ivf_params, pq_params)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use lance::index::vector::ivf::IvfBuildParams;
|
||||||
|
use lance::index::vector::pq::PQBuildParams;
|
||||||
|
use lance::index::vector::{MetricType, StageParams};
|
||||||
|
|
||||||
|
use crate::index::vector::{IvfPQIndexBuilder, VectorIndexBuilder};
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_builder_no_params() {
|
||||||
|
let index_builder = IvfPQIndexBuilder::new();
|
||||||
|
assert!(index_builder.get_column().is_none());
|
||||||
|
assert!(index_builder.get_index_name().is_none());
|
||||||
|
|
||||||
|
let index_params = index_builder.build();
|
||||||
|
assert_eq!(index_params.stages.len(), 2);
|
||||||
|
if let StageParams::Ivf(ivf_params) = index_params.stages.get(0).unwrap() {
|
||||||
|
let default = IvfBuildParams::default();
|
||||||
|
assert_eq!(ivf_params.num_partitions, default.num_partitions);
|
||||||
|
assert_eq!(ivf_params.max_iters, default.max_iters);
|
||||||
|
} else {
|
||||||
|
panic!("Expected first stage to be ivf")
|
||||||
|
}
|
||||||
|
|
||||||
|
if let StageParams::PQ(pq_params) = index_params.stages.get(1).unwrap() {
|
||||||
|
assert_eq!(pq_params.use_opq, false);
|
||||||
|
} else {
|
||||||
|
panic!("Expected second stage to be pq")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_builder_all_params() {
|
||||||
|
let mut index_builder = IvfPQIndexBuilder::new();
|
||||||
|
|
||||||
|
index_builder
|
||||||
|
.column("c".to_owned())
|
||||||
|
.metric_type(MetricType::Cosine)
|
||||||
|
.index_name("index".to_owned());
|
||||||
|
|
||||||
|
assert_eq!(index_builder.column.clone().unwrap(), "c");
|
||||||
|
assert_eq!(index_builder.metric_type.unwrap(), MetricType::Cosine);
|
||||||
|
assert_eq!(index_builder.index_name.clone().unwrap(), "index");
|
||||||
|
|
||||||
|
let ivf_params = IvfBuildParams::new(500);
|
||||||
|
let mut pq_params = PQBuildParams::default();
|
||||||
|
pq_params.use_opq = true;
|
||||||
|
pq_params.max_iters = 1;
|
||||||
|
pq_params.num_bits = 8;
|
||||||
|
pq_params.num_sub_vectors = 50;
|
||||||
|
pq_params.metric_type = MetricType::Cosine;
|
||||||
|
pq_params.max_opq_iters = 2;
|
||||||
|
index_builder.ivf_params(ivf_params);
|
||||||
|
index_builder.pq_params(pq_params);
|
||||||
|
|
||||||
|
let index_params = index_builder.build();
|
||||||
|
assert_eq!(index_params.stages.len(), 2);
|
||||||
|
if let StageParams::Ivf(ivf_params) = index_params.stages.get(0).unwrap() {
|
||||||
|
assert_eq!(ivf_params.num_partitions, 500);
|
||||||
|
} else {
|
||||||
|
assert!(false, "Expected first stage to be ivf")
|
||||||
|
}
|
||||||
|
|
||||||
|
if let StageParams::PQ(pq_params) = index_params.stages.get(1).unwrap() {
|
||||||
|
assert_eq!(pq_params.use_opq, true);
|
||||||
|
assert_eq!(pq_params.max_iters, 1);
|
||||||
|
assert_eq!(pq_params.num_bits, 8);
|
||||||
|
assert_eq!(pq_params.num_sub_vectors, 50);
|
||||||
|
assert_eq!(pq_params.metric_type, MetricType::Cosine);
|
||||||
|
assert_eq!(pq_params.max_opq_iters, 2);
|
||||||
|
} else {
|
||||||
|
assert!(false, "Expected second stage to be pq")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -14,5 +14,6 @@
|
|||||||
|
|
||||||
pub mod database;
|
pub mod database;
|
||||||
pub mod error;
|
pub mod error;
|
||||||
|
pub mod index;
|
||||||
pub mod query;
|
pub mod query;
|
||||||
pub mod table;
|
pub mod table;
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ pub struct Query {
|
|||||||
pub filter: Option<String>,
|
pub filter: Option<String>,
|
||||||
pub nprobes: usize,
|
pub nprobes: usize,
|
||||||
pub refine_factor: Option<u32>,
|
pub refine_factor: Option<u32>,
|
||||||
pub metric_type: MetricType,
|
pub metric_type: Option<MetricType>,
|
||||||
pub use_index: bool,
|
pub use_index: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -51,9 +51,9 @@ impl Query {
|
|||||||
limit: 10,
|
limit: 10,
|
||||||
nprobes: 20,
|
nprobes: 20,
|
||||||
refine_factor: None,
|
refine_factor: None,
|
||||||
metric_type: MetricType::L2,
|
metric_type: None,
|
||||||
use_index: false,
|
use_index: false,
|
||||||
filter: None
|
filter: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -71,10 +71,10 @@ impl Query {
|
|||||||
self.limit,
|
self.limit,
|
||||||
)?;
|
)?;
|
||||||
scanner.nprobs(self.nprobes);
|
scanner.nprobs(self.nprobes);
|
||||||
scanner.distance_metric(self.metric_type);
|
|
||||||
scanner.use_index(self.use_index);
|
scanner.use_index(self.use_index);
|
||||||
self.filter.as_ref().map(|f| scanner.filter(f));
|
self.filter.as_ref().map(|f| scanner.filter(f));
|
||||||
self.refine_factor.map(|rf| scanner.refine(rf));
|
self.refine_factor.map(|rf| scanner.refine(rf));
|
||||||
|
self.metric_type.map(|mt| scanner.distance_metric(mt));
|
||||||
Ok(scanner.try_into_stream().await?)
|
Ok(scanner.try_into_stream().await?)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -123,7 +123,7 @@ impl Query {
|
|||||||
/// # Arguments
|
/// # Arguments
|
||||||
///
|
///
|
||||||
/// * `metric_type` - The distance metric to use. By default [MetricType::L2] is used.
|
/// * `metric_type` - The distance metric to use. By default [MetricType::L2] is used.
|
||||||
pub fn metric_type(mut self, metric_type: MetricType) -> Query {
|
pub fn metric_type(mut self, metric_type: Option<MetricType>) -> Query {
|
||||||
self.metric_type = metric_type;
|
self.metric_type = metric_type;
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
@@ -174,14 +174,14 @@ mod tests {
|
|||||||
.limit(100)
|
.limit(100)
|
||||||
.nprobes(1000)
|
.nprobes(1000)
|
||||||
.use_index(true)
|
.use_index(true)
|
||||||
.metric_type(MetricType::Cosine)
|
.metric_type(Some(MetricType::Cosine))
|
||||||
.refine_factor(Some(999));
|
.refine_factor(Some(999));
|
||||||
|
|
||||||
assert_eq!(query.query_vector, new_vector);
|
assert_eq!(query.query_vector, new_vector);
|
||||||
assert_eq!(query.limit, 100);
|
assert_eq!(query.limit, 100);
|
||||||
assert_eq!(query.nprobes, 1000);
|
assert_eq!(query.nprobes, 1000);
|
||||||
assert_eq!(query.use_index, true);
|
assert_eq!(query.use_index, true);
|
||||||
assert_eq!(query.metric_type, MetricType::Cosine);
|
assert_eq!(query.metric_type, Some(MetricType::Cosine));
|
||||||
assert_eq!(query.refine_factor, Some(999));
|
assert_eq!(query.refine_factor, Some(999));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -12,26 +12,33 @@
|
|||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
use std::path::PathBuf;
|
use std::path::Path;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use arrow_array::{Float32Array, RecordBatchReader};
|
use arrow_array::{Float32Array, RecordBatchReader};
|
||||||
use lance::dataset::{Dataset, WriteMode, WriteParams};
|
use lance::dataset::{Dataset, WriteMode, WriteParams};
|
||||||
|
use lance::index::IndexType;
|
||||||
|
|
||||||
use crate::error::{Error, Result};
|
use crate::error::{Error, Result};
|
||||||
|
use crate::index::vector::VectorIndexBuilder;
|
||||||
use crate::query::Query;
|
use crate::query::Query;
|
||||||
|
|
||||||
pub const VECTOR_COLUMN_NAME: &str = "vector";
|
pub const VECTOR_COLUMN_NAME: &str = "vector";
|
||||||
|
|
||||||
pub const LANCE_FILE_EXTENSION: &str = "lance";
|
pub const LANCE_FILE_EXTENSION: &str = "lance";
|
||||||
|
|
||||||
/// A table in a LanceDB database.
|
/// A table in a LanceDB database.
|
||||||
pub struct Table {
|
pub struct Table {
|
||||||
name: String,
|
name: String,
|
||||||
path: String,
|
uri: String,
|
||||||
dataset: Arc<Dataset>,
|
dataset: Arc<Dataset>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for Table {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "Table({})", self.name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Table {
|
impl Table {
|
||||||
/// Opens an existing Table
|
/// Opens an existing Table
|
||||||
///
|
///
|
||||||
@@ -43,18 +50,21 @@ impl Table {
|
|||||||
/// # Returns
|
/// # Returns
|
||||||
///
|
///
|
||||||
/// * A [Table] object.
|
/// * A [Table] object.
|
||||||
pub async fn open(base_path: Arc<PathBuf>, name: String) -> Result<Self> {
|
pub async fn open(base_uri: &str, name: &str) -> Result<Self> {
|
||||||
let ds_path = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
|
let path = Path::new(base_uri);
|
||||||
let ds_uri = ds_path
|
|
||||||
|
let table_uri = path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
|
||||||
|
let uri = table_uri
|
||||||
|
.as_path()
|
||||||
.to_str()
|
.to_str()
|
||||||
.ok_or(Error::IO(format!("Unable to find table {}", name)))?;
|
.ok_or(Error::IO(format!("Invalid table name: {}", name)))?;
|
||||||
let dataset = Dataset::open(ds_uri).await?;
|
|
||||||
let table = Table {
|
let dataset = Dataset::open(&uri).await?;
|
||||||
name,
|
Ok(Table {
|
||||||
path: ds_uri.to_string(),
|
name: name.to_string(),
|
||||||
|
uri: uri.to_string(),
|
||||||
dataset: Arc::new(dataset),
|
dataset: Arc::new(dataset),
|
||||||
};
|
})
|
||||||
Ok(table)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a new Table
|
/// Creates a new Table
|
||||||
@@ -69,18 +79,44 @@ impl Table {
|
|||||||
///
|
///
|
||||||
/// * A [Table] object.
|
/// * A [Table] object.
|
||||||
pub async fn create(
|
pub async fn create(
|
||||||
base_path: Arc<PathBuf>,
|
base_uri: &str,
|
||||||
name: String,
|
name: &str,
|
||||||
mut batches: Box<dyn RecordBatchReader>,
|
mut batches: Box<dyn RecordBatchReader>,
|
||||||
) -> Result<Self> {
|
) -> Result<Self> {
|
||||||
let ds_path = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
|
let base_path = Path::new(base_uri);
|
||||||
let path = ds_path
|
let table_uri = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
|
||||||
|
let uri = table_uri
|
||||||
|
.as_path()
|
||||||
.to_str()
|
.to_str()
|
||||||
.ok_or(Error::IO(format!("Unable to find table {}", name)))?;
|
.ok_or(Error::IO(format!("Invalid table name: {}", name)))?
|
||||||
|
.to_string();
|
||||||
let dataset =
|
let dataset =
|
||||||
Arc::new(Dataset::write(&mut batches, path, Some(WriteParams::default())).await?);
|
Arc::new(Dataset::write(&mut batches, &uri, Some(WriteParams::default())).await?);
|
||||||
Ok(Table { name, path: path.to_string(), dataset })
|
Ok(Table {
|
||||||
|
name: name.to_string(),
|
||||||
|
uri,
|
||||||
|
dataset,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create index on the table.
|
||||||
|
pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
|
||||||
|
use lance::index::DatasetIndexExt;
|
||||||
|
|
||||||
|
let dataset = self
|
||||||
|
.dataset
|
||||||
|
.create_index(
|
||||||
|
&[index_builder
|
||||||
|
.get_column()
|
||||||
|
.unwrap_or(VECTOR_COLUMN_NAME.to_string())
|
||||||
|
.as_str()],
|
||||||
|
IndexType::Vector,
|
||||||
|
index_builder.get_index_name(),
|
||||||
|
&index_builder.build(),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
self.dataset = Arc::new(dataset);
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Insert records into this Table
|
/// Insert records into this Table
|
||||||
@@ -95,12 +131,12 @@ impl Table {
|
|||||||
pub async fn add(
|
pub async fn add(
|
||||||
&mut self,
|
&mut self,
|
||||||
mut batches: Box<dyn RecordBatchReader>,
|
mut batches: Box<dyn RecordBatchReader>,
|
||||||
write_mode: Option<WriteMode>
|
write_mode: Option<WriteMode>,
|
||||||
) -> Result<usize> {
|
) -> Result<usize> {
|
||||||
let mut params = WriteParams::default();
|
let mut params = WriteParams::default();
|
||||||
params.mode = write_mode.unwrap_or(WriteMode::Append);
|
params.mode = write_mode.unwrap_or(WriteMode::Append);
|
||||||
|
|
||||||
self.dataset = Arc::new(Dataset::write(&mut batches, self.path.as_str(), Some(params)).await?);
|
self.dataset = Arc::new(Dataset::write(&mut batches, &self.uri, Some(params)).await?);
|
||||||
Ok(batches.count())
|
Ok(batches.count())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -125,60 +161,72 @@ impl Table {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use arrow_array::{Float32Array, Int32Array, RecordBatch, RecordBatchReader};
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use arrow_array::{
|
||||||
|
Array, FixedSizeListArray, Float32Array, Int32Array, RecordBatch, RecordBatchReader,
|
||||||
|
};
|
||||||
|
use arrow_data::ArrayDataBuilder;
|
||||||
use arrow_schema::{DataType, Field, Schema};
|
use arrow_schema::{DataType, Field, Schema};
|
||||||
use lance::arrow::RecordBatchBuffer;
|
use lance::arrow::RecordBatchBuffer;
|
||||||
use lance::dataset::{Dataset, WriteMode};
|
use lance::dataset::{Dataset, WriteMode};
|
||||||
use std::sync::Arc;
|
use lance::index::vector::ivf::IvfBuildParams;
|
||||||
|
use lance::index::vector::pq::PQBuildParams;
|
||||||
|
use rand::Rng;
|
||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
use crate::table::Table;
|
use super::*;
|
||||||
|
use crate::index::vector::IvfPQIndexBuilder;
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_new_table_not_exists() {
|
async fn test_new_table_not_exists() {
|
||||||
let tmp_dir = tempdir().unwrap();
|
let tmp_dir = tempdir().unwrap();
|
||||||
let path_buf = tmp_dir.into_path();
|
let uri = tmp_dir.path().to_str().unwrap();
|
||||||
|
|
||||||
let table = Table::open(Arc::new(path_buf), "test".to_string()).await;
|
let table = Table::open(&uri, "test").await;
|
||||||
assert!(table.is_err());
|
assert!(table.is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_open() {
|
async fn test_open() {
|
||||||
let tmp_dir = tempdir().unwrap();
|
let tmp_dir = tempdir().unwrap();
|
||||||
let path_buf = tmp_dir.into_path();
|
let dataset_path = tmp_dir.path().join("test.lance");
|
||||||
|
let uri = tmp_dir.path().to_str().unwrap();
|
||||||
|
|
||||||
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
||||||
Dataset::write(
|
Dataset::write(&mut batches, dataset_path.to_str().unwrap(), None)
|
||||||
&mut batches,
|
|
||||||
path_buf.join("test.lance").to_str().unwrap(),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let table = Table::open(Arc::new(path_buf), "test".to_string())
|
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
let table = Table::open(uri, "test").await.unwrap();
|
||||||
|
|
||||||
assert_eq!(table.name, "test")
|
assert_eq!(table.name, "test")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_object_store_path() {
|
||||||
|
use std::path::Path as StdPath;
|
||||||
|
let p = StdPath::new("s3://bucket/path/to/file");
|
||||||
|
let c = p.join("subfile");
|
||||||
|
assert_eq!(c.to_str().unwrap(), "s3://bucket/path/to/file/subfile");
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_add() {
|
async fn test_add() {
|
||||||
let tmp_dir = tempdir().unwrap();
|
let tmp_dir = tempdir().unwrap();
|
||||||
let path_buf = tmp_dir.into_path();
|
let uri = tmp_dir.path().to_str().unwrap();
|
||||||
|
|
||||||
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
||||||
let schema = batches.schema().clone();
|
let schema = batches.schema().clone();
|
||||||
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches).await.unwrap();
|
let mut table = Table::create(&uri, "test", batches).await.unwrap();
|
||||||
assert_eq!(table.count_rows().await.unwrap(), 10);
|
assert_eq!(table.count_rows().await.unwrap(), 10);
|
||||||
|
|
||||||
let new_batches: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
|
let new_batches: Box<dyn RecordBatchReader> =
|
||||||
schema,
|
Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
|
||||||
vec![Arc::new(Int32Array::from_iter_values(100..110))],
|
schema,
|
||||||
)
|
vec![Arc::new(Int32Array::from_iter_values(100..110))],
|
||||||
.unwrap()]));
|
)
|
||||||
|
.unwrap()]));
|
||||||
|
|
||||||
table.add(new_batches, None).await.unwrap();
|
table.add(new_batches, None).await.unwrap();
|
||||||
assert_eq!(table.count_rows().await.unwrap(), 20);
|
assert_eq!(table.count_rows().await.unwrap(), 20);
|
||||||
@@ -188,19 +236,24 @@ mod tests {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_add_overwrite() {
|
async fn test_add_overwrite() {
|
||||||
let tmp_dir = tempdir().unwrap();
|
let tmp_dir = tempdir().unwrap();
|
||||||
let path_buf = tmp_dir.into_path();
|
let uri = tmp_dir.path().to_str().unwrap();
|
||||||
|
|
||||||
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
||||||
let schema = batches.schema().clone();
|
let schema = batches.schema().clone();
|
||||||
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches).await.unwrap();
|
let mut table = Table::create(uri, "test", batches).await.unwrap();
|
||||||
assert_eq!(table.count_rows().await.unwrap(), 10);
|
assert_eq!(table.count_rows().await.unwrap(), 10);
|
||||||
|
|
||||||
let new_batches: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
|
let new_batches: Box<dyn RecordBatchReader> =
|
||||||
schema,
|
Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
|
||||||
vec![Arc::new(Int32Array::from_iter_values(100..110))],
|
schema,
|
||||||
).unwrap()]));
|
vec![Arc::new(Int32Array::from_iter_values(100..110))],
|
||||||
|
)
|
||||||
|
.unwrap()]));
|
||||||
|
|
||||||
table.add(new_batches, Some(WriteMode::Overwrite)).await.unwrap();
|
table
|
||||||
|
.add(new_batches, Some(WriteMode::Overwrite))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
assert_eq!(table.count_rows().await.unwrap(), 10);
|
assert_eq!(table.count_rows().await.unwrap(), 10);
|
||||||
assert_eq!(table.name, "test");
|
assert_eq!(table.name, "test");
|
||||||
}
|
}
|
||||||
@@ -208,21 +261,16 @@ mod tests {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_search() {
|
async fn test_search() {
|
||||||
let tmp_dir = tempdir().unwrap();
|
let tmp_dir = tempdir().unwrap();
|
||||||
let path_buf = tmp_dir.into_path();
|
let dataset_path = tmp_dir.path().join("test.lance");
|
||||||
|
let uri = tmp_dir.path().to_str().unwrap();
|
||||||
|
|
||||||
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
|
||||||
Dataset::write(
|
Dataset::write(&mut batches, dataset_path.to_str().unwrap(), None)
|
||||||
&mut batches,
|
|
||||||
path_buf.join("test.lance").to_str().unwrap(),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let table = Table::open(Arc::new(path_buf), "test".to_string())
|
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
let table = Table::open(uri, "test").await.unwrap();
|
||||||
|
|
||||||
let vector = Float32Array::from_iter_values([0.1, 0.2]);
|
let vector = Float32Array::from_iter_values([0.1, 0.2]);
|
||||||
let query = table.search(vector.clone());
|
let query = table.search(vector.clone());
|
||||||
assert_eq!(vector, query.query_vector);
|
assert_eq!(vector, query.query_vector);
|
||||||
@@ -236,4 +284,72 @@ mod tests {
|
|||||||
)
|
)
|
||||||
.unwrap()])
|
.unwrap()])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_create_index() {
|
||||||
|
use arrow_array::RecordBatch;
|
||||||
|
use arrow_schema::{DataType, Field, Schema as ArrowSchema};
|
||||||
|
use rand;
|
||||||
|
use std::iter::repeat_with;
|
||||||
|
|
||||||
|
use arrow_array::Float32Array;
|
||||||
|
|
||||||
|
let tmp_dir = tempdir().unwrap();
|
||||||
|
let uri = tmp_dir.path().to_str().unwrap();
|
||||||
|
|
||||||
|
let dimension = 16;
|
||||||
|
let schema = Arc::new(ArrowSchema::new(vec![Field::new(
|
||||||
|
"embeddings",
|
||||||
|
DataType::FixedSizeList(
|
||||||
|
Arc::new(Field::new("item", DataType::Float32, true)),
|
||||||
|
dimension,
|
||||||
|
),
|
||||||
|
false,
|
||||||
|
)]));
|
||||||
|
|
||||||
|
let mut rng = rand::thread_rng();
|
||||||
|
let float_arr = Float32Array::from(
|
||||||
|
repeat_with(|| rng.gen::<f32>())
|
||||||
|
.take(512 * dimension as usize)
|
||||||
|
.collect::<Vec<f32>>(),
|
||||||
|
);
|
||||||
|
|
||||||
|
let vectors = Arc::new(create_fixed_size_list(float_arr, dimension).unwrap());
|
||||||
|
let batches = RecordBatchBuffer::new(vec![RecordBatch::try_new(
|
||||||
|
schema.clone(),
|
||||||
|
vec![vectors.clone()],
|
||||||
|
)
|
||||||
|
.unwrap()]);
|
||||||
|
|
||||||
|
let reader: Box<dyn RecordBatchReader + Send> = Box::new(batches);
|
||||||
|
let mut table = Table::create(uri, "test", reader).await.unwrap();
|
||||||
|
|
||||||
|
let mut i = IvfPQIndexBuilder::new();
|
||||||
|
|
||||||
|
let index_builder = i
|
||||||
|
.column("embeddings".to_string())
|
||||||
|
.index_name("my_index".to_string())
|
||||||
|
.ivf_params(IvfBuildParams::new(256))
|
||||||
|
.pq_params(PQBuildParams::default());
|
||||||
|
|
||||||
|
table.create_index(index_builder).await.unwrap();
|
||||||
|
|
||||||
|
assert_eq!(table.dataset.load_indices().await.unwrap().len(), 1);
|
||||||
|
assert_eq!(table.count_rows().await.unwrap(), 512);
|
||||||
|
assert_eq!(table.name, "test");
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_fixed_size_list<T: Array>(values: T, list_size: i32) -> Result<FixedSizeListArray> {
|
||||||
|
let list_type = DataType::FixedSizeList(
|
||||||
|
Arc::new(Field::new("item", values.data_type().clone(), true)),
|
||||||
|
list_size,
|
||||||
|
);
|
||||||
|
let data = ArrayDataBuilder::new(list_type)
|
||||||
|
.len(values.len() / list_size as usize)
|
||||||
|
.add_child_data(values.into_data())
|
||||||
|
.build()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
Ok(FixedSizeListArray::from(data))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user