mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-18 04:20:39 +00:00
feat: make it easier to create empty tables (#942)
This PR also reworks the table creation utilities significantly so that they are more consistent, built on top of each other, and thoroughly documented.
This commit is contained in:
@@ -13,9 +13,10 @@
|
||||
// limitations under the License.
|
||||
|
||||
import { describe } from 'mocha'
|
||||
import { assert } from 'chai'
|
||||
import { assert, expect, use as chaiUse } from 'chai'
|
||||
import * as chaiAsPromised from 'chai-as-promised'
|
||||
|
||||
import { fromTableToBuffer, makeArrowTable } from '../arrow'
|
||||
import { convertToTable, fromTableToBuffer, makeArrowTable, makeEmptyTable } from '../arrow'
|
||||
import {
|
||||
Field,
|
||||
FixedSizeList,
|
||||
@@ -24,21 +25,79 @@ import {
|
||||
Int32,
|
||||
tableFromIPC,
|
||||
Schema,
|
||||
Float64
|
||||
Float64,
|
||||
type Table,
|
||||
Binary,
|
||||
Bool,
|
||||
Utf8,
|
||||
Struct,
|
||||
List,
|
||||
DataType,
|
||||
Dictionary,
|
||||
Int64
|
||||
} from 'apache-arrow'
|
||||
import { type EmbeddingFunction } from '../embedding/embedding_function'
|
||||
|
||||
describe('Apache Arrow tables', function () {
|
||||
it('customized schema', async function () {
|
||||
chaiUse(chaiAsPromised)
|
||||
|
||||
function sampleRecords (): Array<Record<string, any>> {
|
||||
return [
|
||||
{
|
||||
binary: Buffer.alloc(5),
|
||||
boolean: false,
|
||||
number: 7,
|
||||
string: 'hello',
|
||||
struct: { x: 0, y: 0 },
|
||||
list: ['anime', 'action', 'comedy']
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
// Helper method to verify various ways to create a table
|
||||
async function checkTableCreation (tableCreationMethod: (records: any, recordsReversed: any, schema: Schema) => Promise<Table>): Promise<void> {
|
||||
const records = sampleRecords()
|
||||
const recordsReversed = [{
|
||||
list: ['anime', 'action', 'comedy'],
|
||||
struct: { x: 0, y: 0 },
|
||||
string: 'hello',
|
||||
number: 7,
|
||||
boolean: false,
|
||||
binary: Buffer.alloc(5)
|
||||
}]
|
||||
const schema = new Schema([
|
||||
new Field('binary', new Binary(), false),
|
||||
new Field('boolean', new Bool(), false),
|
||||
new Field('number', new Float64(), false),
|
||||
new Field('string', new Utf8(), false),
|
||||
new Field('struct', new Struct([
|
||||
new Field('x', new Float64(), false),
|
||||
new Field('y', new Float64(), false)
|
||||
])),
|
||||
new Field('list', new List(new Field('item', new Utf8(), false)), false)
|
||||
])
|
||||
|
||||
const table = await tableCreationMethod(records, recordsReversed, schema)
|
||||
schema.fields.forEach((field, idx) => {
|
||||
const actualField = table.schema.fields[idx]
|
||||
assert.isFalse(actualField.nullable)
|
||||
assert.equal(table.getChild(field.name)?.type.toString(), field.type.toString())
|
||||
assert.equal(table.getChildAt(idx)?.type.toString(), field.type.toString())
|
||||
})
|
||||
}
|
||||
|
||||
describe('The function makeArrowTable', function () {
|
||||
it('will use data types from a provided schema instead of inference', async function () {
|
||||
const schema = new Schema([
|
||||
new Field('a', new Int32()),
|
||||
new Field('b', new Float32()),
|
||||
new Field('c', new FixedSizeList(3, new Field('item', new Float16())))
|
||||
new Field('c', new FixedSizeList(3, new Field('item', new Float16()))),
|
||||
new Field('d', new Int64())
|
||||
])
|
||||
const table = makeArrowTable(
|
||||
[
|
||||
{ a: 1, b: 2, c: [1, 2, 3] },
|
||||
{ a: 4, b: 5, c: [4, 5, 6] },
|
||||
{ a: 7, b: 8, c: [7, 8, 9] }
|
||||
{ a: 1, b: 2, c: [1, 2, 3], d: 9 },
|
||||
{ a: 4, b: 5, c: [4, 5, 6], d: 10 },
|
||||
{ a: 7, b: 8, c: [7, 8, 9], d: null }
|
||||
],
|
||||
{ schema }
|
||||
)
|
||||
@@ -52,13 +111,13 @@ describe('Apache Arrow tables', function () {
|
||||
assert.deepEqual(actualSchema, schema)
|
||||
})
|
||||
|
||||
it('default vector column', async function () {
|
||||
it('will assume the column `vector` is FixedSizeList<Float32> by default', async function () {
|
||||
const schema = new Schema([
|
||||
new Field('a', new Float64()),
|
||||
new Field('b', new Float64()),
|
||||
new Field(
|
||||
'vector',
|
||||
new FixedSizeList(3, new Field('item', new Float32()))
|
||||
new FixedSizeList(3, new Field('item', new Float32(), true))
|
||||
)
|
||||
])
|
||||
const table = makeArrowTable([
|
||||
@@ -76,12 +135,12 @@ describe('Apache Arrow tables', function () {
|
||||
assert.deepEqual(actualSchema, schema)
|
||||
})
|
||||
|
||||
it('2 vector columns', async function () {
|
||||
it('can support multiple vector columns', async function () {
|
||||
const schema = new Schema([
|
||||
new Field('a', new Float64()),
|
||||
new Field('b', new Float64()),
|
||||
new Field('vec1', new FixedSizeList(3, new Field('item', new Float16()))),
|
||||
new Field('vec2', new FixedSizeList(3, new Field('item', new Float16())))
|
||||
new Field('vec1', new FixedSizeList(3, new Field('item', new Float16(), true))),
|
||||
new Field('vec2', new FixedSizeList(3, new Field('item', new Float16(), true)))
|
||||
])
|
||||
const table = makeArrowTable(
|
||||
[
|
||||
@@ -105,4 +164,157 @@ describe('Apache Arrow tables', function () {
|
||||
const actualSchema = actual.schema
|
||||
assert.deepEqual(actualSchema, schema)
|
||||
})
|
||||
|
||||
it('will allow different vector column types', async function () {
|
||||
const table = makeArrowTable(
|
||||
[
|
||||
{ fp16: [1], fp32: [1], fp64: [1] }
|
||||
],
|
||||
{
|
||||
vectorColumns: {
|
||||
fp16: { type: new Float16() },
|
||||
fp32: { type: new Float32() },
|
||||
fp64: { type: new Float64() }
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
assert.equal(table.getChild('fp16')?.type.children[0].type.toString(), new Float16().toString())
|
||||
assert.equal(table.getChild('fp32')?.type.children[0].type.toString(), new Float32().toString())
|
||||
assert.equal(table.getChild('fp64')?.type.children[0].type.toString(), new Float64().toString())
|
||||
})
|
||||
|
||||
it('will use dictionary encoded strings if asked', async function () {
|
||||
const table = makeArrowTable([{ str: 'hello' }])
|
||||
assert.isTrue(DataType.isUtf8(table.getChild('str')?.type))
|
||||
|
||||
const tableWithDict = makeArrowTable([{ str: 'hello' }], { dictionaryEncodeStrings: true })
|
||||
assert.isTrue(DataType.isDictionary(tableWithDict.getChild('str')?.type))
|
||||
|
||||
const schema = new Schema([
|
||||
new Field('str', new Dictionary(new Utf8(), new Int32()))
|
||||
])
|
||||
|
||||
const tableWithDict2 = makeArrowTable([{ str: 'hello' }], { schema })
|
||||
assert.isTrue(DataType.isDictionary(tableWithDict2.getChild('str')?.type))
|
||||
})
|
||||
|
||||
it('will infer data types correctly', async function () {
|
||||
await checkTableCreation(async (records) => makeArrowTable(records))
|
||||
})
|
||||
|
||||
it('will allow a schema to be provided', async function () {
|
||||
await checkTableCreation(async (records, _, schema) => makeArrowTable(records, { schema }))
|
||||
})
|
||||
|
||||
it('will use the field order of any provided schema', async function () {
|
||||
await checkTableCreation(async (_, recordsReversed, schema) => makeArrowTable(recordsReversed, { schema }))
|
||||
})
|
||||
|
||||
it('will make an empty table', async function () {
|
||||
await checkTableCreation(async (_, __, schema) => makeArrowTable([], { schema }))
|
||||
})
|
||||
})
|
||||
|
||||
class DummyEmbedding implements EmbeddingFunction<string> {
|
||||
public readonly sourceColumn = 'string'
|
||||
public readonly embeddingDimension = 2
|
||||
public readonly embeddingDataType = new Float16()
|
||||
|
||||
async embed (data: string[]): Promise<number[][]> {
|
||||
return data.map(
|
||||
() => [0.0, 0.0]
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
class DummyEmbeddingWithNoDimension implements EmbeddingFunction<string> {
|
||||
public readonly sourceColumn = 'string'
|
||||
|
||||
async embed (data: string[]): Promise<number[][]> {
|
||||
return data.map(
|
||||
() => [0.0, 0.0]
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
describe('convertToTable', function () {
|
||||
it('will infer data types correctly', async function () {
|
||||
await checkTableCreation(async (records) => await convertToTable(records))
|
||||
})
|
||||
|
||||
it('will allow a schema to be provided', async function () {
|
||||
await checkTableCreation(async (records, _, schema) => await convertToTable(records, undefined, { schema }))
|
||||
})
|
||||
|
||||
it('will use the field order of any provided schema', async function () {
|
||||
await checkTableCreation(async (_, recordsReversed, schema) => await convertToTable(recordsReversed, undefined, { schema }))
|
||||
})
|
||||
|
||||
it('will make an empty table', async function () {
|
||||
await checkTableCreation(async (_, __, schema) => await convertToTable([], undefined, { schema }))
|
||||
})
|
||||
|
||||
it('will apply embeddings', async function () {
|
||||
const records = sampleRecords()
|
||||
const table = await convertToTable(records, new DummyEmbedding())
|
||||
assert.isTrue(DataType.isFixedSizeList(table.getChild('vector')?.type))
|
||||
assert.equal(table.getChild('vector')?.type.children[0].type.toString(), new Float16().toString())
|
||||
})
|
||||
|
||||
it('will fail if missing the embedding source column', async function () {
|
||||
return await expect(convertToTable([{ id: 1 }], new DummyEmbedding())).to.be.rejectedWith("'string' was not present")
|
||||
})
|
||||
|
||||
it('use embeddingDimension if embedding missing from table', async function () {
|
||||
const schema = new Schema([
|
||||
new Field('string', new Utf8(), false)
|
||||
])
|
||||
// Simulate getting an empty Arrow table (minus embedding) from some other source
|
||||
// In other words, we aren't starting with records
|
||||
const table = makeEmptyTable(schema)
|
||||
|
||||
// If the embedding specifies the dimension we are fine
|
||||
await fromTableToBuffer(table, new DummyEmbedding())
|
||||
|
||||
// We can also supply a schema and should be ok
|
||||
const schemaWithEmbedding = new Schema([
|
||||
new Field('string', new Utf8(), false),
|
||||
new Field('vector', new FixedSizeList(2, new Field('item', new Float16(), false)), false)
|
||||
])
|
||||
await fromTableToBuffer(table, new DummyEmbeddingWithNoDimension(), schemaWithEmbedding)
|
||||
|
||||
// Otherwise we will get an error
|
||||
return await expect(fromTableToBuffer(table, new DummyEmbeddingWithNoDimension())).to.be.rejectedWith('does not specify `embeddingDimension`')
|
||||
})
|
||||
|
||||
it('will apply embeddings to an empty table', async function () {
|
||||
const schema = new Schema([
|
||||
new Field('string', new Utf8(), false),
|
||||
new Field('vector', new FixedSizeList(2, new Field('item', new Float16(), false)), false)
|
||||
])
|
||||
const table = await convertToTable([], new DummyEmbedding(), { schema })
|
||||
assert.isTrue(DataType.isFixedSizeList(table.getChild('vector')?.type))
|
||||
assert.equal(table.getChild('vector')?.type.children[0].type.toString(), new Float16().toString())
|
||||
})
|
||||
|
||||
it('will complain if embeddings present but schema missing embedding column', async function () {
|
||||
const schema = new Schema([
|
||||
new Field('string', new Utf8(), false)
|
||||
])
|
||||
return await expect(convertToTable([], new DummyEmbedding(), { schema })).to.be.rejectedWith('column vector was missing')
|
||||
})
|
||||
|
||||
it('will provide a nice error if run twice', async function () {
|
||||
const records = sampleRecords()
|
||||
const table = await convertToTable(records, new DummyEmbedding())
|
||||
// fromTableToBuffer will try and apply the embeddings again
|
||||
return await expect(fromTableToBuffer(table, new DummyEmbedding())).to.be.rejectedWith('already existed')
|
||||
})
|
||||
})
|
||||
|
||||
describe('makeEmptyTable', function () {
|
||||
it('will make an empty table', async function () {
|
||||
await checkTableCreation(async (_, __, schema) => makeEmptyTable(schema))
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user