mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-10 13:52:58 +00:00
fix: sanitize foreign schemas (#1058)
Arrow-js uses brittle `instanceof` checks throughout the codebase. These fail unless the library instance that produced the object is exactly the same instance that vectordb is using. At a minimum, this means that a user using arrow version 15 (or any version that doesn't exactly match the version that vectordb is using) will get strange errors when they try to use vectordb. However, there are even cases where the versions are perfectly identical and the `instanceof` check still fails. One such example is when using `vite` (e.g. https://github.com/vitejs/vite/issues/3910). This PR solves the problem in a rather brute-force, but workable, fashion. If we encounter a schema that does not pass the `instanceof` check, then we will attempt to sanitize that schema by traversing the object and, if it has all the correct properties, constructing an appropriate `Schema` instance via deep cloning.
This commit is contained in:
@@ -34,8 +34,20 @@ import {
|
||||
List,
|
||||
DataType,
|
||||
Dictionary,
|
||||
Int64
|
||||
Int64,
|
||||
MetadataVersion
|
||||
} from 'apache-arrow'
|
||||
import {
|
||||
Dictionary as OldDictionary,
|
||||
Field as OldField,
|
||||
FixedSizeList as OldFixedSizeList,
|
||||
Float32 as OldFloat32,
|
||||
Int32 as OldInt32,
|
||||
Struct as OldStruct,
|
||||
Schema as OldSchema,
|
||||
TimestampNanosecond as OldTimestampNanosecond,
|
||||
Utf8 as OldUtf8
|
||||
} from 'apache-arrow-old'
|
||||
import { type EmbeddingFunction } from '../embedding/embedding_function'
|
||||
|
||||
chaiUse(chaiAsPromised)
|
||||
@@ -318,3 +330,31 @@ describe('makeEmptyTable', function () {
|
||||
await checkTableCreation(async (_, __, schema) => makeEmptyTable(schema))
|
||||
})
|
||||
})
|
||||
|
||||
// Test suite for a schema built with the classes imported from 'apache-arrow-old'
// instead of the ones imported from 'apache-arrow'.
describe('when using two versions of arrow', function () {
|
||||
it('can still import data', async function() {
|
||||
// Build the schema entirely from the Old* classes: an int32 field, a fixed-size-list
// field of float32 items, and a struct holding a dictionary field plus two
// nanosecond-timestamp fields (one with a timezone string, one with null).
const schema = new OldSchema([
|
||||
new OldField('id', new OldInt32()),
|
||||
new OldField('vector', new OldFixedSizeList(1024, new OldField("item", new OldFloat32(), true))),
|
||||
new OldField('struct', new OldStruct([
|
||||
new OldField('nested', new OldDictionary(new OldUtf8(), new OldInt32(), 1, true)),
|
||||
new OldField('ts_with_tz', new OldTimestampNanosecond("some_tz")),
|
||||
new OldField('ts_no_tz', new OldTimestampNanosecond(null))
|
||||
]))
|
||||
// NOTE(review): the `as any` cast presumably exists so the metadataVersion
// assignment below type-checks -- confirm.
]) as any
|
||||
// We use arrow version 13 to emulate a "foreign arrow" and this version doesn't have metadataVersion
|
||||
// In theory, this wouldn't matter. We don't rely on that property. However, it causes deepEqual to
|
||||
// fail so we patch it back in
|
||||
schema.metadataVersion = MetadataVersion.V5
|
||||
// No rows are supplied; only the schema built above is passed in.
const table = makeArrowTable(
|
||||
[],
|
||||
{ schema }
|
||||
)
|
||||
|
||||
// Serialize the table to a buffer and check that something was written.
const buf = await fromTableToBuffer(table)
|
||||
assert.isAbove(buf.byteLength, 0)
|
||||
// Read the buffer back with tableFromIPC and compare the decoded schema
// against the original (patched) schema.
const actual = tableFromIPC(buf)
|
||||
const actualSchema = actual.schema
|
||||
assert.deepEqual(actualSchema, schema)
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user