feat(nodejs): huggingface compatible transformers (#1462)

This commit is contained in:
Cory Grinstead
2024-07-26 12:54:15 -07:00
committed by GitHub
parent 9555efacf9
commit fbfe2444a8
12 changed files with 1806 additions and 52 deletions

View File

@@ -9,7 +9,8 @@
"version": "1.0.0",
"license": "Apache-2.0",
"dependencies": {
"@lancedb/lancedb": "file:../"
"@lancedb/lancedb": "file:../",
"@xenova/transformers": "^2.17.2"
},
"peerDependencies": {
"typescript": "^5.0.0"
@@ -17,7 +18,7 @@
},
"..": {
"name": "@lancedb/lancedb",
"version": "0.6.0",
"version": "0.7.1",
"cpu": [
"x64",
"arm64"
@@ -29,17 +30,16 @@
"win32"
],
"dependencies": {
"apache-arrow": "^15.0.0",
"axios": "^1.7.2",
"openai": "^4.29.2",
"reflect-metadata": "^0.2.2"
},
"devDependencies": {
"@aws-sdk/client-dynamodb": "^3.33.0",
"@aws-sdk/client-kms": "^3.33.0",
"@aws-sdk/client-s3": "^3.33.0",
"@biomejs/biome": "^1.7.3",
"@jest/globals": "^29.7.0",
"@napi-rs/cli": "^2.18.0",
"@napi-rs/cli": "^2.18.3",
"@types/axios": "^0.14.0",
"@types/jest": "^29.1.2",
"@types/tmp": "^0.2.6",
@@ -56,12 +56,746 @@
},
"engines": {
"node": ">= 18"
},
"optionalDependencies": {
"@xenova/transformers": "^2.17.2",
"openai": "^4.29.2"
},
"peerDependencies": {
"apache-arrow": "^15.0.0"
}
},
"node_modules/@huggingface/jinja": {
"version": "0.2.2",
"resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.2.tgz",
"integrity": "sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==",
"engines": {
"node": ">=18"
}
},
"node_modules/@lancedb/lancedb": {
"resolved": "..",
"link": true
},
"node_modules/@protobufjs/aspromise": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
"integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ=="
},
"node_modules/@protobufjs/base64": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz",
"integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg=="
},
"node_modules/@protobufjs/codegen": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz",
"integrity": "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg=="
},
"node_modules/@protobufjs/eventemitter": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz",
"integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q=="
},
"node_modules/@protobufjs/fetch": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz",
"integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==",
"dependencies": {
"@protobufjs/aspromise": "^1.1.1",
"@protobufjs/inquire": "^1.1.0"
}
},
"node_modules/@protobufjs/float": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz",
"integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ=="
},
"node_modules/@protobufjs/inquire": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz",
"integrity": "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q=="
},
"node_modules/@protobufjs/path": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz",
"integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA=="
},
"node_modules/@protobufjs/pool": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz",
"integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw=="
},
"node_modules/@protobufjs/utf8": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
"integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="
},
"node_modules/@types/long": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/@types/long/-/long-4.0.2.tgz",
"integrity": "sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA=="
},
"node_modules/@types/node": {
"version": "20.14.11",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.11.tgz",
"integrity": "sha512-kprQpL8MMeszbz6ojB5/tU8PLN4kesnN8Gjzw349rDlNgsSzg90lAVj3llK99Dh7JON+t9AuscPPFW6mPbTnSA==",
"dependencies": {
"undici-types": "~5.26.4"
}
},
"node_modules/@xenova/transformers": {
"version": "2.17.2",
"resolved": "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.17.2.tgz",
"integrity": "sha512-lZmHqzrVIkSvZdKZEx7IYY51TK0WDrC8eR0c5IMnBsO8di8are1zzw8BlLhyO2TklZKLN5UffNGs1IJwT6oOqQ==",
"dependencies": {
"@huggingface/jinja": "^0.2.2",
"onnxruntime-web": "1.14.0",
"sharp": "^0.32.0"
},
"optionalDependencies": {
"onnxruntime-node": "1.14.0"
}
},
"node_modules/b4a": {
"version": "1.6.6",
"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.6.6.tgz",
"integrity": "sha512-5Tk1HLk6b6ctmjIkAcU/Ujv/1WqiDl0F0JdRCR80VsOcUlHcu7pWeWRlOqQLHfDEsVx9YH/aif5AG4ehoCtTmg=="
},
"node_modules/bare-events": {
"version": "2.4.2",
"resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.4.2.tgz",
"integrity": "sha512-qMKFd2qG/36aA4GwvKq8MxnPgCQAmBWmSyLWsJcbn8v03wvIPQ/hG1Ms8bPzndZxMDoHpxez5VOS+gC9Yi24/Q==",
"optional": true
},
"node_modules/bare-fs": {
"version": "2.3.1",
"resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-2.3.1.tgz",
"integrity": "sha512-W/Hfxc/6VehXlsgFtbB5B4xFcsCl+pAh30cYhoFyXErf6oGrwjh8SwiPAdHgpmWonKuYpZgGywN0SXt7dgsADA==",
"optional": true,
"dependencies": {
"bare-events": "^2.0.0",
"bare-path": "^2.0.0",
"bare-stream": "^2.0.0"
}
},
"node_modules/bare-os": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/bare-os/-/bare-os-2.4.0.tgz",
"integrity": "sha512-v8DTT08AS/G0F9xrhyLtepoo9EJBJ85FRSMbu1pQUlAf6A8T0tEEQGMVObWeqpjhSPXsE0VGlluFBJu2fdoTNg==",
"optional": true
},
"node_modules/bare-path": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/bare-path/-/bare-path-2.1.3.tgz",
"integrity": "sha512-lh/eITfU8hrj9Ru5quUp0Io1kJWIk1bTjzo7JH1P5dWmQ2EL4hFUlfI8FonAhSlgIfhn63p84CDY/x+PisgcXA==",
"optional": true,
"dependencies": {
"bare-os": "^2.1.0"
}
},
"node_modules/bare-stream": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.1.3.tgz",
"integrity": "sha512-tiDAH9H/kP+tvNO5sczyn9ZAA7utrSMobyDchsnyyXBuUe2FSQWbxhtuHB8jwpHYYevVo2UJpcmvvjrbHboUUQ==",
"optional": true,
"dependencies": {
"streamx": "^2.18.0"
}
},
"node_modules/base64-js": {
"version": "1.5.1",
"resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
"integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
]
},
"node_modules/bl": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz",
"integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==",
"dependencies": {
"buffer": "^5.5.0",
"inherits": "^2.0.4",
"readable-stream": "^3.4.0"
}
},
"node_modules/buffer": {
"version": "5.7.1",
"resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz",
"integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"dependencies": {
"base64-js": "^1.3.1",
"ieee754": "^1.1.13"
}
},
"node_modules/chownr": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz",
"integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg=="
},
"node_modules/color": {
"version": "4.2.3",
"resolved": "https://registry.npmjs.org/color/-/color-4.2.3.tgz",
"integrity": "sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A==",
"dependencies": {
"color-convert": "^2.0.1",
"color-string": "^1.9.0"
},
"engines": {
"node": ">=12.5.0"
}
},
"node_modules/color-convert": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
"integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
"dependencies": {
"color-name": "~1.1.4"
},
"engines": {
"node": ">=7.0.0"
}
},
"node_modules/color-name": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
"integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA=="
},
"node_modules/color-string": {
"version": "1.9.1",
"resolved": "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz",
"integrity": "sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==",
"dependencies": {
"color-name": "^1.0.0",
"simple-swizzle": "^0.2.2"
}
},
"node_modules/decompress-response": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz",
"integrity": "sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==",
"dependencies": {
"mimic-response": "^3.1.0"
},
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/deep-extend": {
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz",
"integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==",
"engines": {
"node": ">=4.0.0"
}
},
"node_modules/detect-libc": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.3.tgz",
"integrity": "sha512-bwy0MGW55bG41VqxxypOsdSdGqLwXPI/focwgTYCFMbdUiBAxLg9CFzG08sz2aqzknwiX7Hkl0bQENjg8iLByw==",
"engines": {
"node": ">=8"
}
},
"node_modules/end-of-stream": {
"version": "1.4.4",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz",
"integrity": "sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==",
"dependencies": {
"once": "^1.4.0"
}
},
"node_modules/expand-template": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz",
"integrity": "sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==",
"engines": {
"node": ">=6"
}
},
"node_modules/fast-fifo": {
"version": "1.3.2",
"resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz",
"integrity": "sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ=="
},
"node_modules/flatbuffers": {
"version": "1.12.0",
"resolved": "https://registry.npmjs.org/flatbuffers/-/flatbuffers-1.12.0.tgz",
"integrity": "sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ=="
},
"node_modules/fs-constants": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz",
"integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow=="
},
"node_modules/github-from-package": {
"version": "0.0.0",
"resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz",
"integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw=="
},
"node_modules/guid-typescript": {
"version": "1.0.9",
"resolved": "https://registry.npmjs.org/guid-typescript/-/guid-typescript-1.0.9.tgz",
"integrity": "sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ=="
},
"node_modules/ieee754": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
"integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
]
},
"node_modules/inherits": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="
},
"node_modules/ini": {
"version": "1.3.8",
"resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz",
"integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew=="
},
"node_modules/is-arrayish": {
"version": "0.3.2",
"resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz",
"integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ=="
},
"node_modules/long": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/long/-/long-4.0.0.tgz",
"integrity": "sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA=="
},
"node_modules/mimic-response": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz",
"integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/minimist": {
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
"integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==",
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/mkdirp-classic": {
"version": "0.5.3",
"resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A=="
},
"node_modules/napi-build-utils": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-1.0.2.tgz",
"integrity": "sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg=="
},
"node_modules/node-abi": {
"version": "3.65.0",
"resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.65.0.tgz",
"integrity": "sha512-ThjYBfoDNr08AWx6hGaRbfPwxKV9kVzAzOzlLKbk2CuqXE2xnCh+cbAGnwM3t8Lq4v9rUB7VfondlkBckcJrVA==",
"dependencies": {
"semver": "^7.3.5"
},
"engines": {
"node": ">=10"
}
},
"node_modules/node-addon-api": {
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz",
"integrity": "sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA=="
},
"node_modules/once": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
"integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
"dependencies": {
"wrappy": "1"
}
},
"node_modules/onnx-proto": {
"version": "4.0.4",
"resolved": "https://registry.npmjs.org/onnx-proto/-/onnx-proto-4.0.4.tgz",
"integrity": "sha512-aldMOB3HRoo6q/phyB6QRQxSt895HNNw82BNyZ2CMh4bjeKv7g/c+VpAFtJuEMVfYLMbRx61hbuqnKceLeDcDA==",
"dependencies": {
"protobufjs": "^6.8.8"
}
},
"node_modules/onnxruntime-common": {
"version": "1.14.0",
"resolved": "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.14.0.tgz",
"integrity": "sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew=="
},
"node_modules/onnxruntime-node": {
"version": "1.14.0",
"resolved": "https://registry.npmjs.org/onnxruntime-node/-/onnxruntime-node-1.14.0.tgz",
"integrity": "sha512-5ba7TWomIV/9b6NH/1x/8QEeowsb+jBEvFzU6z0T4mNsFwdPqXeFUM7uxC6QeSRkEbWu3qEB0VMjrvzN/0S9+w==",
"optional": true,
"os": [
"win32",
"darwin",
"linux"
],
"dependencies": {
"onnxruntime-common": "~1.14.0"
}
},
"node_modules/onnxruntime-web": {
"version": "1.14.0",
"resolved": "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.14.0.tgz",
"integrity": "sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw==",
"dependencies": {
"flatbuffers": "^1.12.0",
"guid-typescript": "^1.0.9",
"long": "^4.0.0",
"onnx-proto": "^4.0.4",
"onnxruntime-common": "~1.14.0",
"platform": "^1.3.6"
}
},
"node_modules/platform": {
"version": "1.3.6",
"resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
"integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg=="
},
"node_modules/prebuild-install": {
"version": "7.1.2",
"resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.2.tgz",
"integrity": "sha512-UnNke3IQb6sgarcZIDU3gbMeTp/9SSU1DAIkil7PrqG1vZlBtY5msYccSKSHDqa3hNg436IXK+SNImReuA1wEQ==",
"dependencies": {
"detect-libc": "^2.0.0",
"expand-template": "^2.0.3",
"github-from-package": "0.0.0",
"minimist": "^1.2.3",
"mkdirp-classic": "^0.5.3",
"napi-build-utils": "^1.0.1",
"node-abi": "^3.3.0",
"pump": "^3.0.0",
"rc": "^1.2.7",
"simple-get": "^4.0.0",
"tar-fs": "^2.0.0",
"tunnel-agent": "^0.6.0"
},
"bin": {
"prebuild-install": "bin.js"
},
"engines": {
"node": ">=10"
}
},
"node_modules/prebuild-install/node_modules/tar-fs": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.1.tgz",
"integrity": "sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==",
"dependencies": {
"chownr": "^1.1.1",
"mkdirp-classic": "^0.5.2",
"pump": "^3.0.0",
"tar-stream": "^2.1.4"
}
},
"node_modules/prebuild-install/node_modules/tar-stream": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz",
"integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==",
"dependencies": {
"bl": "^4.0.3",
"end-of-stream": "^1.4.1",
"fs-constants": "^1.0.0",
"inherits": "^2.0.3",
"readable-stream": "^3.1.1"
},
"engines": {
"node": ">=6"
}
},
"node_modules/protobufjs": {
"version": "6.11.4",
"resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.4.tgz",
"integrity": "sha512-5kQWPaJHi1WoCpjTGszzQ32PG2F4+wRY6BmAT4Vfw56Q2FZ4YZzK20xUYQH4YkfehY1e6QSICrJquM6xXZNcrw==",
"hasInstallScript": true,
"dependencies": {
"@protobufjs/aspromise": "^1.1.2",
"@protobufjs/base64": "^1.1.2",
"@protobufjs/codegen": "^2.0.4",
"@protobufjs/eventemitter": "^1.1.0",
"@protobufjs/fetch": "^1.1.0",
"@protobufjs/float": "^1.0.2",
"@protobufjs/inquire": "^1.1.0",
"@protobufjs/path": "^1.1.2",
"@protobufjs/pool": "^1.1.0",
"@protobufjs/utf8": "^1.1.0",
"@types/long": "^4.0.1",
"@types/node": ">=13.7.0",
"long": "^4.0.0"
},
"bin": {
"pbjs": "bin/pbjs",
"pbts": "bin/pbts"
}
},
"node_modules/pump": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz",
"integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==",
"dependencies": {
"end-of-stream": "^1.1.0",
"once": "^1.3.1"
}
},
"node_modules/queue-tick": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/queue-tick/-/queue-tick-1.0.1.tgz",
"integrity": "sha512-kJt5qhMxoszgU/62PLP1CJytzd2NKetjSRnyuj31fDd3Rlcz3fzlFdFLD1SItunPwyqEOkca6GbV612BWfaBag=="
},
"node_modules/rc": {
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz",
"integrity": "sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==",
"dependencies": {
"deep-extend": "^0.6.0",
"ini": "~1.3.0",
"minimist": "^1.2.0",
"strip-json-comments": "~2.0.1"
},
"bin": {
"rc": "cli.js"
}
},
"node_modules/readable-stream": {
"version": "3.6.2",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz",
"integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==",
"dependencies": {
"inherits": "^2.0.3",
"string_decoder": "^1.1.1",
"util-deprecate": "^1.0.1"
},
"engines": {
"node": ">= 6"
}
},
"node_modules/safe-buffer": {
"version": "5.2.1",
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
"integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
]
},
"node_modules/semver": {
"version": "7.6.3",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz",
"integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==",
"bin": {
"semver": "bin/semver.js"
},
"engines": {
"node": ">=10"
}
},
"node_modules/sharp": {
"version": "0.32.6",
"resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz",
"integrity": "sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==",
"hasInstallScript": true,
"dependencies": {
"color": "^4.2.3",
"detect-libc": "^2.0.2",
"node-addon-api": "^6.1.0",
"prebuild-install": "^7.1.1",
"semver": "^7.5.4",
"simple-get": "^4.0.1",
"tar-fs": "^3.0.4",
"tunnel-agent": "^0.6.0"
},
"engines": {
"node": ">=14.15.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/simple-concat": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz",
"integrity": "sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
]
},
"node_modules/simple-get": {
"version": "4.0.1",
"resolved": "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz",
"integrity": "sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"dependencies": {
"decompress-response": "^6.0.0",
"once": "^1.3.1",
"simple-concat": "^1.0.0"
}
},
"node_modules/simple-swizzle": {
"version": "0.2.2",
"resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz",
"integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==",
"dependencies": {
"is-arrayish": "^0.3.1"
}
},
"node_modules/streamx": {
"version": "2.18.0",
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.18.0.tgz",
"integrity": "sha512-LLUC1TWdjVdn1weXGcSxyTR3T4+acB6tVGXT95y0nGbca4t4o/ng1wKAGTljm9VicuCVLvRlqFYXYy5GwgM7sQ==",
"dependencies": {
"fast-fifo": "^1.3.2",
"queue-tick": "^1.0.1",
"text-decoder": "^1.1.0"
},
"optionalDependencies": {
"bare-events": "^2.2.0"
}
},
"node_modules/string_decoder": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
"integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==",
"dependencies": {
"safe-buffer": "~5.2.0"
}
},
"node_modules/strip-json-comments": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz",
"integrity": "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/tar-fs": {
"version": "3.0.6",
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.6.tgz",
"integrity": "sha512-iokBDQQkUyeXhgPYaZxmczGPhnhXZ0CmrqI+MOb/WFGS9DW5wnfrLgtjUJBvz50vQ3qfRwJ62QVoCFu8mPVu5w==",
"dependencies": {
"pump": "^3.0.0",
"tar-stream": "^3.1.5"
},
"optionalDependencies": {
"bare-fs": "^2.1.1",
"bare-path": "^2.1.0"
}
},
"node_modules/tar-stream": {
"version": "3.1.7",
"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz",
"integrity": "sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==",
"dependencies": {
"b4a": "^1.6.4",
"fast-fifo": "^1.2.0",
"streamx": "^2.15.0"
}
},
"node_modules/text-decoder": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.1.1.tgz",
"integrity": "sha512-8zll7REEv4GDD3x4/0pW+ppIxSNs7H1J10IKFZsuOMscumCdM2a+toDGLPA3T+1+fLBql4zbt5z83GEQGGV5VA==",
"dependencies": {
"b4a": "^1.6.4"
}
},
"node_modules/tunnel-agent": {
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
"integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==",
"dependencies": {
"safe-buffer": "^5.0.1"
},
"engines": {
"node": "*"
}
},
"node_modules/typescript": {
"version": "5.5.2",
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.2.tgz",
@@ -74,6 +808,21 @@
"engines": {
"node": ">=14.17"
}
},
"node_modules/undici-types": {
"version": "5.26.5",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA=="
},
"node_modules/util-deprecate": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
"integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw=="
},
"node_modules/wrappy": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="
}
}
}

View File

@@ -10,7 +10,8 @@
"author": "Lance Devs",
"license": "Apache-2.0",
"dependencies": {
"@lancedb/lancedb": "file:../"
"@lancedb/lancedb": "file:../",
"@xenova/transformers": "^2.17.2"
},
"peerDependencies": {
"typescript": "^5.0.0"

View File

@@ -0,0 +1,50 @@
import * as lancedb from "@lancedb/lancedb";
import { LanceSchema, getRegistry } from "@lancedb/lancedb/embedding";
import { Utf8 } from "apache-arrow";
const db = await lancedb.connect("/tmp/db");
const func = await getRegistry().get("huggingface").create();
const facts = [
"Albert Einstein was a theoretical physicist.",
"The capital of France is Paris.",
"The Great Wall of China is one of the Seven Wonders of the World.",
"Python is a popular programming language.",
"Mount Everest is the highest mountain in the world.",
"Leonardo da Vinci painted the Mona Lisa.",
"Shakespeare wrote Hamlet.",
"The human body has 206 bones.",
"The speed of light is approximately 299,792 kilometers per second.",
"Water boils at 100 degrees Celsius.",
"The Earth orbits the Sun.",
"The Pyramids of Giza are located in Egypt.",
"Coffee is one of the most popular beverages in the world.",
"Tokyo is the capital city of Japan.",
"Photosynthesis is the process by which plants make their food.",
"The Pacific Ocean is the largest ocean on Earth.",
"Mozart was a prolific composer of classical music.",
"The Internet is a global network of computers.",
"Basketball is a sport played with a ball and a hoop.",
"The first computer virus was created in 1983.",
"Artificial neural networks are inspired by the human brain.",
"Deep learning is a subset of machine learning.",
"IBM's Watson won Jeopardy! in 2011.",
"The first computer programmer was Ada Lovelace.",
"The first chatbot was ELIZA, created in the 1960s.",
].map((text) => ({ text }));
const factsSchema = LanceSchema({
text: func.sourceField(new Utf8()),
vector: func.vectorField(),
});
const tbl = await db.createTable("facts", facts, {
mode: "overwrite",
schema: factsSchema,
});
const query = "How many bones are in the human body?";
const actual = await tbl.search(query).limit(1).toArray();
console.log("Answer: ", actual[0]["text"]);

View File

@@ -578,7 +578,7 @@ async function applyEmbeddingsFromMetadata(
schema: Schema,
): Promise<ArrowTable> {
const registry = getRegistry();
const functions = registry.parseFunctions(schema.metadata);
const functions = await registry.parseFunctions(schema.metadata);
const columns = Object.fromEntries(
table.schema.fields.map((field) => [

View File

@@ -240,6 +240,7 @@ export class LocalConnection extends Connection {
): Promise<Table> {
if (typeof nameOrOptions !== "string" && "name" in nameOrOptions) {
const { name, data, ...options } = nameOrOptions;
return this.createTable(name, data, options);
}
if (data === undefined) {

View File

@@ -41,6 +41,7 @@ export interface EmbeddingFunctionConstructor<
> {
new (modelOptions?: T["TOptions"]): T;
}
/**
* An embedding function that automatically creates vector representation for a given column.
*/
@@ -82,6 +83,8 @@ export abstract class EmbeddingFunction<
*/
abstract toJSON(): Partial<M>;
async init?(): Promise<void>;
/**
* sourceField is used in combination with `LanceSchema` to provide a declarative data model
*

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
import { DataType, Field, Schema } from "../arrow";
import { Field, Schema } from "../arrow";
import { isDataType } from "../arrow";
import { sanitizeType } from "../sanitize";
import { EmbeddingFunction } from "./embedding_function";
@@ -22,6 +22,7 @@ export { EmbeddingFunction } from "./embedding_function";
// We need to explicitly export '*' so that the `register` decorator actually registers the class.
export * from "./openai";
export * from "./transformers";
export * from "./registry";
/**

View File

@@ -18,9 +18,14 @@ import {
} from "./embedding_function";
import "reflect-metadata";
import { OpenAIEmbeddingFunction } from "./openai";
import { TransformersEmbeddingFunction } from "./transformers";
type CreateReturnType<T> = T extends { init: () => Promise<void> }
? Promise<T>
: T;
interface EmbeddingFunctionCreate<T extends EmbeddingFunction> {
create(options?: T["TOptions"]): T;
create(options?: T["TOptions"]): CreateReturnType<T>;
}
/**
@@ -61,38 +66,43 @@ export class EmbeddingFunctionRegistry {
};
}
get(name: "openai"): EmbeddingFunctionCreate<OpenAIEmbeddingFunction>;
get(
name: "huggingface",
): EmbeddingFunctionCreate<TransformersEmbeddingFunction>;
get<T extends EmbeddingFunction<unknown>>(
name: string,
): EmbeddingFunctionCreate<T> | undefined;
/**
* Fetch an embedding function by name
* @param name The name of the function
*/
get<T extends EmbeddingFunction<unknown>, Name extends string = "">(
name: Name extends "openai" ? "openai" : string,
//This makes it so that you can use string constants as "types", or use an explicitly supplied type
// ex:
// `registry.get("openai") -> EmbeddingFunctionCreate<OpenAIEmbeddingFunction>`
// `registry.get<MyCustomEmbeddingFunction>("my_func") -> EmbeddingFunctionCreate<MyCustomEmbeddingFunction> | undefined`
//
// the reason this is important is that we always know our built in functions are defined so the user isnt forced to do a non null/undefined
// ```ts
// const openai: OpenAIEmbeddingFunction = registry.get("openai").create()
// ```
): Name extends "openai"
? EmbeddingFunctionCreate<OpenAIEmbeddingFunction>
: EmbeddingFunctionCreate<T> | undefined {
type Output = Name extends "openai"
? EmbeddingFunctionCreate<OpenAIEmbeddingFunction>
: EmbeddingFunctionCreate<T> | undefined;
get(name: string) {
const factory = this.#functions.get(name);
if (!factory) {
return undefined as Output;
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
return undefined as any;
}
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
let create: any;
if (factory.prototype.init) {
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
create = async function (options?: any) {
const instance = new factory(options);
await instance.init!();
return instance;
};
} else {
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
create = function (options?: any) {
const instance = new factory(options);
return instance;
};
}
return {
create: function (options?: T["TOptions"]) {
return new factory(options);
},
} as Output;
create,
};
}
/**
@@ -105,10 +115,10 @@ export class EmbeddingFunctionRegistry {
/**
* @ignore
*/
parseFunctions(
async parseFunctions(
this: EmbeddingFunctionRegistry,
metadata: Map<string, string>,
): Map<string, EmbeddingFunctionConfig> {
): Promise<Map<string, EmbeddingFunctionConfig>> {
if (!metadata.has("embedding_functions")) {
return new Map();
} else {
@@ -118,25 +128,30 @@ export class EmbeddingFunctionRegistry {
vectorColumn: string;
model: EmbeddingFunction["TOptions"];
};
const functions = <FunctionConfig[]>(
JSON.parse(metadata.get("embedding_functions")!)
);
return new Map(
functions.map((f) => {
const items: [string, EmbeddingFunctionConfig][] = await Promise.all(
functions.map(async (f) => {
const fn = this.get(f.name);
if (!fn) {
throw new Error(`Function "${f.name}" not found in registry`);
}
const func = await this.get(f.name)!.create(f.model);
return [
f.name,
{
sourceColumn: f.sourceColumn,
vectorColumn: f.vectorColumn,
function: this.get(f.name)!.create(f.model),
function: func,
},
];
}),
);
return new Map(items);
}
}
// biome-ignore lint/suspicious/noExplicitAny: <explanation>

View File

@@ -0,0 +1,193 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import { Float, Float32 } from "../arrow";
import { EmbeddingFunction } from "./embedding_function";
import { register } from "./registry";
export type XenovaTransformerOptions = {
/** The wasm compatible model to use */
model: string;
/**
* The wasm compatible tokenizer to use
* If not provided, it will use the default tokenizer for the model
*/
tokenizer?: string;
/**
* The number of dimensions of the embeddings
*
* We will attempt to infer this from the model config if not provided.
* Since there isn't a standard way to get this information from the model,
* you may need to manually specify this if using a model that doesn't have a 'hidden_size' in the config.
* */
ndims?: number;
/** Options for the tokenizer */
tokenizerOptions?: {
textPair?: string | string[];
padding?: boolean | "max_length";
addSpecialTokens?: boolean;
truncation?: boolean;
maxLength?: number;
};
};
@register("huggingface")
export class TransformersEmbeddingFunction extends EmbeddingFunction<
string,
Partial<XenovaTransformerOptions>
> {
#model?: import("@xenova/transformers").PreTrainedModel;
#tokenizer?: import("@xenova/transformers").PreTrainedTokenizer;
#modelName: XenovaTransformerOptions["model"];
#initialized = false;
#tokenizerOptions: XenovaTransformerOptions["tokenizerOptions"];
#ndims?: number;
constructor(
options: Partial<XenovaTransformerOptions> = {
model: "Xenova/all-MiniLM-L6-v2",
},
) {
super();
const modelName = options?.model ?? "Xenova/all-MiniLM-L6-v2";
this.#tokenizerOptions = {
padding: true,
...options.tokenizerOptions,
};
this.#ndims = options.ndims;
this.#modelName = modelName;
}
toJSON() {
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
const obj: Record<string, any> = {
model: this.#modelName,
};
if (this.#ndims) {
obj["ndims"] = this.#ndims;
}
if (this.#tokenizerOptions) {
obj["tokenizerOptions"] = this.#tokenizerOptions;
}
if (this.#tokenizer) {
obj["tokenizer"] = this.#tokenizer.name;
}
return obj;
}
async init() {
let transformers;
try {
// SAFETY:
// since typescript transpiles `import` to `require`, we need to do this in an unsafe way
// We can't use `require` because `@xenova/transformers` is an ESM module
// and we can't use `import` directly because typescript will transpile it to `require`.
// and we want to remain compatible with both ESM and CJS modules
// so we use `eval` to bypass typescript for this specific import.
transformers = await eval('import("@xenova/transformers")');
} catch (e) {
throw new Error(`error loading @xenova/transformers\nReason: ${e}`);
}
try {
this.#model = await transformers.AutoModel.from_pretrained(
this.#modelName,
);
} catch (e) {
throw new Error(
`error loading model ${this.#modelName}. Make sure you are using a wasm compatible model.\nReason: ${e}`,
);
}
try {
this.#tokenizer = await transformers.AutoTokenizer.from_pretrained(
this.#modelName,
);
} catch (e) {
throw new Error(
`error loading tokenizer for ${this.#modelName}. Make sure you are using a wasm compatible model:\nReason: ${e}`,
);
}
this.#initialized = true;
}
ndims(): number {
if (this.#ndims) {
return this.#ndims;
} else {
const config = this.#model!.config;
const ndims = config["hidden_size"];
if (!ndims) {
throw new Error(
"hidden_size not found in model config, you may need to manually specify the embedding dimensions. ",
);
}
return ndims;
}
}
embeddingDataType(): Float {
return new Float32();
}
async computeSourceEmbeddings(data: string[]): Promise<number[][]> {
// this should only happen if the user is trying to use the function directly.
// Anything going through the registry should already be initialized.
if (!this.#initialized) {
return Promise.reject(
new Error(
"something went wrong: embedding function not initialized. Please call init()",
),
);
}
const tokenizer = this.#tokenizer!;
const model = this.#model!;
const inputs = await tokenizer(data, this.#tokenizerOptions);
let tokens = await model.forward(inputs);
tokens = tokens[Object.keys(tokens)[0]];
const [nItems, nTokens] = tokens.dims;
tokens = tensorDiv(tokens.sum(1), nTokens);
// TODO: support other data types
const tokenData = tokens.data;
const stride = this.ndims();
const embeddings = [];
for (let i = 0; i < nItems; i++) {
const start = i * stride;
const end = start + stride;
const slice = tokenData.slice(start, end);
embeddings.push(Array.from(slice) as number[]); // TODO: Avoid copy here
}
return embeddings;
}
async computeQueryEmbeddings(data: string): Promise<number[]> {
return (await this.computeSourceEmbeddings([data]))[0];
}
}
const tensorDiv = (
src: import("@xenova/transformers").Tensor,
divBy: number,
) => {
for (let i = 0; i < src.data.length; ++i) {
src.data[i] /= divBy;
}
return src;
};

View File

@@ -493,7 +493,7 @@ export class LocalTable extends Table {
const mode = options?.mode ?? "append";
const schema = await this.schema();
const registry = getRegistry();
const functions = registry.parseFunctions(schema.metadata);
const functions = await registry.parseFunctions(schema.metadata);
const buffer = await fromDataToBuffer(
data,

762
nodejs/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -32,12 +32,13 @@
},
"license": "Apache 2.0",
"devDependencies": {
"@aws-sdk/client-dynamodb": "^3.33.0",
"@aws-sdk/client-kms": "^3.33.0",
"@aws-sdk/client-s3": "^3.33.0",
"@aws-sdk/client-dynamodb": "^3.33.0",
"@biomejs/biome": "^1.7.3",
"@jest/globals": "^29.7.0",
"@napi-rs/cli": "^2.18.3",
"@types/axios": "^0.14.0",
"@types/jest": "^29.1.2",
"@types/tmp": "^0.2.6",
"apache-arrow-13": "npm:apache-arrow@13.0.0",
@@ -53,8 +54,7 @@
"typedoc": "^0.26.4",
"typedoc-plugin-markdown": "^4.2.1",
"typescript": "^5.3.3",
"typescript-eslint": "^7.1.0",
"@types/axios": "^0.14.0"
"typescript-eslint": "^7.1.0"
},
"ava": {
"timeout": "3m"
@@ -85,6 +85,7 @@
"reflect-metadata": "^0.2.2"
},
"optionalDependencies": {
"@xenova/transformers": ">=2.17 < 3",
"openai": "^4.29.2"
},
"peerDependencies": {