mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-25 22:29:58 +00:00
This PR adds ColPali support with a ColPaliEmbeddings class (tagged "colpali") that uses ColQwen2.5 for multi-vector text/image embeddings. It also adds a MultiVector Pydantic type to handle the vector lists. I've added some integration tests for the embedding model and some unit tests for the new Pydantic type. This could serve as a template for other ColPali variants as well, at least until transformers🤗 starts supporting it. Still `TODO`: - [ ] Documentation - [ ] Add an example _Could also allow an Image as the query, but it didn't work well when testing it._ [ColPali-Engine](https://github.com/illuin-tech/colpali) version: 0.3.9.dev17+g3faee24 <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **New Features** - Introduced support for ColPali-based multimodal multi-vector embeddings for both text and images. - Added a new embedding class for generating multi-vector embeddings, configurable for various model and processing options. - Added a new Pydantic type for multi-vector embeddings, supporting validation and schema generation for lists of fixed-dimension vectors. - **Bug Fixes** - Ensured proper asynchronous index creation in query tests for improved reliability. - **Tests** - Added integration tests for ColPali embeddings, including text-to-image search and validation of multi-vector fields. - Added comprehensive tests for the new multi-vector Pydantic type, covering schema, validation, and default value behavior. - **Chores** - Updated optional dependencies to include the ColPali engine. - Added utility to check for availability of flash attention support. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
137 lines
3.3 KiB
TOML
137 lines
3.3 KiB
TOML
# Core package metadata for the `lancedb` distribution.
[project]
name = "lancedb"
# version in Cargo.toml
dynamic = ["version"]
# Requirements installed with the base package (no extras selected).
dependencies = [
    "deprecation",
    "numpy",
    "overrides>=0.7",
    "packaging",
    "pyarrow>=14",
    "pydantic>=1.10",
    "tqdm>=4.27.0",
]
description = "lancedb"
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
license = { file = "LICENSE" }
readme = "README.md"
requires-python = ">=3.9"
keywords = [
    "data-format",
    "data-science",
    "machine-learning",
    "arrow",
    "data-analytics",
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Environment :: Console",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering",
]

# Project links published alongside the package metadata.
[project.urls]
repository = "https://github.com/lancedb/lancedb"

# Optional extras; each key is selectable as `lancedb[<extra>]`.
[project.optional-dependencies]
pylance = [
    "pylance>=0.25",
]
# Packages needed to run the test suite.
tests = [
    "aiohttp",
    "boto3",
    "pandas>=1.4",
    "pytest",
    "pytest-mock",
    "pytest-asyncio",
    "duckdb",
    "pytz",
    "polars>=0.19, <=1.3.0",
    "tantivy",
    "pyarrow-stubs",
    "pylance>=0.25",
    "requests",
]
# Linting, git hooks and static type checking for contributors.
dev = [
    "ruff",
    "pre-commit",
    "pyright",
    # Environment marker: only installed on interpreters older than 3.11.
    'typing-extensions>=4.0.0; python_version < "3.11"',
]
docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
clip = ["torch", "pillow", "open-clip-torch"]
# Third-party clients and model libraries used by the embedding integrations.
embeddings = [
    "requests>=2.31.0",
    "openai>=1.6.1",
    "sentence-transformers",
    "torch",
    "pillow",
    "open-clip-torch",
    "cohere",
    "colpali-engine>=0.3.10",
    "huggingface_hub",
    "InstructorEmbedding",
    # Name normalization treats `.`, `-` and `_` alike, so this resolves to
    # the same project as `google-generativeai`.
    "google.generativeai",
    "boto3>=1.28.57",
    "awscli>=1.29.57",
    "botocore>=1.31.57",
    "ollama",
    "ibm-watsonx-ai>=1.1.2",
]
azure = ["adlfs>=2024.2.0"]

# Maturin settings for the mixed Rust/Python layout.
[tool.maturin]
# Directory that holds the pure-Python package sources.
python-source = "python"
# Import path under which the compiled extension module is installed.
module-name = "lancedb._lancedb"

# Build backend used by pip and other PEP 517 front ends.
[build-system]
requires = ["maturin>=1.4"]
build-backend = "maturin"

# Ruff lint rule families enabled for this project.
[tool.ruff.lint]
select = ["F", "E", "W", "G", "PERF"]

# Pytest defaults applied to every run.
[tool.pytest.ini_options]
# `--strict-markers` turns unregistered markers into errors, so every marker
# used by the tests must appear in `markers` below. Files matching the
# `--ignore-glob` pattern are skipped during collection.
addopts = "--strict-markers --ignore-glob=lancedb/embeddings/*.py"
markers = [
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
    "asyncio",
    "s3_test",
]

# Pyright configuration: an explicit list of paths to analyse.
[tool.pyright]
include = [
    "python/lancedb/index.py",
    "python/lancedb/rerankers/util.py",
    "python/lancedb/rerankers/__init__.py",
    "python/lancedb/rerankers/voyageai.py",
    "python/lancedb/rerankers/jinaai.py",
    "python/lancedb/rerankers/openai.py",
    "python/lancedb/rerankers/cross_encoder.py",
    "python/lancedb/rerankers/colbert.py",
    "python/lancedb/rerankers/answerdotai.py",
    "python/lancedb/rerankers/cohere.py",
    "python/lancedb/arrow.py",
    "python/lancedb/__init__.py",
    "python/lancedb/types.py",
    "python/lancedb/integrations/__init__.py",
    "python/lancedb/exceptions.py",
    "python/lancedb/background_loop.py",
    "python/lancedb/schema.py",
    "python/lancedb/remote/__init__.py",
    "python/lancedb/remote/errors.py",
    "python/lancedb/embeddings/__init__.py",
    "python/lancedb/_lancedb.pyi",
]
exclude = ["python/tests/"]
# NOTE(review): analysis targets 3.12 while `requires-python` allows 3.9+;
# confirm this is intentional.
pythonVersion = "3.12"