Files
lancedb/python/pyproject.toml
Magnus 4f07fea6df feat: add ColPali embedding support with MultiVector type (#2170)
This PR adds ColPali support with ColPaliEmbeddings class (tagged
"colpali") using ColQwen2.5 for multi-vector text/image embeddings. Also
added MultiVector Pydantic type to handle the vector lists.

I've added some integration tests for the embedding model and some unit
tests for the new Pydantic type. This could serve as a template for other
ColPali variants as well, at least until transformers🤗 starts supporting it.


Still `TODO`:

- [ ] Documentation
- [ ] Add an example

_Could also allow Image as query, but didn't work well when testing it._

[ColPali-Engine](https://github.com/illuin-tech/colpali) version:
0.3.9.dev17+g3faee24

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
  - Introduced support for ColPali-based multimodal multi-vector
    embeddings for both text and images.
  - Added a new embedding class for generating multi-vector embeddings,
    configurable for various model and processing options.
  - Added a new Pydantic type for multi-vector embeddings, supporting
    validation and schema generation for lists of fixed-dimension vectors.

- **Bug Fixes**
  - Ensured proper asynchronous index creation in query tests for improved
    reliability.

- **Tests**
  - Added integration tests for ColPali embeddings, including
    text-to-image search and validation of multi-vector fields.
  - Added comprehensive tests for the new multi-vector Pydantic type,
    covering schema, validation, and default value behavior.

- **Chores**
  - Updated optional dependencies to include the ColPali engine.
  - Added utility to check for availability of flash attention support.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-04-21 11:47:37 +08:00

137 lines
3.3 KiB
TOML

# Core package metadata.
[project]
name = "lancedb"
# The version is not written here: it is declared dynamic and lives in
# Cargo.toml.
dynamic = ["version"]
# Requirements of the base package (extras are declared separately under
# project.optional-dependencies).
dependencies = [
    "deprecation",
    "numpy",
    "overrides>=0.7",
    "packaging",
    "pyarrow>=14",
    "pydantic>=1.10",
    "tqdm>=4.27.0",
]
description = "lancedb"
authors = [
    { name = "LanceDB Devs", email = "dev@lancedb.com" },
]
license = { file = "LICENSE" }
readme = "README.md"
requires-python = ">=3.9"
keywords = [
    "data-format",
    "data-science",
    "machine-learning",
    "arrow",
    "data-analytics",
]
# Trove classifiers; the per-version entries start at the same minimum
# version as requires-python above.
classifiers = [
    "Development Status :: 3 - Alpha",
    "Environment :: Console",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering",
]
# Project links published as part of the package metadata.
[project.urls]
repository = "https://github.com/lancedb/lancedb"
# Optional extras; each key below is installable as `lancedb[<key>]`.
[project.optional-dependencies]
pylance = ["pylance>=0.25"]
# Packages needed to run the test suite (pytest plus its plugins and the
# libraries the tests import).
tests = [
    "aiohttp",
    "boto3",
    "pandas>=1.4",
    "pytest",
    "pytest-mock",
    "pytest-asyncio",
    "duckdb",
    "pytz",
    "polars>=0.19, <=1.3.0",
    "tantivy",
    "pyarrow-stubs",
    "pylance>=0.25",
    "requests",
]
# Developer tooling: linter, git hooks and type checker.
dev = [
    "ruff",
    "pre-commit",
    "pyright",
    # Environment marker: only installed on interpreters older than 3.11.
    'typing-extensions>=4.0.0; python_version < "3.11"',
]
# Documentation site toolchain.
docs = [
    "mkdocs",
    "mkdocs-jupyter",
    "mkdocs-material",
    "mkdocstrings[python]",
]
clip = ["torch", "pillow", "open-clip-torch"]
# Client libraries for the optional embedding integrations. This list also
# contains every package from the `clip` extra above.
embeddings = [
    "requests>=2.31.0",
    "openai>=1.6.1",
    "sentence-transformers",
    "torch",
    "pillow",
    "open-clip-torch",
    "cohere",
    "colpali-engine>=0.3.10",
    "huggingface_hub",
    "InstructorEmbedding",
    "google.generativeai",
    "boto3>=1.28.57",
    "awscli>=1.29.57",
    "botocore>=1.31.57",
    "ollama",
    "ibm-watsonx-ai>=1.1.2",
]
azure = ["adlfs>=2024.2.0"]
# Maturin build settings.
[tool.maturin]
# Directory that holds the Python package sources.
python-source = "python"
# Dotted import path under which the compiled module is installed.
module-name = "lancedb._lancedb"
# Build backend declaration: the package is built with maturin.
[build-system]
# Packages that must be installed before the backend can run.
requires = ["maturin>=1.4"]
build-backend = "maturin"
# Ruff lint rule families that are enabled.
[tool.ruff.lint]
select = [
    "F",    # Pyflakes
    "E",    # pycodestyle errors
    "W",    # pycodestyle warnings
    "G",    # flake8-logging-format
    "PERF", # Perflint
]
# pytest configuration.
[tool.pytest.ini_options]
# --strict-markers: a marker that is not registered under `markers` below is
#   an error.
# --ignore-glob: paths matching lancedb/embeddings/*.py are skipped during
#   collection.
addopts = "--strict-markers --ignore-glob=lancedb/embeddings/*.py"
# Registered markers; with --strict-markers only these may be used.
markers = [
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
    "asyncio",
    "s3_test",
]
# Pyright type-checking configuration.
[tool.pyright]
# Explicit list of files to type-check; it is maintained file by file rather
# than by directory.
include = [
    "python/lancedb/index.py",
    "python/lancedb/rerankers/util.py",
    "python/lancedb/rerankers/__init__.py",
    "python/lancedb/rerankers/voyageai.py",
    "python/lancedb/rerankers/jinaai.py",
    "python/lancedb/rerankers/openai.py",
    "python/lancedb/rerankers/cross_encoder.py",
    "python/lancedb/rerankers/colbert.py",
    "python/lancedb/rerankers/answerdotai.py",
    "python/lancedb/rerankers/cohere.py",
    "python/lancedb/arrow.py",
    "python/lancedb/__init__.py",
    "python/lancedb/types.py",
    "python/lancedb/integrations/__init__.py",
    "python/lancedb/exceptions.py",
    "python/lancedb/background_loop.py",
    "python/lancedb/schema.py",
    "python/lancedb/remote/__init__.py",
    "python/lancedb/remote/errors.py",
    "python/lancedb/embeddings/__init__.py",
    "python/lancedb/_lancedb.pyi",
]
# The test tree is kept out of type checking.
exclude = [
    "python/tests/",
]
# Python version pyright assumes when analysing the code.
pythonVersion = "3.12"