diff --git a/.github/workflows/docs_test.yml b/.github/workflows/docs_test.yml deleted file mode 100644 index 934e92c7..00000000 --- a/.github/workflows/docs_test.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: Documentation Code Testing - -on: - push: - branches: - - main - paths: - - docs/** - - .github/workflows/docs_test.yml - pull_request: - paths: - - docs/** - - .github/workflows/docs_test.yml - - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - -env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1 -C target-cpu=haswell -C target-feature=+f16c,+avx2,+fma" - RUST_BACKTRACE: "1" - -jobs: - test-python: - name: Test doc python code - runs-on: warp-ubuntu-2204-x64-8x - timeout-minutes: 60 - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Print CPU capabilities - run: cat /proc/cpuinfo - - name: Install protobuf - run: | - sudo apt update - sudo apt install -y protobuf-compiler - - name: Install dependecies needed for ubuntu - run: | - sudo apt install -y libssl-dev - rustup update && rustup default - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: 3.11 - cache: "pip" - cache-dependency-path: "docs/test/requirements.txt" - - name: Rust cache - uses: swatinem/rust-cache@v2 - - name: Build Python - working-directory: docs/test - run: - python -m pip install --extra-index-url https://pypi.fury.io/lancedb/ -r requirements.txt - - name: Create test files - run: | - cd docs/test - python md_testing.py - - name: Test - run: | - cd docs/test/python - for d in *; do cd "$d"; echo "$d".py; python "$d".py; cd ..; done diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 1cf31d33..a801520b 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -41,7 +41,6 @@ theme: icon: repo: fontawesome/brands/github annotation: material/arrow-right-circle - custom_dir: overrides 
plugins: - search @@ -49,7 +48,9 @@ plugins: - mkdocstrings: handlers: python: - paths: [../python] + # Ensure the handler points to the real package root + # so it reads local sources at python/python/lancedb + paths: [../python/python] options: docstring_style: numpy heading_level: 3 @@ -65,26 +66,24 @@ plugins: # for cross references - https://arrow.apache.org/docs/objects.inv - https://pandas.pydata.org/docs/objects.inv - - https://lancedb.github.io/lance/objects.inv - https://docs.pydantic.dev/latest/objects.inv - - mkdocs-jupyter - render_swagger: allow_arbitrary_locations: true - - redirects: - redirect_maps: - # Redirect the home page and other top-level markdown files. This enables maximum SEO benefit - # other sub-pages are handled by the ingected js in overrides/partials/header.html - 'index.md': 'https://lancedb.com/docs/' - 'guides/tables.md': 'https://lancedb.com/docs/tables/' - 'ann_indexes.md': 'https://lancedb.com/docs/indexing/' - 'basic.md': 'https://lancedb.com/docs/quickstart/' - 'faq.md': 'https://lancedb.com/docs/faq/' - 'embeddings/understanding_embeddings.md': 'https://lancedb.com/docs/embedding/' - 'integrations.md': 'https://lancedb.com/docs/integrations/' - 'examples.md': 'https://lancedb.com/docs/tutorials/' - 'concepts/vector_search.md': 'https://lancedb.com/docs/search/vector-search/' - 'troubleshooting.md': 'https://lancedb.com/docs/troubleshooting/' - 'guides/storage.md': 'https://lancedb.com/docs/storage/integrations' +# - redirects: +# redirect_maps: +# # Redirect the home page and other top-level markdown files. 
This enables maximum SEO benefit +# # other sub-pages are handled by the injected js in overrides/partials/header.html +# 'index.md': 'https://lancedb.com/docs/' +# 'guides/tables.md': 'https://lancedb.com/docs/tables/' +# 'ann_indexes.md': 'https://lancedb.com/docs/indexing/' +# 'basic.md': 'https://lancedb.com/docs/quickstart/' +# 'faq.md': 'https://lancedb.com/docs/faq/' +# 'embeddings/understanding_embeddings.md': 'https://lancedb.com/docs/embedding/' +# 'integrations.md': 'https://lancedb.com/docs/integrations/' +# 'examples.md': 'https://lancedb.com/docs/tutorials/' +# 'concepts/vector_search.md': 'https://lancedb.com/docs/search/vector-search/' +# 'troubleshooting.md': 'https://lancedb.com/docs/troubleshooting/' +# 'guides/storage.md': 'https://lancedb.com/docs/storage/integrations' @@ -120,269 +119,10 @@ markdown_extensions: permalink: "" nav: - - Home: - - LanceDB: index.md - - 🏃🏼‍♂️ Quick start: basic.md - - 📚 Concepts: - - Vector search: concepts/vector_search.md - - Indexing: - - IVFPQ: concepts/index_ivfpq.md - - HNSW: concepts/index_hnsw.md - - Storage: concepts/storage.md - - Data management: concepts/data_management.md - - 🔨 Guides: - - Working with tables: guides/tables.md - - Building a vector index: ann_indexes.md - - Vector Search: search.md - - Full-text search (native): fts.md - - Full-text search (tantivy-based): fts_tantivy.md - - Building a scalar index: guides/scalar_index.md - - Hybrid search: - - Overview: hybrid_search/hybrid_search.md - - Comparing Rerankers: hybrid_search/eval.md - - Airbnb financial data example: notebooks/hybrid_search.ipynb - - Late interaction with MultiVector search: - - Overview: guides/multi-vector.md - - Example: notebooks/Multivector_on_LanceDB.ipynb - - RAG: - - Vanilla RAG: rag/vanilla_rag.md - - Multi-head RAG: rag/multi_head_rag.md - - Corrective RAG: rag/corrective_rag.md - - Agentic RAG: rag/agentic_rag.md - - Graph RAG: rag/graph_rag.md - - Self RAG: rag/self_rag.md - - Adaptive RAG: 
rag/adaptive_rag.md - - SFR RAG: rag/sfr_rag.md - - Advanced Techniques: - - HyDE: rag/advanced_techniques/hyde.md - - FLARE: rag/advanced_techniques/flare.md - - Reranking: - - Quickstart: reranking/index.md - - Cohere Reranker: reranking/cohere.md - - Linear Combination Reranker: reranking/linear_combination.md - - Reciprocal Rank Fusion Reranker: reranking/rrf.md - - Cross Encoder Reranker: reranking/cross_encoder.md - - ColBERT Reranker: reranking/colbert.md - - Jina Reranker: reranking/jina.md - - OpenAI Reranker: reranking/openai.md - - AnswerDotAi Rerankers: reranking/answerdotai.md - - Voyage AI Rerankers: reranking/voyageai.md - - Building Custom Rerankers: reranking/custom_reranker.md - - Example: notebooks/lancedb_reranking.ipynb - - Filtering: sql.md - - Versioning & Reproducibility: - - sync API: notebooks/reproducibility.ipynb - - async API: notebooks/reproducibility_async.ipynb - - Configuring Storage: guides/storage.md - - Migration Guide: migration.md - - Tuning retrieval performance: - - Choosing right query type: guides/tuning_retrievers/1_query_types.md - - Reranking: guides/tuning_retrievers/2_reranking.md - - Embedding fine-tuning: guides/tuning_retrievers/3_embed_tuning.md - - 🧬 Managing embeddings: - - Understand Embeddings: embeddings/understanding_embeddings.md - - Get Started: embeddings/index.md - - Embedding functions: embeddings/embedding_functions.md - - Available models: - - Overview: embeddings/default_embedding_functions.md - - Text Embedding Functions: - - Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md - - Huggingface Embedding Models: embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md - - Ollama Embeddings: embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md - - OpenAI Embeddings: embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md - - Instructor Embeddings: 
embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md - - Gemini Embeddings: embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md - - Cohere Embeddings: embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md - - Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md - - AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md - - IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md - - Voyage AI Embeddings: embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md - - Multimodal Embedding Functions: - - OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md - - Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md - - Jina Embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md - - User-defined embedding functions: embeddings/custom_embedding_function.md - - Variables and secrets: embeddings/variables_and_secrets.md - - "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb - - "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb - - 🔌 Integrations: - - Tools and data formats: integrations/index.md - - Pandas and PyArrow: python/pandas_and_pyarrow.md - - Polars: python/polars_arrow.md - - DuckDB: python/duckdb.md - - Datafusion: python/datafusion.md - - LangChain: - - LangChain 🔗: integrations/langchain.md - - LangChain demo: notebooks/langchain_demo.ipynb - - LangChain JS/TS 🔗: https://js.langchain.com/docs/integrations/vectorstores/lancedb - - LlamaIndex 🦙: - - LlamaIndex docs: integrations/llamaIndex.md - - LlamaIndex demo: 
notebooks/llamaIndex_demo.ipynb - - Pydantic: python/pydantic.md - - Voxel51: integrations/voxel51.md - - PromptTools: integrations/prompttools.md - - dlt: integrations/dlt.md - - phidata: integrations/phidata.md - - Genkit: integrations/genkit.md - - 🎯 Examples: - - Overview: examples/index.md - - 🐍 Python: - - Overview: examples/examples_python.md - - Build From Scratch: examples/python_examples/build_from_scratch.md - - Multimodal: examples/python_examples/multimodal.md - - Rag: examples/python_examples/rag.md - - Vector Search: examples/python_examples/vector_search.md - - Chatbot: examples/python_examples/chatbot.md - - Evaluation: examples/python_examples/evaluations.md - - AI Agent: examples/python_examples/aiagent.md - - Recommender System: examples/python_examples/recommendersystem.md - - Miscellaneous: - - Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md - - Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md - - 👾 JavaScript: - - Overview: examples/examples_js.md - - Serverless Website Chatbot: examples/serverless_website_chatbot.md - - YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md - - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md - - 🦀 Rust: - - Overview: examples/examples_rust.md - - 📓 Studies: - - ↗Improve retrievers with hybrid search and reranking: https://blog.lancedb.com/hybrid-search-and-reranking-report/ - - 💭 FAQs: faq.md - - 🔍 Troubleshooting: troubleshooting.md - - ⚙️ API reference: - - 🐍 Python: python/python.md - - 👾 JavaScript (vectordb): javascript/modules.md - - 👾 JavaScript (lancedb): js/globals.md - - 🦀 Rust: https://docs.rs/lancedb/latest/lancedb/ - - - Quick start: basic.md - - Concepts: - - Vector search: concepts/vector_search.md - - Indexing: - - IVFPQ: concepts/index_ivfpq.md - - HNSW: concepts/index_hnsw.md - - Storage: concepts/storage.md - - Data management: concepts/data_management.md - - 
Guides: - - Working with tables: guides/tables.md - - Working with SQL: guides/sql_querying.md - - Building an ANN index: ann_indexes.md - - Vector Search: search.md - - Full-text search (native): fts.md - - Full-text search (tantivy-based): fts_tantivy.md - - Building a scalar index: guides/scalar_index.md - - Hybrid search: - - Overview: hybrid_search/hybrid_search.md - - Comparing Rerankers: hybrid_search/eval.md - - Airbnb financial data example: notebooks/hybrid_search.ipynb - - Late interaction with MultiVector search: - - Overview: guides/multi-vector.md - - Document search Example: notebooks/Multivector_on_LanceDB.ipynb - - RAG: - - Vanilla RAG: rag/vanilla_rag.md - - Multi-head RAG: rag/multi_head_rag.md - - Corrective RAG: rag/corrective_rag.md - - Agentic RAG: rag/agentic_rag.md - - Graph RAG: rag/graph_rag.md - - Self RAG: rag/self_rag.md - - Adaptive RAG: rag/adaptive_rag.md - - SFR RAG: rag/sfr_rag.md - - Advanced Techniques: - - HyDE: rag/advanced_techniques/hyde.md - - FLARE: rag/advanced_techniques/flare.md - - Reranking: - - Quickstart: reranking/index.md - - Cohere Reranker: reranking/cohere.md - - Linear Combination Reranker: reranking/linear_combination.md - - Reciprocal Rank Fusion Reranker: reranking/rrf.md - - Cross Encoder Reranker: reranking/cross_encoder.md - - ColBERT Reranker: reranking/colbert.md - - Jina Reranker: reranking/jina.md - - OpenAI Reranker: reranking/openai.md - - AnswerDotAi Rerankers: reranking/answerdotai.md - - Building Custom Rerankers: reranking/custom_reranker.md - - Example: notebooks/lancedb_reranking.ipynb - - Filtering: sql.md - - Versioning & Reproducibility: - - sync API: notebooks/reproducibility.ipynb - - async API: notebooks/reproducibility_async.ipynb - - Configuring Storage: guides/storage.md - - Migration Guide: migration.md - - Tuning retrieval performance: - - Choosing right query type: guides/tuning_retrievers/1_query_types.md - - Reranking: guides/tuning_retrievers/2_reranking.md - - Embedding 
fine-tuning: guides/tuning_retrievers/3_embed_tuning.md - - Managing Embeddings: - - Understand Embeddings: embeddings/understanding_embeddings.md - - Get Started: embeddings/index.md - - Embedding functions: embeddings/embedding_functions.md - - Available models: - - Overview: embeddings/default_embedding_functions.md - - Text Embedding Functions: - - Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md - - Huggingface Embedding Models: embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md - - Ollama Embeddings: embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md - - OpenAI Embeddings: embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md - - Instructor Embeddings: embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md - - Gemini Embeddings: embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md - - Cohere Embeddings: embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md - - Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md - - AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md - - IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md - - Multimodal Embedding Functions: - - OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md - - Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md - - Jina Embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md - - User-defined embedding functions: embeddings/custom_embedding_function.md - - Variables and secrets: 
embeddings/variables_and_secrets.md - - "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb - - "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb - - Integrations: - - Overview: integrations/index.md - - Pandas and PyArrow: python/pandas_and_pyarrow.md - - Polars: python/polars_arrow.md - - DuckDB: python/duckdb.md - - Datafusion: python/datafusion.md - - LangChain 🦜️🔗↗: integrations/langchain.md - - LangChain.js 🦜️🔗↗: https://js.langchain.com/docs/integrations/vectorstores/lancedb - - LlamaIndex 🦙↗: integrations/llamaIndex.md - - Pydantic: python/pydantic.md - - Voxel51: integrations/voxel51.md - - PromptTools: integrations/prompttools.md - - dlt: integrations/dlt.md - - phidata: integrations/phidata.md - - Genkit: integrations/genkit.md - - Examples: - - examples/index.md - - 🐍 Python: - - Overview: examples/examples_python.md - - Build From Scratch: examples/python_examples/build_from_scratch.md - - Multimodal: examples/python_examples/multimodal.md - - Rag: examples/python_examples/rag.md - - Vector Search: examples/python_examples/vector_search.md - - Chatbot: examples/python_examples/chatbot.md - - Evaluation: examples/python_examples/evaluations.md - - AI Agent: examples/python_examples/aiagent.md - - Recommender System: examples/python_examples/recommendersystem.md - - Miscellaneous: - - Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md - - Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md - - 👾 JavaScript: - - Overview: examples/examples_js.md - - Serverless Website Chatbot: examples/serverless_website_chatbot.md - - YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md - - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md - - 🦀 Rust: - - Overview: examples/examples_rust.md - - Studies: - - studies/overview.md - - ↗Improve retrievers with hybrid search and reranking: 
https://blog.lancedb.com/hybrid-search-and-reranking-report/ - API reference: - - Overview: api_reference.md + - Overview: index.md - Python: python/python.md - - Javascript (vectordb): javascript/modules.md - - Javascript (lancedb): js/globals.md + - Javascript/TypeScript: js/globals.md - Rust: https://docs.rs/lancedb/latest/lancedb/index.html extra_css: @@ -390,7 +130,6 @@ extra_css: - styles/extra.css extra_javascript: - - "extra_js/init_ask_ai_widget.js" - "extra_js/reo.js" extra: diff --git a/docs/overrides/partials/header.html b/docs/overrides/partials/header.html deleted file mode 100644 index e82311bc..00000000 --- a/docs/overrides/partials/header.html +++ /dev/null @@ -1,255 +0,0 @@ - -
-

- This documentation site is deprecated. - Please visit our new documentation site at - lancedb.com/docs for the latest information. -

-
-{% set class = "md-header" %} -{% if "navigation.tabs.sticky" in features %} - {% set class = class ~ " md-header--shadow md-header--lifted" %} -{% elif "navigation.tabs" not in features %} - {% set class = class ~ " md-header--shadow" %} -{% endif %} - - -
- - - - {% if "navigation.tabs.sticky" in features %} - {% if "navigation.tabs" in features %} - {% include "partials/tabs.html" %} - {% endif %} - {% endif %} -
- - \ No newline at end of file diff --git a/docs/overrides/partials/main.html b/docs/overrides/partials/main.html deleted file mode 100644 index e3f98827..00000000 --- a/docs/overrides/partials/main.html +++ /dev/null @@ -1,5 +0,0 @@ -{% extends "base.html" %} - -{% block announce %} - 📚 Starting June 1st, 2025, please use lancedb.github.io/documentation for the latest docs. -{% endblock %} \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index 8179ec1d..9011daff 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,8 +1,9 @@ mkdocs==1.5.3 mkdocs-jupyter==0.24.1 mkdocs-material==9.5.3 +mkdocs-autorefs<=1.0 mkdocstrings[python]==0.25.2 griffe mkdocs-render-swagger-plugin pydantic -mkdocs-redirects +mkdocs-redirects \ No newline at end of file diff --git a/docs/src/robots.txt b/docs/robots.txt similarity index 100% rename from docs/src/robots.txt rename to docs/robots.txt diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md deleted file mode 100644 index 4aa2678f..00000000 --- a/docs/src/ann_indexes.md +++ /dev/null @@ -1,307 +0,0 @@ -# Approximate Nearest Neighbor (ANN) Indexes - -An ANN or a vector index is a data structure specifically designed to efficiently organize and -search vector data based on their similarity via the chosen distance metric. -By constructing a vector index, the search space is effectively narrowed down, avoiding the need -for brute-force scanning of the entire vector space. -A vector index is faster but less accurate than exhaustive search (kNN or flat search). -LanceDB provides many parameters to fine-tune the index's size, the speed of queries, and the accuracy of results. - -## Disk-based Index - -Lance provides an `IVF_PQ` disk-based index. It uses **Inverted File Index (IVF)** to first divide -the dataset into `N` partitions, and then applies **Product Quantization** to compress vectors in each partition. 
-See the [indexing](concepts/index_ivfpq.md) concepts guide for more information on how this works. - -## Creating an IVF_PQ Index - -Lance supports `IVF_PQ` index type by default. - -=== "Python" - === "Sync API" - - Creating indexes is done via the [create_index](https://lancedb.github.io/lancedb/python/#lancedb.table.LanceTable.create_index) method. - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_index.py:import-numpy" - --8<-- "python/python/tests/docs/test_guide_index.py:create_ann_index" - ``` - === "Async API" - Creating indexes is done via the [create_index](https://lancedb.github.io/lancedb/python/#lancedb.table.LanceTable.create_index) method. - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_index.py:import-numpy" - --8<-- "python/python/tests/docs/test_guide_index.py:import-lancedb-ivfpq" - --8<-- "python/python/tests/docs/test_guide_index.py:create_ann_index_async" - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - Creating indexes is done via the [lancedb.Table.createIndex](../js/classes/Table.md/#createIndex) method. - - ```typescript - --8<--- "nodejs/examples/ann_indexes.test.ts:import" - - --8<-- "nodejs/examples/ann_indexes.test.ts:ingest" - ``` - - === "vectordb (deprecated)" - - Creating indexes is done via the [lancedb.Table.createIndex](../javascript/interfaces/Table.md/#createIndex) method. - - ```typescript - --8<--- "docs/src/ann_indexes.ts:import" - - --8<-- "docs/src/ann_indexes.ts:ingest" - ``` - -=== "Rust" - - ```rust - --8<-- "rust/lancedb/examples/ivf_pq.rs:create_index" - ``` - - IVF_PQ index parameters are more fully defined in the [crate docs](https://docs.rs/lancedb/latest/lancedb/index/vector/struct.IvfPqIndexBuilder.html). - -The following IVF_PQ paramters can be specified: - -- **distance_type**: The distance metric to use. 
By default it uses euclidean distance "`l2`". - We also support "cosine" and "dot" distance as well. -- **num_partitions**: The number of partitions in the index. The default is the square root - of the number of rows. - -!!! note - - In the synchronous python SDK and node's `vectordb` the default is 256. This default has - changed in the asynchronous python SDK and node's `lancedb`. - -- **num_sub_vectors**: The number of sub-vectors (M) that will be created during Product Quantization (PQ). - For D dimensional vector, it will be divided into `M` subvectors with dimension `D/M`, each of which is replaced by - a single PQ code. The default is the dimension of the vector divided by 16. -- **num_bits**: The number of bits used to encode each sub-vector. Only 4 and 8 are supported. The higher the number of bits, the higher the accuracy of the index, also the slower search. The default is 8. - -!!! note - - In the synchronous python SDK and node's `vectordb` the default is currently 96. This default has - changed in the asynchronous python SDK and node's `lancedb`. - -
- ![IVF PQ](./assets/ivf_pq.png) -
IVF_PQ index with num_partitions=2, num_sub_vectors=4
-
- -### Use GPU to build vector index - -Lance Python SDK has experimental GPU support for creating IVF index. -Using GPU for index creation requires [PyTorch>2.0](https://pytorch.org/) being installed. - -You can specify the GPU device to train IVF partitions via - -- **accelerator**: Specify to `cuda` or `mps` (on Apple Silicon) to enable GPU training. - -=== "Linux" - - - ``` { .python .copy } - # Create index using CUDA on Nvidia GPUs. - tbl.create_index( - num_partitions=256, - num_sub_vectors=96, - accelerator="cuda" - ) - ``` - -=== "MacOS" - - - ```python - # Create index using MPS on Apple Silicon. - tbl.create_index( - num_partitions=256, - num_sub_vectors=96, - accelerator="mps" - ) - ``` -!!! note - GPU based indexing is not yet supported with our asynchronous client. - -Troubleshooting: - -If you see `AssertionError: Torch not compiled with CUDA enabled`, you need to [install -PyTorch with CUDA support](https://pytorch.org/get-started/locally/). - -## Querying an ANN Index - -Querying vector indexes is done via the [search](https://lancedb.github.io/lancedb/python/#lancedb.table.LanceTable.search) function. - -There are a couple of parameters that can be used to fine-tune the search: - -- **limit** (default: 10): The amount of results that will be returned -- **nprobes** (default: 20): The number of probes used. A higher number makes search more accurate but also slower.
- Most of the time, setting nprobes to cover 5-15% of the dataset should achieve high recall with low latency.
- - _For example_, For a dataset of 1 million vectors divided into 256 partitions, `nprobes` should be set to ~20-40. This value can be adjusted to achieve the optimal balance between search latency and search quality.
- -- **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory.
- A higher number makes search more accurate but also slower. If you find the recall is less than ideal, try refine_factor=10 to start.
- - _For example_, For a dataset of 1 million vectors divided into 256 partitions, setting the `refine_factor` to 200 will initially retrieve the top 4,000 candidates (top k * refine_factor) from all searched partitions. These candidates are then reranked to determine the final top 20 results.
-!!! note - Both `nprobes` and `refine_factor` are only applicable if an ANN index is present. If specified on a table without an ANN index, those parameters are ignored. - - -=== "Python" - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:vector_search" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:vector_search_async" - ``` - - ```text - vector item _distance - 0 [0.44949695, 0.8444449, 0.06281311, 0.23338133... item 1141 103.575333 - 1 [0.48587373, 0.269207, 0.15095535, 0.65531915,... item 3953 108.393867 - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/ann_indexes.test.ts:search1" - ``` - - === "vectordb (deprecated)" - - ```typescript - --8<-- "docs/src/ann_indexes.ts:search1" - ``` - -=== "Rust" - - ```rust - --8<-- "rust/lancedb/examples/ivf_pq.rs:search1" - ``` - - Vector search options are more fully defined in the [crate docs](https://docs.rs/lancedb/latest/lancedb/query/struct.Query.html#method.nearest_to). - -The search will return the data requested in addition to the distance of each item. - -### Filtering (where clause) - -You can further filter the elements returned by a search using a where clause. - -=== "Python" - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:vector_search_with_filter" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:vector_search_async_with_filter" - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/ann_indexes.test.ts:search2" - ``` - - === "vectordb (deprecated)" - - ```javascript - --8<-- "docs/src/ann_indexes.ts:search2" - ``` - -### Projections (select clause) - -You can select the columns returned by the query using a select clause. 
- -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:vector_search_with_select" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:vector_search_async_with_select" - ``` - - ```text - vector _distance - 0 [0.30928212, 0.022668175, 0.1756372, 0.4911822... 93.971092 - 1 [0.2525465, 0.01723831, 0.261568, 0.002007689,... 95.173485 - ... - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/ann_indexes.test.ts:search3" - ``` - - === "vectordb (deprecated)" - - ```typescript - --8<-- "docs/src/ann_indexes.ts:search3" - ``` - -## FAQ - -### Why do I need to manually create an index? - -Currently, LanceDB does _not_ automatically create the ANN index. -LanceDB is well-optimized for kNN (exhaustive search) via a disk-based index. For many use-cases, -datasets of the order of ~100K vectors don't require index creation. If you can live with up to -100ms latency, skipping index creation is a simpler workflow while guaranteeing 100% recall. - -### When is it necessary to create an ANN vector index? - -`LanceDB` comes out-of-the-box with highly optimized SIMD code for computing vector similarity. -In our benchmarks, computing distances for 100K pairs of 1K dimension vectors takes **less than 20ms**. -We observe that for small datasets (~100K rows) or for applications that can accept 100ms latency, -vector indices are usually not necessary. - -For large-scale or higher dimension vectors, it can beneficial to create vector index for performance. - -### How big is my index, and how many memory will it take? - -In LanceDB, all vector indices are **disk-based**, meaning that when responding to a vector query, only the relevant pages from the index file are loaded from disk and cached in memory. Additionally, each sub-vector is usually encoded into 1 byte PQ code. 
- -For example, with a 1024-dimension dataset, if we choose `num_sub_vectors=64`, each sub-vector has `1024 / 64 = 16` float32 numbers. -Product quantization can lead to approximately `16 * sizeof(float32) / 1 = 64` times of space reduction. - -### How to choose `num_partitions` and `num_sub_vectors` for `IVF_PQ` index? - -`num_partitions` is used to decide how many partitions the first level `IVF` index uses. -Higher number of partitions could lead to more efficient I/O during queries and better accuracy, but it takes much more time to train. -On `SIFT-1M` dataset, our benchmark shows that keeping each partition 4K-8K rows lead to a good latency / recall. - -`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. The number should be a factor of the vector dimension. Because -PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in -less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency. - -!!! note - if `num_sub_vectors` is set to be greater than the vector dimension, you will see errors like `attempt to divide by zero` - -### How to choose `m` and `ef_construction` for `IVF_HNSW_*` index? - -`m` determines the number of connections a new node establishes with its closest neighbors upon entering the graph. Typically, `m` falls within the range of 5 to 48. Lower `m` values are suitable for low-dimensional data or scenarios where recall is less critical. Conversely, higher `m` values are beneficial for high-dimensional data or when high recall is required. In essence, a larger `m` results in a denser graph with increased connectivity, but at the expense of higher memory consumption. - -`ef_construction` balances build speed and accuracy. 
Higher values increase accuracy but slow down the build process. A typical range is 150 to 300. For good search results, a minimum value of 100 is recommended. In most cases, setting this value above 500 offers no additional benefit. Ensure that `ef_construction` is always set to a value equal to or greater than `ef` in the search phase diff --git a/docs/src/ann_indexes.ts b/docs/src/ann_indexes.ts deleted file mode 100644 index 0276daad..00000000 --- a/docs/src/ann_indexes.ts +++ /dev/null @@ -1,54 +0,0 @@ -// --8<-- [start:import] -import * as vectordb from "vectordb"; -// --8<-- [end:import] - -(async () => { - console.log("ann_indexes.ts: start"); - // --8<-- [start:ingest] - const db = await vectordb.connect("data/sample-lancedb"); - - let data = []; - for (let i = 0; i < 10_000; i++) { - data.push({ - vector: Array(1536).fill(i), - id: `${i}`, - content: "", - longId: `${i}`, - }); - } - const table = await db.createTable("my_vectors", data); - await table.createIndex({ - type: "ivf_pq", - column: "vector", - num_partitions: 16, - num_sub_vectors: 48, - }); - // --8<-- [end:ingest] - - // --8<-- [start:search1] - const results_1 = await table - .search(Array(1536).fill(1.2)) - .limit(2) - .nprobes(20) - .refineFactor(10) - .execute(); - // --8<-- [end:search1] - - // --8<-- [start:search2] - const results_2 = await table - .search(Array(1536).fill(1.2)) - .where("id != '1141'") - .limit(2) - .execute(); - // --8<-- [end:search2] - - // --8<-- [start:search3] - const results_3 = await table - .search(Array(1536).fill(1.2)) - .select(["id"]) - .limit(2) - .execute(); - // --8<-- [end:search3] - - console.log("ann_indexes.ts: done"); -})(); diff --git a/docs/src/api_reference.md b/docs/src/api_reference.md deleted file mode 100644 index 0eb22a83..00000000 --- a/docs/src/api_reference.md +++ /dev/null @@ -1,8 +0,0 @@ -# API Reference - -The API reference for the LanceDB client SDKs are available at the following locations: - -- [Python](python/python.md) -- 
[JavaScript (legacy vectordb package)](javascript/modules.md) -- [JavaScript (newer @lancedb/lancedb package)](js/globals.md) -- [Rust](https://docs.rs/lancedb/latest/lancedb/index.html) diff --git a/docs/src/basic.md b/docs/src/basic.md deleted file mode 100644 index 625368ce..00000000 --- a/docs/src/basic.md +++ /dev/null @@ -1,655 +0,0 @@ -# Quick start - -!!! info "LanceDB can be run in a number of ways:" - - * Embedded within an existing backend (like your Django, Flask, Node.js or FastAPI application) - * Directly from a client application like a Jupyter notebook for analytical workloads - * Deployed as a remote serverless database - -![](assets/lancedb_embedded_explanation.png) - -## Installation - -=== "Python" - - ```shell - pip install lancedb - ``` - -=== "Typescript[^1]" - === "@lancedb/lancedb" - - ```shell - npm install @lancedb/lancedb - ``` - !!! note "Bundling `@lancedb/lancedb` apps with Webpack" - - Since LanceDB contains a prebuilt Node binary, you must configure `next.config.js` to exclude it from webpack. This is required for both using Next.js and deploying a LanceDB app on Vercel. - - ```javascript - /** @type {import('next').NextConfig} */ - module.exports = ({ - webpack(config) { - config.externals.push({ '@lancedb/lancedb': '@lancedb/lancedb' }) - return config; - } - }) - ``` - - !!! note "Yarn users" - - Unlike other package managers, Yarn does not automatically resolve peer dependencies. If you are using Yarn, you will need to manually install 'apache-arrow': - - ```shell - yarn add apache-arrow - ``` - - === "vectordb (deprecated)" - - ```shell - npm install vectordb - ``` - !!! note "Bundling `vectordb` apps with Webpack" - - Since LanceDB contains a prebuilt Node binary, you must configure `next.config.js` to exclude it from webpack. This is required for both using Next.js and deploying a LanceDB app on Vercel. 
- - ```javascript - /** @type {import('next').NextConfig} */ - module.exports = ({ - webpack(config) { - config.externals.push({ vectordb: 'vectordb' }) - return config; - } - }) - ``` - - !!! note "Yarn users" - - Unlike other package managers, Yarn does not automatically resolve peer dependencies. If you are using Yarn, you will need to manually install 'apache-arrow': - - ```shell - yarn add apache-arrow - ``` - -=== "Rust" - - ```shell - cargo add lancedb - ``` - - !!! info "To use the lancedb create, you first need to install protobuf." - - === "macOS" - - ```shell - brew install protobuf - ``` - - === "Ubuntu/Debian" - - ```shell - sudo apt install -y protobuf-compiler libssl-dev - ``` - - !!! info "Please also make sure you're using the same version of Arrow as in the [lancedb crate](https://github.com/lancedb/lancedb/blob/main/Cargo.toml)" - -### Preview releases - -Stable releases are created about every 2 weeks. For the latest features and bug -fixes, you can install the preview release. These releases receive the same -level of testing as stable releases, but are not guaranteed to be available for -more than 6 months after they are released. Once your application is stable, we -recommend switching to stable releases. 
- -=== "Python" - - ```shell - pip install --pre --extra-index-url https://pypi.fury.io/lancedb/ lancedb - ``` - -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - ```shell - npm install @lancedb/lancedb@preview - ``` - === "vectordb (deprecated)" - - ```shell - npm install vectordb@preview - ``` - -=== "Rust" - - We don't push preview releases to crates.io, but you can reference the tag - in GitHub within your Cargo dependencies: - - ```toml - [dependencies] - lancedb = { git = "https://github.com/lancedb/lancedb.git", tag = "vX.Y.Z-beta.N" } - ``` - -## Connect to a database - -=== "Python" - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:imports" - - --8<-- "python/python/tests/docs/test_basic.py:set_uri" - --8<-- "python/python/tests/docs/test_basic.py:connect" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:imports" - - --8<-- "python/python/tests/docs/test_basic.py:set_uri" - --8<-- "python/python/tests/docs/test_basic.py:connect_async" - ``` - -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - ```typescript - import * as lancedb from "@lancedb/lancedb"; - import * as arrow from "apache-arrow"; - - --8<-- "nodejs/examples/basic.test.ts:connect" - ``` - - === "vectordb (deprecated)" - - ```typescript - --8<-- "docs/src/basic_legacy.ts:open_db" - ``` - -=== "Rust" - - ```rust - #[tokio::main] - async fn main() -> Result<()> { - --8<-- "rust/lancedb/examples/simple.rs:connect" - } - ``` - - !!! info "See [examples/simple.rs](https://github.com/lancedb/lancedb/tree/main/rust/lancedb/examples/simple.rs) for a full working example." - -LanceDB will create the directory if it doesn't exist (including parent directories). - -If you need a reminder of the uri, you can call `db.uri()`. - -## Create a table - -### Create a table from initial data - -If you have data to insert into the table at creation time, you can simultaneously create a -table and insert the data into it. 
The schema of the data will be used as the schema of the -table. - -=== "Python" - - If the table already exists, LanceDB will raise an error by default. - If you want to overwrite the table, you can pass in `mode="overwrite"` - to the `create_table` method. - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:create_table" - ``` - - You can also pass in a pandas DataFrame directly: - - ```python - --8<-- "python/python/tests/docs/test_basic.py:create_table_pandas" - ``` - - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:create_table_async" - ``` - - You can also pass in a pandas DataFrame directly: - - ```python - --8<-- "python/python/tests/docs/test_basic.py:create_table_async_pandas" - ``` - -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/basic.test.ts:create_table" - ``` - - === "vectordb (deprecated)" - - ```typescript - --8<-- "docs/src/basic_legacy.ts:create_table" - ``` - - If the table already exists, LanceDB will raise an error by default. - If you want to overwrite the table, you can pass in `mode:"overwrite"` - to the `createTable` function. - -=== "Rust" - - ```rust - --8<-- "rust/lancedb/examples/simple.rs:create_table" - ``` - - If the table already exists, LanceDB will raise an error by default. See - [the mode option](https://docs.rs/lancedb/latest/lancedb/connection/struct.CreateTableBuilder.html#method.mode) - for details on how to overwrite (or open) existing tables instead. - - !!! Providing table records in Rust - - The Rust SDK currently expects data to be provided as an Arrow - [RecordBatchReader](https://docs.rs/arrow-array/latest/arrow_array/trait.RecordBatchReader.html) - Support for additional formats (such as serde or polars) is on the roadmap. - -!!! info "Under the hood, LanceDB reads in the Apache Arrow data and persists it to disk using the [Lance format](https://www.github.com/lancedb/lance)." - -!!! 
info "Automatic embedding generation with Embedding API" - When working with embedding models, it is recommended to use the LanceDB embedding API to automatically create vector representation of the data and queries in the background. See the [quickstart example](#using-the-embedding-api) or the embedding API [guide](./embeddings/) - -### Create an empty table - -Sometimes you may not have the data to insert into the table at creation time. -In this case, you can create an empty table and specify the schema, so that you can add -data to the table at a later time (as long as it conforms to the schema). This is -similar to a `CREATE TABLE` statement in SQL. - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:create_empty_table" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:create_empty_table_async" - ``` - - !!! note "You can define schema in Pydantic" - LanceDB comes with Pydantic support, which allows you to define the schema of your data using Pydantic models. This makes it easy to work with LanceDB tables and data. Learn more about all supported types in [tables guide](./guides/tables.md). 
- -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/basic.test.ts:create_empty_table" - ``` - - === "vectordb (deprecated)" - - ```typescript - --8<-- "docs/src/basic_legacy.ts:create_empty_table" - ``` - -=== "Rust" - - ```rust - --8<-- "rust/lancedb/examples/simple.rs:create_empty_table" - ``` - -## Open an existing table - -Once created, you can open a table as follows: - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:open_table" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:open_table_async" - ``` - -=== "Typescript[^1]" - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/basic.test.ts:open_table" - ``` - - === "vectordb (deprecated)" - - ```typescript - const tbl = await db.openTable("myTable"); - ``` - - -=== "Rust" - - ```rust - --8<-- "rust/lancedb/examples/simple.rs:open_existing_tbl" - ``` - -If you forget the name of your table, you can always get a listing of all table names: - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:table_names" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:table_names_async" - ``` - -=== "Typescript[^1]" - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/basic.test.ts:table_names" - ``` - - === "vectordb (deprecated)" - - ```typescript - console.log(await db.tableNames()); - ``` - -=== "Rust" - - ```rust - --8<-- "rust/lancedb/examples/simple.rs:list_names" - ``` - -## Add data to a table - -After a table has been created, you can always add more data to it as follows: - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:add_data" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:add_data_async" - ``` - -=== "Typescript[^1]" - === "@lancedb/lancedb" - - ```typescript - --8<-- 
"nodejs/examples/basic.test.ts:add_data" - ``` - - === "vectordb (deprecated)" - - ```typescript - --8<-- "docs/src/basic_legacy.ts:add" - ``` - -=== "Rust" - - ```rust - --8<-- "rust/lancedb/examples/simple.rs:add" - ``` - -## Search for nearest neighbors - -Once you've embedded the query, you can find its nearest neighbors as follows: - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:vector_search" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:vector_search_async" - ``` - - This returns a pandas DataFrame with the results. - -=== "Typescript[^1]" - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/basic.test.ts:vector_search" - ``` - - === "vectordb (deprecated)" - - ```typescript - --8<-- "docs/src/basic_legacy.ts:search" - ``` - -=== "Rust" - - ```rust - use futures::TryStreamExt; - - --8<-- "rust/lancedb/examples/simple.rs:search" - ``` - - !!! Query vectors in Rust - Rust does not yet support automatic execution of embedding functions. You will need to - calculate embeddings yourself. Support for this is on the roadmap and can be tracked at - https://github.com/lancedb/lancedb/issues/994 - - Query vectors can be provided as Arrow arrays or a Vec/slice of Rust floats. - Support for additional formats (e.g. `polars::series::Series`) is on the roadmap. - -By default, LanceDB runs a brute-force scan over dataset to find the K nearest neighbours (KNN). -For tables with more than 50K vectors, creating an ANN index is recommended to speed up search performance. 
-LanceDB allows you to create an ANN index on a table as follows: - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:create_index" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:create_index_async" - ``` - -=== "Typescript[^1]" - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/basic.test.ts:create_index" - ``` - - === "vectordb (deprecated)" - - ```{.typescript .ignore} - --8<-- "docs/src/basic_legacy.ts:create_index" - ``` - -=== "Rust" - - ```rust - --8<-- "rust/lancedb/examples/simple.rs:create_index" - ``` - -!!! note "Why do I need to create an index manually?" -LanceDB does not automatically create the ANN index for two reasons. The first is that it's optimized -for really fast retrievals via a disk-based index, and the second is that data and query workloads can -be very diverse, so there's no one-size-fits-all index configuration. LanceDB provides many parameters -to fine-tune index size, query latency and accuracy. See the section on -[ANN indexes](ann_indexes.md) for more details. - -## Delete rows from a table - -Use the `delete()` method on tables to delete rows from a table. To choose -which rows to delete, provide a filter that matches on the metadata columns. -This can delete any number of rows that match the filter. 
- -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:delete_rows" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:delete_rows_async" - ``` - -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/basic.test.ts:delete_rows" - ``` - - === "vectordb (deprecated)" - - ```typescript - --8<-- "docs/src/basic_legacy.ts:delete" - ``` - -=== "Rust" - - ```rust - --8<-- "rust/lancedb/examples/simple.rs:delete" - ``` - -The deletion predicate is a SQL expression that supports the same expressions -as the `where()` clause (`only_if()` in Rust) on a search. They can be as -simple or complex as needed. To see what expressions are supported, see the -[SQL filters](sql.md) section. - -=== "Python" - - === "Sync API" - Read more: [lancedb.table.Table.delete][] - === "Async API" - Read more: [lancedb.table.AsyncTable.delete][] - -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - Read more: [lancedb.Table.delete](javascript/interfaces/Table.md#delete) - - === "vectordb (deprecated)" - - Read more: [vectordb.Table.delete](javascript/interfaces/Table.md#delete) - -=== "Rust" - - Read more: [lancedb::Table::delete](https://docs.rs/lancedb/latest/lancedb/table/struct.Table.html#method.delete) - -## Drop a table - -Use the `drop_table()` method on the database to remove a table. - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:drop_table" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:drop_table_async" - ``` - - This permanently removes the table and is not recoverable, unlike deleting rows. - By default, if the table does not exist an exception is raised. To suppress this, - you can pass in `ignore_missing=True`. 
- -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/basic.test.ts:drop_table" - ``` - - === "vectordb (deprecated)" - - ```typescript - --8<-- "docs/src/basic_legacy.ts:drop_table" - ``` - - This permanently removes the table and is not recoverable, unlike deleting rows. - If the table does not exist an exception is raised. - -=== "Rust" - - ```rust - --8<-- "rust/lancedb/examples/simple.rs:drop_table" - ``` - - -## Using the Embedding API -You can use the embedding API when working with embedding models. It automatically vectorizes the data at ingestion and query time and comes with built-in integrations with popular embedding models like Openai, Hugging Face, Sentence Transformers, CLIP and more. - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_embeddings_optional.py:imports" - - --8<-- "python/python/tests/docs/test_embeddings_optional.py:openai_embeddings" - ``` - === "Async API" - - Coming soon to the async API. - https://github.com/lancedb/lancedb/issues/1938 - -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/embedding.test.ts:imports" - --8<-- "nodejs/examples/embedding.test.ts:openai_embeddings" - ``` - -=== "Rust" - - ```rust - --8<-- "rust/lancedb/examples/openai.rs:imports" - --8<-- "rust/lancedb/examples/openai.rs:openai_embeddings" - ``` - -Learn about using the existing integrations and creating custom embedding functions in the [embedding API guide](./embeddings/index.md). - - -## What's next - -This section covered the very basics of using LanceDB. If you're learning about vector databases for the first time, you may want to read the page on [indexing](concepts/index_ivfpq.md) to get familiar with the concepts. - -If you've already worked with other vector databases, you may want to read the [guides](guides/tables.md) to learn how to work with LanceDB in more detail. 
- -[^1]: The `vectordb` package is a legacy package that is deprecated in favor of `@lancedb/lancedb`. The `vectordb` package will continue to receive bug fixes and security updates until September 2024. We recommend all new projects use `@lancedb/lancedb`. See the [migration guide](migration.md) for more information. diff --git a/docs/src/basic_legacy.ts b/docs/src/basic_legacy.ts deleted file mode 100644 index a3a71be8..00000000 --- a/docs/src/basic_legacy.ts +++ /dev/null @@ -1,126 +0,0 @@ -// --8<-- [start:import] -import * as lancedb from "vectordb"; -import { - Schema, - Field, - Float32, - FixedSizeList, - Int32, - Float16, -} from "apache-arrow"; -import * as arrow from "apache-arrow"; -// --8<-- [end:import] -import * as fs from "fs"; -import { Table as ArrowTable, Utf8 } from "apache-arrow"; - -const example = async () => { - fs.rmSync("data/sample-lancedb", { recursive: true, force: true }); - // --8<-- [start:open_db] - const lancedb = require("vectordb"); - const uri = "data/sample-lancedb"; - const db = await lancedb.connect(uri); - // --8<-- [end:open_db] - - // --8<-- [start:create_table] - const tbl = await db.createTable( - "myTable", - [ - { vector: [3.1, 4.1], item: "foo", price: 10.0 }, - { vector: [5.9, 26.5], item: "bar", price: 20.0 }, - ], - { writeMode: lancedb.WriteMode.Overwrite }, - ); - // --8<-- [end:create_table] - { - // --8<-- [start:create_table_with_schema] - const schema = new arrow.Schema([ - new arrow.Field( - "vector", - new arrow.FixedSizeList( - 2, - new arrow.Field("item", new arrow.Float32(), true), - ), - ), - new arrow.Field("item", new arrow.Utf8(), true), - new arrow.Field("price", new arrow.Float32(), true), - ]); - const data = [ - { vector: [3.1, 4.1], item: "foo", price: 10.0 }, - { vector: [5.9, 26.5], item: "bar", price: 20.0 }, - ]; - const tbl = await db.createTable({ - name: "myTableWithSchema", - data, - schema, - }); - // --8<-- [end:create_table_with_schema] - } - - // --8<-- [start:add] - const newData = 
Array.from({ length: 500 }, (_, i) => ({ - vector: [i, i + 1], - item: "fizz", - price: i * 0.1, - })); - await tbl.add(newData); - // --8<-- [end:add] - - // --8<-- [start:create_index] - await tbl.createIndex({ - type: "ivf_pq", - num_partitions: 2, - num_sub_vectors: 2, - }); - // --8<-- [end:create_index] - - // --8<-- [start:create_empty_table] - const schema = new arrow.Schema([ - new arrow.Field("id", new arrow.Int32()), - new arrow.Field("name", new arrow.Utf8()), - ]); - - const empty_tbl = await db.createTable({ name: "empty_table", schema }); - // --8<-- [end:create_empty_table] - { - // --8<-- [start:create_f16_table] - const dim = 16; - const total = 10; - const schema = new Schema([ - new Field("id", new Int32()), - new Field( - "vector", - new FixedSizeList(dim, new Field("item", new Float16(), true)), - false, - ), - ]); - const data = lancedb.makeArrowTable( - Array.from(Array(total), (_, i) => ({ - id: i, - vector: Array.from(Array(dim), Math.random), - })), - { schema }, - ); - const table = await db.createTable("f16_tbl", data); - // --8<-- [end:create_f16_table] - } - - // --8<-- [start:search] - const query = await tbl.search([100, 100]).limit(2).execute(); - // --8<-- [end:search] - - // --8<-- [start:delete] - await tbl.delete('item = "fizz"'); - // --8<-- [end:delete] - - // --8<-- [start:drop_table] - await db.dropTable("myTable"); - // --8<-- [end:drop_table] -}; - -async function main() { - console.log("basic_legacy.ts: start"); - await example(); - console.log("basic_legacy.ts: done"); -} - -main(); diff --git a/docs/src/cloud/cloud_faq.md b/docs/src/cloud/cloud_faq.md deleted file mode 100644 index 7c49b9d6..00000000 --- a/docs/src/cloud/cloud_faq.md +++ /dev/null @@ -1,34 +0,0 @@ -This section provides answers to the most common questions asked about LanceDB Cloud. By following these guidelines, you can ensure a smooth, performant experience with LanceDB Cloud. - -### Should I reuse the database connection? -Yes! 
It is recommended to establish a single database connection and maintain it throughout your interaction with the tables within. - -LanceDB uses HTTP connections to communicate with the servers. By re-using the Connection object, you avoid the overhead of repeatedly establishing HTTP connections, significantly improving efficiency. - -### Should I re-use the `Table` object? -`table = db.open_table()` should be called once and used for all subsequent table operations. If there are changes to the opened table, `table` always reflects the **latest version** of the data. - -### What should I do if I need to search for rows by `id`? -LanceDB Cloud currently does not support an ID or primary key column. We recommend adding a -user-defined ID column. To significantly improve the query performance with SQL clauses, a scalar BITMAP/BTREE index should be created on this column. - -### What are the vector indexing types supported by LanceDB Cloud? -We support `IVF_PQ` and `IVF_HNSW_SQ` as the `index_type` which is passed to `create_index`. LanceDB Cloud tunes the indexing parameters automatically to achieve the best tradeoff between query latency and query quality. - -### When I add new rows to a table, do I need to manually update the index? -No! LanceDB Cloud triggers an asynchronous background job to index the new vectors. - -Even though indexing is asynchronous, your vectors will still be immediately searchable. LanceDB uses brute-force search to search over unindexed rows. This makes your new data immediately available, but does increase latency temporarily. To disable the brute-force part of search, set the `fast_search` flag in your query to `true`. - -### Do I need to reindex the whole dataset if only a small portion of the data is deleted or updated? -No! Similar to adding data to the table, LanceDB Cloud triggers an asynchronous background job to update the existing indices. 
Therefore, no action is needed from users and there is absolutely no -downtime expected. - -### How do I know whether an index has been created? -While index creation in LanceDB Cloud is generally fast, querying immediately after a `create_index` call may result in errors. It's recommended to use `list_indices` to verify index creation before querying. - -### Why is my query latency higher than expected? -Multiple factors can impact query latency. To reduce query latency, consider the following: -- Send pre-warm queries: send a few queries to warm up the cache before an actual user query. -- Check network latency: LanceDB Cloud is hosted in the AWS `us-east-1` region. It is recommended to run queries from an EC2 instance that is in the same region. -- Create scalar indices: If you are filtering on metadata, it is recommended to create scalar indices on those columns. This will speed up searches with metadata filtering. See [here](../guides/scalar_index.md) for more details on creating a scalar index. \ No newline at end of file diff --git a/docs/src/cloud/index.md b/docs/src/cloud/index.md deleted file mode 100644 index fa2dc627..00000000 --- a/docs/src/cloud/index.md +++ /dev/null @@ -1,17 +0,0 @@ -# About LanceDB Cloud - -LanceDB Cloud is a SaaS (software-as-a-service) solution that runs serverless in the cloud, clearly separating storage from compute. It's designed to be highly scalable without breaking the bank. LanceDB Cloud is currently in private beta with general availability coming soon, but you can apply for early access with the private beta release by signing up below. - -[Try out LanceDB Cloud (Public Beta)](https://cloud.lancedb.com){ .md-button .md-button--primary } - -## Architecture - -LanceDB Cloud provides the same underlying fast vector store that powers the OSS version, but without the need to maintain your own infrastructure. 
Because it's serverless, you only pay for the storage you use, and you can scale compute up and down as needed depending on the size of your data and its associated index. - -![](../assets/lancedb_cloud.png) - -## Transitioning from the OSS to the Cloud version - -The OSS version of LanceDB is designed to be embedded in your application, and it runs in-process. This makes it incredibly simple to self-host your own AI retrieval workflows for RAG and more and build and test out your concepts on your own infrastructure. The OSS version is forever free, and you can continue to build and integrate LanceDB into your existing backend applications without any added costs. - -Should you decide that you need a managed deployment in production, it's possible to seamlessly transition from the OSS to the cloud version by changing the connection string to point to a remote database instead of a local one. With LanceDB Cloud, you can take your AI application from development to production without major code changes or infrastructure burden. diff --git a/docs/src/cloud/rest.md b/docs/src/cloud/rest.md deleted file mode 100644 index 12e4bd06..00000000 --- a/docs/src/cloud/rest.md +++ /dev/null @@ -1 +0,0 @@ -!!swagger ../../openapi.yml!! diff --git a/docs/src/concepts/data_management.md b/docs/src/concepts/data_management.md deleted file mode 100644 index c0d78bbf..00000000 --- a/docs/src/concepts/data_management.md +++ /dev/null @@ -1,62 +0,0 @@ -# Data management - -This section covers concepts related to managing your data over time in LanceDB. - -## A primer on Lance - -Because LanceDB is built on top of the [Lance](https://lancedb.github.io/lance/) data format, it helps to understand some of its core ideas. Just like Apache Arrow, Lance is a fast columnar data format, but it has the added benefit of being versionable, query and train ML models on. 
Lance is designed to be used with simple and complex data types, like tabular data, images, videos, audio, 3D point clouds (which are deeply nested) and more. - -The following concepts are important to keep in mind: - -- Data storage is columnar and is interoperable with other columnar formats (such as Parquet) via Arrow -- Data is divided into fragments that represent a subset of the data -- Data is versioned, with each insert operation creating a new version of the dataset and an update to the manifest that tracks versions via metadata - -!!! note - 1. First, each version contains metadata and just the new/updated data in your transaction. So if you have 100 versions, they aren't 100 duplicates of the same data. However, they do have 100x the metadata overhead of a single version, which can result in slower queries. - 2. Second, these versions exist to keep LanceDB scalable and consistent. We do not immediately blow away old versions when creating new ones because other clients might be in the middle of querying the old version. It's important to retain older versions for as long as they might be queried. - -## What are fragments? - -Fragments are chunks of data in a Lance dataset. Each fragment includes multiple files that contain several columns in the chunk of data that it represents. - -## Compaction - -As you insert more data, your dataset will grow and you'll need to perform *compaction* to maintain query throughput (i.e., keep latencies down to a minimum). Compaction is the process of merging fragments together to reduce the amount of metadata that needs to be managed, and to reduce the number of files that need to be opened while scanning the dataset. - -### How does compaction improve performance? 
- -Compaction performs the following tasks in the background: - -- Removes deleted rows from fragments -- Removes dropped columns from fragments -- Merges small fragments into larger ones - -Depending on the use case and dataset, optimal compaction will have different requirements. As a rule of thumb: - -- It’s always better to use *batch* inserts rather than adding 1 row at a time (to avoid fragments that are too small). If single-row inserts are unavoidable, run compaction on a regular basis to merge them into larger fragments. -- Keep the number of fragments under 100, which is suitable for most use cases (for *really* large datasets of >500M rows, more fragments might be needed) - -!!! note - - LanceDB Cloud/Enterprise supports [auto-compaction](https://docs.lancedb.com/enterprise/architecture/architecture#write-path) which automatically optimizes fragments in the background as data changes. - -## Deletion - -Although Lance allows you to delete rows from a dataset, it does not actually delete the data immediately. It simply marks the row as deleted in the `DataFile` that represents a fragment. For a given version of the dataset, each fragment can have up to one deletion file (if no rows were ever deleted from that fragment, it will not have a deletion file). This is important to keep in mind because it means that the data is still there, and can be recovered if needed, as long as that version still exists based on your backup policy. - -## Reindexing - -Reindexing is the process of updating the index to account for new data, keeping good performance for queries. This applies to either a full-text search (FTS) index or a vector index. For ANN search, new data will always be included in query results, but queries on tables with unindexed data will fall back to slower search methods for the new parts of the table. This is another important operation to run periodically as your data grows, as it also improves performance. 
This is especially important if you're appending large amounts of data to an existing dataset. - -!!! tip - When adding new data to a dataset that has an existing index (either FTS or vector), LanceDB doesn't immediately update the index until a reindex operation is complete. - -Both LanceDB OSS and Cloud support reindexing, but the process (at least for now) is different for each, depending on the type of index. - -In LanceDB OSS, re-indexing happens synchronously when you call either `create_index` or `optimize` on a table. In LanceDB Cloud, re-indexing happens asynchronously as you add and update data in your table. - -By default, queries will search new data even if it has yet to be indexed. This is done using brute-force methods, such as kNN for vector search, and combined with the fast index search results. This is done to ensure that you're always searching over all your data, but it does come at a performance cost. Without reindexing, adding more data to a table will make queries slower and more expensive. This behavior can be disabled by setting the [fast_search](https://lancedb.github.io/lancedb/python/python/#lancedb.query.AsyncQuery.fast_search) parameter which will instruct the query to ignore un-indexed data. 
- -* LanceDB Cloud/Enterprise supports [automatic incremental reindexing](https://docs.lancedb.com/core#vector-index) for vector, scalar, and FTS indices, where a background process will trigger a new index build for you automatically when new data is added or modified in a dataset -* LanceDB OSS requires you to manually trigger a reindex operation -- we are working on adding incremental reindexing to LanceDB OSS as well diff --git a/docs/src/concepts/index_hnsw.md b/docs/src/concepts/index_hnsw.md deleted file mode 100644 index ca74b2ed..00000000 --- a/docs/src/concepts/index_hnsw.md +++ /dev/null @@ -1,99 +0,0 @@ - -# Understanding HNSW index - -Approximate Nearest Neighbor (ANN) search is a method for finding data points near a given point in a dataset, though not always the exact nearest one. HNSW is one of the most accurate and fastest Approximate Nearest Neighbor search algorithms. It’s beneficial in high-dimensional spaces where finding the exact nearest neighbor would be too slow and costly. - -[Jump to usage](#usage) -There are three main types of ANN search algorithms: - -* **Tree-based search algorithms**: Use a tree structure to organize and store data points. -* **Hash-based search algorithms**: Use a specialized geometric hash table to store and manage data points. These algorithms typically focus on theoretical guarantees, and don't usually perform as well as the other approaches in practice. -* **Graph-based search algorithms**: Use a graph structure to store data points, which can be a bit complex. - -HNSW is a graph-based algorithm. All graph-based search algorithms rely on the idea of a k-nearest neighbor (or k-approximate nearest neighbor) graph, which we outline below. -HNSW also combines this with the ideas behind a classic 1-dimensional search data structure: the skip list. - -## k-Nearest Neighbor Graphs and k-approximate Nearest neighbor Graphs -The k-nearest neighbor graph actually predates its use for ANN search. 
Its construction is quite simple: - -* Each vector in the dataset is given an associated vertex. -* Each vertex has outgoing edges to its k nearest neighbors. That is, the k closest other vertices by Euclidean distance between the two corresponding vectors. This can be thought of as a "friend list" for the vertex. -* For some applications (including nearest-neighbor search), the incoming edges are also added. - -Eventually, it was realized that the following greedy search method over such a graph typically results in good approximate nearest neighbors: - -* Given a query vector, start at some fixed "entry point" vertex (e.g. the approximate center node). -* Look at that vertex's neighbors. If any of them are closer to the query vector than the current vertex, then move to that vertex. -* Repeat until a local optimum is found. - -The above algorithm also generalizes to e.g. top 10 approximate nearest neighbors. - -Computing a k-nearest neighbor graph is actually quite slow, taking quadratic time in the dataset size. It was quickly realized that near-identical performance can be achieved using a k-approximate nearest neighbor graph. That is, instead of obtaining the k-nearest neighbors for each vertex, an approximate nearest neighbor search data structure is used to build much faster. -In fact, another data structure is not needed: This can be done "incrementally". -That is, if you start with a k-ANN graph for n-1 vertices, you can extend it to a k-ANN graph for n vertices as well by using the graph to obtain the k-ANN for the new vertex. - -One downside of k-NN and k-ANN graphs alone is that one must typically build them with a large value of k to get decent results, resulting in a large index. 
- - -## HNSW: Hierarchical Navigable Small Worlds - -HNSW builds on k-ANN in two main ways: - -* Instead of getting the k-approximate nearest neighbors for a large value of k, it sparsifies the k-ANN graph using a carefully chosen "edge pruning" heuristic, allowing for the number of edges per vertex to be limited to a relatively small constant. -* The "entry point" vertex is chosen dynamically using a recursively constructed data structure on a subset of the data, similarly to a skip list. - -This recursive structure can be thought of as separating into layers: - -* At the bottom-most layer, a k-ANN graph on the whole dataset is present. -* At the second layer, a k-ANN graph on a fraction of the dataset (e.g. 10%) is present. -* At the Lth layer, a k-ANN graph is present. It is over a (constant) fraction (e.g. 10%) of the vectors/vertices present in the L-1th layer. - -Then the greedy search routine operates as follows: - -* At the top layer (using an arbitrary vertex as an entry point), use the greedy local search routine on the k-ANN graph to get an approximate nearest neighbor at that layer. -* Using the approximate nearest neighbor found in the previous layer as an entry point, find an approximate nearest neighbor in the next layer with the same method. -* Repeat until the bottom-most layer is reached. Then use the entry point to find multiple nearest neighbors (e.g. top 10). - - -## Usage - -There are three key parameters to set when constructing an HNSW index: - -* `metric`: Use an `l2` euclidean distance metric. We also support `dot` and `cosine` distance. -* `m`: The number of neighbors to select for each vector in the HNSW graph. -* `ef_construction`: The number of candidates to evaluate during the construction of the HNSW graph. - - -We can combine the above concepts to understand how to build and query an HNSW index in LanceDB. 
- -### Construct index - -```python -import lancedb -import numpy as np -uri = "/tmp/lancedb" -db = lancedb.connect(uri) - -# Create 10,000 sample vectors -data = [ - {"vector": row, "item": f"item {i}"} - for i, row in enumerate(np.random.random((10_000, 1536)).astype('float32')) -] - -# Add the vectors to a table -tbl = db.create_table("my_vectors", data=data) - -# Create and train the HNSW index for a 1536-dimensional vector -# Make sure you have enough data in the table for an effective training step -tbl.create_index(index_type=IVF_HNSW_SQ) - -``` - -### Query the index - -```python -# Search using a random 1536-dimensional embedding -tbl.search(np.random.random((1536))) \ - .limit(2) \ - .to_pandas() -``` diff --git a/docs/src/concepts/index_ivfpq.md b/docs/src/concepts/index_ivfpq.md deleted file mode 100644 index 0a9de0d4..00000000 --- a/docs/src/concepts/index_ivfpq.md +++ /dev/null @@ -1,86 +0,0 @@ -# Understanding LanceDB's IVF-PQ index - -An ANN (Approximate Nearest Neighbors) index is a data structure that represents data in a way that makes it more efficient to search and retrieve. Using an ANN index is faster, but less accurate than kNN or brute force search because, in essence, the index is a lossy representation of the data. - -LanceDB is fundamentally different from other vector databases in that it is built on top of [Lance](https://github.com/lancedb/lance), an open-source columnar data format designed for performant ML workloads and fast random access. Due to the design of Lance, LanceDB's indexing philosophy adopts a primarily *disk-based* indexing philosophy. - -## IVF-PQ - -IVF-PQ is a composite index that combines inverted file index (IVF) and product quantization (PQ). The implementation in LanceDB provides several parameters to fine-tune the index's size, query throughput, latency and recall, which are described later in this section. 
- -### Product quantization - -Quantization is a compression technique used to reduce the dimensionality of an embedding to speed up search. - -Product quantization (PQ) works by dividing a large, high-dimensional vector into equally sized subvectors. Each subvector is assigned a "reproduction value" that maps to the nearest centroid of points for that subvector. The reproduction values are then assigned to a codebook using unique IDs, which can be used to reconstruct the original vector. - -![](../assets/ivfpq_pq_desc.png) - -It's important to remember that quantization is a *lossy process*, i.e., the reconstructed vector is not identical to the original vector. This results in a trade-off between the size of the index and the accuracy of the search results. - -As an example, consider starting with a 128-dimensional vector consisting of 32-bit floats. Quantizing it to an 8-bit integer vector with 4 dimensions as in the image above, we can significantly reduce memory requirements. - -!!! example "Effect of quantization" - - Original: `128 × 32 = 4096` bits - Quantized: `4 × 8 = 32` bits - - Quantization results in a **128x** reduction in memory requirements for each vector in the index, which is substantial. - -### Inverted file index - -While PQ helps with reducing the size of the index, IVF primarily addresses search performance. The primary purpose of an inverted file index is to facilitate rapid and effective nearest neighbor search by narrowing down the search space. - -In IVF, the PQ vector space is divided into *Voronoi cells*, which are essentially partitions that consist of all the points in the space that are within a threshold distance of the given region's seed point. These seed points are initialized by running K-means over the stored vectors. The centroids of K-means turn into the seed points which then each define a region. 
These regions are then used to create an inverted index that correlates each centroid with a list of vectors in the space, allowing a search to be restricted to just a subset of vectors in the index. - -![](../assets/ivfpq_ivf_desc.webp) - -During query time, depending on where the query lands in vector space, it may be close to the border of multiple Voronoi cells, which could make the top-k results ambiguous and span across multiple cells. To address this, IVF-PQ introduces the `nprobes` parameter, which controls the number of Voronoi cells to search during a query. The higher the `nprobes`, the more accurate the results, but the slower the query. - -![](../assets/ivfpq_query_vector.webp) - -## Putting it all together - -We can combine the above concepts to understand how to build and query an IVF-PQ index in LanceDB. - -### Construct index - -There are three key parameters to set when constructing an IVF-PQ index: - -* `metric`: Use an `l2` euclidean distance metric. We also support `dot` and `cosine` distance. -* `num_partitions`: The number of partitions in the IVF portion of the index. -* `num_sub_vectors`: The number of sub-vectors that will be created during Product Quantization (PQ). - -In Python, the index can be created as follows: - -```python -# Create and train the index for a 1536-dimensional vector -# Make sure you have enough data in the table for an effective training step -tbl.create_index(metric="l2", num_partitions=256, num_sub_vectors=96) -``` -!!! note - `num_partitions`=256 and `num_sub_vectors`=96 do not work for every dataset. Those values need to be adjusted for your particular dataset. - -The `num_partitions` is usually chosen to target a particular number of vectors per partition. `num_sub_vectors` is typically chosen based on the desired recall and the dimensionality of the vector. See [here](../ann_indexes.md/#how-to-choose-num_partitions-and-num_sub_vectors-for-ivf_pq-index) for best practices on choosing these parameters. 
- - -### Query the index - -```python -# Search using a random 1536-dimensional embedding -tbl.search(np.random.random((1536))) \ - .limit(2) \ - .nprobes(20) \ - .refine_factor(10) \ - .to_pandas() -``` - -The above query will perform a search on the table `tbl` using the given query vector, with the following parameters: - -* `limit`: The number of results to return -* `nprobes`: The number of probes determines the distribution of vector space. While a higher number enhances search accuracy, it also results in slower performance. Typically, setting `nprobes` to cover 5–10% of the dataset proves effective in achieving high recall with minimal latency. -* `refine_factor`: Refine the results by reading extra elements and re-ranking them in memory. A higher number makes the search more accurate but also slower (see the [FAQ](../faq.md#do-i-need-to-set-a-refine-factor-when-using-an-index) page for more details on this). -* `to_pandas()`: Convert the results to a pandas DataFrame - -And there you have it! You now understand what an IVF-PQ index is, and how to create and query it in LanceDB. -To see how to create an IVF-PQ index in LanceDB, take a look at the [ANN indexes](../ann_indexes.md) section. diff --git a/docs/src/concepts/storage.md b/docs/src/concepts/storage.md deleted file mode 100644 index 0bf3b23c..00000000 --- a/docs/src/concepts/storage.md +++ /dev/null @@ -1,80 +0,0 @@ -# Storage - -LanceDB is among the only vector databases built on top of multiple modular components designed from the ground-up to be efficient on disk. This gives it the unique benefit of being flexible enough to support multiple storage backends, including local NVMe, EBS, EFS and many other third-party APIs that connect to the cloud. - -It is important to understand the tradeoffs between cost and latency for your specific application and use case. This section will help you understand the tradeoffs between the different storage backends. 
- -## Storage options - -We've prepared a simple diagram to showcase the thought process that goes into choosing a storage backend when using LanceDB OSS, Cloud or Enterprise. - -![](../assets/lancedb_storage_tradeoffs.png) - -When architecting your system, you'd typically ask yourself the following questions to decide on a storage option: - -1. **Latency**: How fast do I need results? What do the p50 and also p95 look like? -2. **Scalability**: Can I scale up the amount of data and QPS easily? -3. **Cost**: To serve my application, what’s the all-in cost of *both* storage and serving infra? -4. **Reliability/Availability**: How does replication work? Is disaster recovery addressed? - -## Tradeoffs - -This section reviews the characteristics of each storage option in four dimensions: latency, scalability, cost and reliability. - -**We begin with the lowest cost option, and end with the lowest latency option.** - -### 1. S3 / GCS / Azure Blob Storage - -!!! tip "Lowest cost, highest latency" - - - **Latency** ⇒ Has the highest latency. p95 latency is also substantially worse than p50. In general you get results in the order of several hundred milliseconds - - **Scalability** ⇒ Infinite on storage, however, QPS will be limited by S3 concurrency limits - - **Cost** ⇒ Lowest (order of magnitude cheaper than other options) - - **Reliability/Availability** ⇒ Highly available, as blob storage like S3 are critical infrastructure that form the backbone of the internet. - -Another important point to note is that LanceDB is designed to separate storage from compute, and the underlying Lance format stores the data in numerous immutable fragments. Due to these factors, LanceDB is a great storage option that addresses the _N + 1_ query problem. i.e., when a high query throughput is required, query processes can run in a stateless manner and be scaled up and down as needed. - -### 2. EFS / GCS Filestore / Azure File Storage - -!!! 
info "Moderately low cost, moderately low latency (<100ms)" - - - **Latency** ⇒ Much better than object/blob storage but not as good as EBS/Local disk; < 100ms p95 achievable - - **Scalability** ⇒ High, but the bottleneck will be the IOPs limit, but when scaling you can provision multiple EFS volumes - - **Cost** ⇒ Significantly more expensive than S3 but still very cost effective compared to in-memory dbs. Inactive data in EFS is also automatically tiered to S3-level costs. - - **Reliability/Availability** ⇒ Highly available, as query nodes can go down without affecting EFS. However, EFS does not provide replication / backup - this must be managed manually. - -A recommended best practice is to keep a copy of the data on S3 for disaster recovery scenarios. If any downtime is unacceptable, then you would need another EFS with a copy of the data. This is still much cheaper than EC2 instances holding multiple copies of the data. - -### 3. Third-party storage solutions - -Solutions like [MinIO](https://blog.min.io/lancedb-trusted-steed-against-data-complexity/), WekaFS, etc. that deliver S3 compatible API with much better performance than S3. - -!!! info "Moderately low cost, moderately low latency (<100ms)" - - - **Latency** ⇒ Should be similar latency to EFS, better than S3 (<100ms) - - **Scalability** ⇒ Up to the solutions architect, who can add as many nodes to their MinIO or other third-party provider's cluster as needed - - **Cost** ⇒ Definitely higher than S3. The cost can be marginally higher than EFS until you get to maybe >10TB scale with high utilization - - **Reliability/Availability** ⇒ These are all shareable by lots of nodes, quality/cost of replication/backup depends on the vendor - - -### 4. EBS / GCP Persistent Disk / Azure Managed Disk - -!!! info "Very low latency (<30ms), higher cost" - - - **Latency** ⇒ Very good, pretty close to local disk. You’re looking at <30ms latency in most cases - - **Scalability** ⇒ EBS is not shareable between instances. 
If deployed via k8s, it can be shared between pods that live on the same instance, but beyond that you would need to shard data or make an additional copy - - **Cost** ⇒ Higher than EFS. There are some hidden costs to EBS as well if you’re paying for IO. - - **Reliability/Availability** ⇒ Not shareable between instances but can be shared between pods on the same instance. Survives instance termination. No automatic backups. - -Just like EFS, an EBS or persistent disk setup requires more manual work to manage data sharding, backups and capacity. - -### 5. Local disk (SSD/NVMe) - -!!! danger "Lowest latency (<10ms), highest cost" - - - **Latency** ⇒ Lowest latency with modern NVMe drives, <10ms p95 - - **Scalability** ⇒ Difficult to scale on cloud. Also need additional copies / sharding if QPS needs to be higher - - **Cost** ⇒ Highest cost; the main issue with keeping your application and storage tightly integrated is that it’s just not really possible to scale this up in cloud environments - - **Reliability/Availability** ⇒ If the instance goes down, so does your data. You have to be _very_ diligent about backing up your data - -As a rule of thumb, local disk should be your storage option if you require absolutely *crazy low* latency and you’re willing to do a bunch of data management work to make it happen. diff --git a/docs/src/concepts/vector_search.md b/docs/src/concepts/vector_search.md deleted file mode 100644 index 5a6f2018..00000000 --- a/docs/src/concepts/vector_search.md +++ /dev/null @@ -1,36 +0,0 @@ -# Vector search - -Vector search is a technique used to search for similar items based on their vector representations, called embeddings. It is also known as similarity search, nearest neighbor search, or approximate nearest neighbor search. - -Raw data (e.g. text, images, audio, etc.) is converted into embeddings via an embedding model, which are then stored in a vector database like LanceDB. 
To perform similarity search at scale, an index is created on the stored embeddings, which can then be used to perform fast lookups. - -![](../assets/vector-db-basics.png) - -## Embeddings - -Modern machine learning models can be trained to convert raw data into embeddings, represented as arrays (or vectors) of floating point numbers of fixed dimensionality. What makes embeddings useful in practice is that the position of an embedding in vector space captures some of the semantics of the data, depending on the type of model and how it was trained. Points that are close to each other in vector space are considered similar (or appear in similar contexts), and points that are far away are considered dissimilar. - -Large datasets of multi-modal data (text, audio, images, etc.) can be converted into embeddings with the appropriate model. Projecting the vectors' principal components in 2D space results in groups of vectors that represent similar concepts clustering together, as shown below. - -![](../assets/embedding_intro.png) - -## Indexes - -Embeddings for a given dataset are made searchable via an **index**. The index is constructed by using data structures that store the embeddings such that it's very efficient to perform scans and lookups on them. A key distinguishing feature of LanceDB is that it uses a disk-based index: IVF-PQ, which is a variant of the Inverted File Index (IVF) that uses Product Quantization (PQ) to compress the embeddings. - -See the [IVF-PQ](./index_ivfpq.md) page for more details on how it works. - -## Brute force search - -The simplest way to perform vector search is to perform a brute force search, without an index, where the distances between the query vector and all the vectors in the database are computed, with the top-k closest vectors returned. This is equivalent to a k-nearest neighbours (kNN) search in vector space. 
- -![](../assets/knn_search.png) - -As you can imagine, the brute force approach is not scalable for datasets larger than a few hundred thousand vectors, as the latency of the search grows linearly with the size of the dataset. This is where approximate nearest neighbour (ANN) algorithms come in. - -## Approximate nearest neighbour (ANN) search - -Instead of performing an exhaustive search on the entire database for each and every query, approximate nearest neighbour (ANN) algorithms use an index to narrow down the search space, which significantly reduces query latency. The trade-off is that the results are not guaranteed to be the true nearest neighbors of the query, but are usually "good enough" for most use cases. - - - diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md deleted file mode 100644 index 72a7e825..00000000 --- a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md +++ /dev/null @@ -1,67 +0,0 @@ -# Imagebind embeddings -We have support for [imagebind](https://github.com/facebookresearch/ImageBind) model embeddings. You can download our version of the packaged model via - `pip install imagebind-packaged==0.1.2`. - -This function is registered as `imagebind` and supports Audio, Video and Text modalities(extending to Thermal,Depth,IMU data): - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"imagebind_huge"` | Name of the model. | -| `device` | `str` | `"cpu"` | The device to run the model on. Can be `"cpu"` or `"gpu"`. | -| `normalize` | `bool` | `False` | set to `True` to normalize your inputs before model ingestion. 
| - -Below is an example demonstrating how the API works: - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect(tmp_path) -func = get_registry().get("imagebind").create() - -class ImageBindModel(LanceModel): - text: str - image_uri: str = func.SourceField() - audio_path: str - vector: Vector(func.ndims()) = func.VectorField() - -# add locally accessible image paths -text_list=["A dog.", "A car", "A bird"] -image_paths=[".assets/dog_image.jpg", ".assets/car_image.jpg", ".assets/bird_image.jpg"] -audio_paths=[".assets/dog_audio.wav", ".assets/car_audio.wav", ".assets/bird_audio.wav"] - -# Load data -inputs = [ - {"text": a, "audio_path": b, "image_uri": c} - for a, b, c in zip(text_list, audio_paths, image_paths) -] - -#create table and add data -table = db.create_table("img_bind", schema=ImageBindModel) -table.add(inputs) -``` - -Now, we can search using any modality: - -#### image search -```python -query_image = "./assets/dog_image2.jpg" #download an image and enter that path here -actual = table.search(query_image).limit(1).to_pydantic(ImageBindModel)[0] -print(actual.text == "dog") -``` -#### audio search - -```python -query_audio = "./assets/car_audio2.wav" #download an audio clip and enter path here -actual = table.search(query_audio).limit(1).to_pydantic(ImageBindModel)[0] -print(actual.text == "car") -``` -#### Text search -You can add any input query and fetch the result as follows: -```python -query = "an animal which flies and tweets" -actual = table.search(query).limit(1).to_pydantic(ImageBindModel)[0] -print(actual.text == "bird") -``` - -If you have any questions about the embeddings API, supported models, or see a relevant model missing, please raise an issue [on GitHub](https://github.com/lancedb/lancedb/issues). 
diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md deleted file mode 100644 index 918c1509..00000000 --- a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md +++ /dev/null @@ -1,51 +0,0 @@ -# Jina Embeddings : Multimodal - -Jina embeddings can also be used to embed both text and image data, only some of the models support image data and you can check the list -under [https://jina.ai/embeddings/](https://jina.ai/embeddings/) - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"jina-clip-v1"` | The model ID of the jina model to use | - -Usage Example: - -```python - import os - import requests - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import get_registry - import pandas as pd - - os.environ['JINA_API_KEY'] = 'jina_*' - - db = lancedb.connect("~/.lancedb") - func = get_registry().get("jina").create() - - - class Images(LanceModel): - label: str - image_uri: str = func.SourceField() # image uri as the source - image_bytes: bytes = func.SourceField() # image bytes as the source - vector: Vector(func.ndims()) = func.VectorField() # vector column - vec_from_bytes: Vector(func.ndims()) = func.VectorField() # Another vector column - - - table = db.create_table("images", schema=Images) - labels = ["cat", "cat", "dog", "dog", "horse", "horse"] - uris = [ - "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg", - "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg", - "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg", - "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg", - "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg", - 
"http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg", - ] - # get each uri as bytes - image_bytes = [requests.get(uri).content for uri in uris] - table.add( - pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": image_bytes}) - ) -``` diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md deleted file mode 100644 index eb6139f5..00000000 --- a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md +++ /dev/null @@ -1,82 +0,0 @@ -# OpenClip embeddings -We support CLIP model embeddings using the open source alternative, [open-clip](https://github.com/mlfoundations/open_clip) which supports various customizations. It is registered as `open-clip` and supports the following customizations: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"ViT-B-32"` | The name of the model. | -| `pretrained` | `str` | `"laion2b_s34b_b79k"` | The name of the pretrained model to load. | -| `device` | `str` | `"cpu"` | The device to run the model on. Can be `"cpu"` or `"gpu"`. | -| `batch_size` | `int` | `64` | The number of images to process in a batch. | -| `normalize` | `bool` | `True` | Whether to normalize the input images before feeding them to the model. | - -This embedding function supports ingesting images as both bytes and urls. You can query them using both test and other images. - -!!! info - LanceDB supports ingesting images directly from accessible links. 
- -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect(tmp_path) -func = get_registry().get("open-clip").create() - -class Images(LanceModel): - label: str - image_uri: str = func.SourceField() # image uri as the source - image_bytes: bytes = func.SourceField() # image bytes as the source - vector: Vector(func.ndims()) = func.VectorField() # vector column - vec_from_bytes: Vector(func.ndims()) = func.VectorField() # Another vector column - -table = db.create_table("images", schema=Images) -labels = ["cat", "cat", "dog", "dog", "horse", "horse"] -uris = [ - "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg", - "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg", - "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg", - "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg", - "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg", - "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg", -] -# get each uri as bytes -image_bytes = [requests.get(uri).content for uri in uris] -table.add( - pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": image_bytes}) -) -``` -Now we can search using text from both the default vector column and the custom vector column -```python - -# text search -actual = table.search("man's best friend").limit(1).to_pydantic(Images)[0] -print(actual.label) # prints "dog" - -frombytes = ( - table.search("man's best friend", vector_column_name="vec_from_bytes") - .limit(1) - .to_pydantic(Images)[0] -) -print(frombytes.label) - -``` - -Because we're using a multi-modal embedding function, we can also search using images - -```python -# image search -query_image_uri = "http://farm1.staticflickr.com/200/467715466_ed4a31801f_z.jpg" -image_bytes = requests.get(query_image_uri).content -query_image = Image.open(io.BytesIO(image_bytes)) -actual = 
table.search(query_image).limit(1).to_pydantic(Images)[0] -print(actual.label == "dog") - -# image search using a custom vector column -other = ( - table.search(query_image, vector_column_name="vec_from_bytes") - .limit(1) - .to_pydantic(Images)[0] -) -print(actual.label) - -``` diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/voyageai_multimodal_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/voyageai_multimodal_embedding.md deleted file mode 100644 index fcda1422..00000000 --- a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/voyageai_multimodal_embedding.md +++ /dev/null @@ -1,97 +0,0 @@ -# VoyageAI Embeddings : Multimodal - -VoyageAI embeddings can also be used to embed both text and image data, only some of the models support image data and you can check the list -under [https://docs.voyageai.com/docs/multimodal-embeddings](https://docs.voyageai.com/docs/multimodal-embeddings) - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|-------------------------|-------------------------------------------| -| `name` | `str` | `"voyage-multimodal-3"` | The model ID of the VoyageAI model to use | - -Usage Example: - -```python -import base64 -import os -from io import BytesIO - -import requests -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry -import pandas as pd - -os.environ['VOYAGE_API_KEY'] = 'YOUR_VOYAGE_API_KEY' - -db = lancedb.connect(".lancedb") -func = get_registry().get("voyageai").create(name="voyage-multimodal-3") - - -def image_to_base64(image_bytes: bytes): - buffered = BytesIO(image_bytes) - img_str = base64.b64encode(buffered.getvalue()) - return img_str.decode("utf-8") - - -class Images(LanceModel): - label: str - image_uri: str = func.SourceField() # image uri as the source - image_bytes: str = 
func.SourceField() # image bytes base64 encoded as the source - vector: Vector(func.ndims()) = func.VectorField() # vector column - vec_from_bytes: Vector(func.ndims()) = func.VectorField() # Another vector column - - -if "images" in db.table_names(): - db.drop_table("images") -table = db.create_table("images", schema=Images) -labels = ["cat", "cat", "dog", "dog", "horse", "horse"] -uris = [ - "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg", - "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg", - "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg", - "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg", - "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg", - "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg", -] -# get each uri as bytes -images_bytes = [image_to_base64(requests.get(uri).content) for uri in uris] -table.add( - pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": images_bytes}) -) -``` -Now we can search using text from both the default vector column and the custom vector column -```python - -# text search -actual = table.search("man's best friend", "vec_from_bytes").limit(1).to_pydantic(Images)[0] -print(actual.label) # prints "dog" - -frombytes = ( - table.search("man's best friend", vector_column_name="vec_from_bytes") - .limit(1) - .to_pydantic(Images)[0] -) -print(frombytes.label) - -``` - -Because we're using a multi-modal embedding function, we can also search using images - -```python -# image search -query_image_uri = "http://farm1.staticflickr.com/200/467715466_ed4a31801f_z.jpg" -image_bytes = requests.get(query_image_uri).content -query_image = Image.open(BytesIO(image_bytes)) -actual = table.search(query_image, "vec_from_bytes").limit(1).to_pydantic(Images)[0] -print(actual.label == "dog") - -# image search using a custom vector column -other = ( - table.search(query_image, vector_column_name="vec_from_bytes") - .limit(1) - .to_pydantic(Images)[0] 
-) -print(actual.label) - -``` diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md deleted file mode 100644 index 036d4b82..00000000 --- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md +++ /dev/null @@ -1,51 +0,0 @@ -# AWS Bedrock Text Embedding Functions - -AWS Bedrock supports multiple base models for generating text embeddings. You need to setup the AWS credentials to use this embedding function. -You can do so by using `awscli` and also add your session_token: -```shell -aws configure -aws configure set aws_session_token "" -``` -to ensure that the credentials are set up correctly, you can run the following command: -```shell -aws sts get-caller-identity -``` - -Supported Embedding modelIDs are: -* `amazon.titan-embed-text-v1` -* `cohere.embed-english-v3` -* `cohere.embed-multilingual-v3` - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| **name** | str | "amazon.titan-embed-text-v1" | The model ID of the bedrock model to use. Supported base models for Text Embeddings: amazon.titan-embed-text-v1, cohere.embed-english-v3, cohere.embed-multilingual-v3 | -| **region** | str | "us-east-1" | Optional name of the AWS Region in which the service should be called (e.g., "us-east-1"). | -| **profile_name** | str | None | Optional name of the AWS profile to use for calling the Bedrock service. If not specified, the default profile will be used. | -| **assumed_role** | str | None | Optional ARN of an AWS IAM role to assume for calling the Bedrock service. If not specified, the current active credentials will be used. | -| **role_session_name** | str | "lancedb-embeddings" | Optional name of the AWS IAM role session to use for calling the Bedrock service. 
If not specified, a "lancedb-embeddings" name will be used. | -| **runtime** | bool | True | Optional choice of getting different client to perform operations with the Amazon Bedrock service. | -| **max_retries** | int | 7 | Optional number of retries to perform when a request fails. | - -Usage Example: - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry -import pandas as pd - -model = get_registry().get("bedrock-text").create() - -class TextModel(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - -df = pd.DataFrame({"text": ["hello world", "goodbye world"]}) -db = lancedb.connect("tmp_path") -tbl = db.create_table("test", schema=TextModel, mode="overwrite") - -tbl.add(df) -rs = tbl.search("hello").limit(1).to_pandas() -``` diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md deleted file mode 100644 index fd99f2ca..00000000 --- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md +++ /dev/null @@ -1,63 +0,0 @@ -# Cohere Embeddings - -Using cohere API requires cohere package, which can be installed using `pip install cohere`. Cohere embeddings are used to generate embeddings for text data. The embeddings can be used for various tasks like semantic search, clustering, and classification. -You also need to set the `COHERE_API_KEY` environment variable to use the Cohere API. 
- -Supported models are: - -- embed-english-v3.0 -- embed-multilingual-v3.0 -- embed-english-light-v3.0 -- embed-multilingual-light-v3.0 -- embed-english-v2.0 -- embed-english-light-v2.0 -- embed-multilingual-v2.0 - - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|--------|---------| -| `name` | `str` | `"embed-english-v2.0"` | The model ID of the cohere model to use. Supported base models for Text Embeddings: embed-english-v3.0, embed-multilingual-v3.0, embed-english-light-v3.0, embed-multilingual-light-v3.0, embed-english-v2.0, embed-english-light-v2.0, embed-multilingual-v2.0 | -| `source_input_type` | `str` | `"search_document"` | The type of input data to be used for the source column. | -| `query_input_type` | `str` | `"search_query"` | The type of input data to be used for the query. | - -Cohere supports following input types: - -| Input Type | Description | -|-------------------------|---------------------------------------| -| "`search_document`" | Used for embeddings stored in a vector| -| | database for search use-cases. | -| "`search_query`" | Used for embeddings of search queries | -| | run against a vector DB | -| "`semantic_similarity`" | Specifies the given text will be used | -| | for Semantic Textual Similarity (STS) | -| "`classification`" | Used for embeddings passed through a | -| | text classifier. 
| -| "`clustering`" | Used for the embeddings run through a | -| | clustering algorithm | - -Usage Example: - -```python - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import EmbeddingFunctionRegistry - - cohere = EmbeddingFunctionRegistry - .get_instance() - .get("cohere") - .create(name="embed-multilingual-v2.0") - - class TextModel(LanceModel): - text: str = cohere.SourceField() - vector: Vector(cohere.ndims()) = cohere.VectorField() - - data = [ { "text": "hello world" }, - { "text": "goodbye world" }] - - db = lancedb.connect("~/.lancedb") - tbl = db.create_table("test", schema=TextModel, mode="overwrite") - - tbl.add(data) -``` \ No newline at end of file diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md deleted file mode 100644 index 551c8327..00000000 --- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md +++ /dev/null @@ -1,35 +0,0 @@ -# Gemini Embeddings -With Google's Gemini, you can represent text (words, sentences, and blocks of text) in a vectorized form, making it easier to compare and contrast embeddings. For example, two texts that share a similar subject matter or sentiment should have similar embeddings, which can be identified through mathematical comparison techniques such as cosine similarity. For more on how and why you should use embeddings, refer to the Embeddings guide. -The Gemini Embedding Model API supports various task types: - -| Task Type | Description | -|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------| -| "`retrieval_query`" | Specifies the given text is a query in a search/retrieval setting. 
| -| "`retrieval_document`" | Specifies the given text is a document in a search/retrieval setting. Using this task type requires a title but is automatically proided by Embeddings API | -| "`semantic_similarity`" | Specifies the given text will be used for Semantic Textual Similarity (STS). | -| "`classification`" | Specifies that the embeddings will be used for classification. | -| "`clusering`" | Specifies that the embeddings will be used for clustering. | - - -Usage Example: - -```python -import lancedb -import pandas as pd -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - - -model = get_registry().get("gemini-text").create() - -class TextModel(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - -df = pd.DataFrame({"text": ["hello world", "goodbye world"]}) -db = lancedb.connect("~/.lancedb") -tbl = db.create_table("test", schema=TextModel, mode="overwrite") - -tbl.add(df) -rs = tbl.search("hello").limit(1).to_pandas() -``` \ No newline at end of file diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md deleted file mode 100644 index eb0dfdea..00000000 --- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md +++ /dev/null @@ -1,24 +0,0 @@ -# Huggingface embedding models -We offer support for all Hugging Face models (which can be loaded via [transformers](https://huggingface.co/docs/transformers/en/index) library). The default model is `colbert-ir/colbertv2.0` which also has its own special callout - `registry.get("colbert")`. Some Hugging Face models might require custom models defined on the HuggingFace Hub in their own modeling files. You may enable this by setting `trust_remote_code=True`. 
This option should only be set to True for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine. - -Example usage - -```python -import lancedb -import pandas as pd - -from lancedb.embeddings import get_registry -from lancedb.pydantic import LanceModel, Vector - -model = get_registry().get("huggingface").create(name='facebook/bart-base') - -class Words(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - -df = pd.DataFrame({"text": ["hi hello sayonara", "goodbye world"]}) -table = db.create_table("greets", schema=Words) -table.add(df) -query = "old greeting" -actual = table.search(query).limit(1).to_pydantic(Words)[0] -print(actual.text) -``` \ No newline at end of file diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md deleted file mode 100644 index d98fdeef..00000000 --- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md +++ /dev/null @@ -1,75 +0,0 @@ -# IBM watsonx.ai Embeddings - -Generate text embeddings using IBM's watsonx.ai platform. - -## Supported Models - -You can find a list of supported models at [IBM watsonx.ai Documentation](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx). 
The currently supported model names are: - -- `ibm/slate-125m-english-rtrvr` -- `ibm/slate-30m-english-rtrvr` -- `sentence-transformers/all-minilm-l12-v2` -- `intfloat/multilingual-e5-large` - -## Parameters - -The following parameters can be passed to the `create` method: - -| Parameter | Type | Default Value | Description | -|------------|----------|----------------------------------|-----------------------------------------------------------| -| name | str | "ibm/slate-125m-english-rtrvr" | The model ID of the watsonx.ai model to use | -| api_key | str | None | Optional IBM Cloud API key (or set `WATSONX_API_KEY`) | -| project_id | str | None | Optional watsonx project ID (or set `WATSONX_PROJECT_ID`) | -| url | str | None | Optional custom URL for the watsonx.ai instance | -| params | dict | None | Optional additional parameters for the embedding model | - -## Usage Example - -First, the watsonx.ai library is an optional dependency, so must be installed seperately: - -``` -pip install ibm-watsonx-ai -``` - -Optionally set environment variables (if not passing credentials to `create` directly): - -```sh -export WATSONX_API_KEY="YOUR_WATSONX_API_KEY" -export WATSONX_PROJECT_ID="YOUR_WATSONX_PROJECT_ID" -``` - -```python -import os -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import EmbeddingFunctionRegistry - -watsonx_embed = EmbeddingFunctionRegistry - .get_instance() - .get("watsonx") - .create( - name="ibm/slate-125m-english-rtrvr", - # Uncomment and set these if not using environment variables - # api_key="your_api_key_here", - # project_id="your_project_id_here", - # url="your_watsonx_url_here", - # params={...}, - ) - -class TextModel(LanceModel): - text: str = watsonx_embed.SourceField() - vector: Vector(watsonx_embed.ndims()) = watsonx_embed.VectorField() - -data = [ - {"text": "hello world"}, - {"text": "goodbye world"}, -] - -db = lancedb.connect("~/.lancedb") -tbl = db.create_table("watsonx_test", 
schema=TextModel, mode="overwrite") - -tbl.add(data) - -rs = tbl.search("hello").limit(1).to_pandas() -print(rs) -``` \ No newline at end of file diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md deleted file mode 100644 index 30662f21..00000000 --- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md +++ /dev/null @@ -1,50 +0,0 @@ -# Instructor Embeddings -[Instructor](https://instructor-embedding.github.io/) is an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g. classification, retrieval, clustering, text evaluation, etc.) and domains (e.g. science, finance, etc.) by simply providing the task instruction, without any finetuning. - -If you want to calculate customized embeddings for specific sentences, you can follow the unified template to write instructions. - -!!! info - Represent the `domain` `text_type` for `task_objective`: - - * `domain` is optional, and it specifies the domain of the text, e.g. science, finance, medicine, etc. - * `text_type` is required, and it specifies the encoding unit, e.g. sentence, document, paragraph, etc. - * `task_objective` is optional, and it specifies the objective of embedding, e.g. retrieve a document, classify the sentence, etc. - -More information about the model can be found at the [source URL](https://github.com/xlang-ai/instructor-embedding). 
- -| Argument | Type | Default | Description | -|---|---|---|---| -| `name` | `str` | "hkunlp/instructor-base" | The name of the model to use | -| `batch_size` | `int` | `32` | The batch size to use when generating embeddings | -| `device` | `str` | `"cpu"` | The device to use when generating embeddings | -| `show_progress_bar` | `bool` | `True` | Whether to show a progress bar when generating embeddings | -| `normalize_embeddings` | `bool` | `True` | Whether to normalize the embeddings | -| `quantize` | `bool` | `False` | Whether to quantize the model | -| `source_instruction` | `str` | `"represent the docuement for retreival"` | The instruction for the source column | -| `query_instruction` | `str` | `"represent the document for retreiving the most similar documents"` | The instruction for the query | - - - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry, InstuctorEmbeddingFunction - -instructor = get_registry().get("instructor").create( - source_instruction="represent the docuement for retreival", - query_instruction="represent the document for retreiving the most similar documents" - ) - -class Schema(LanceModel): - vector: Vector(instructor.ndims()) = instructor.VectorField() - text: str = instructor.SourceField() - -db = lancedb.connect("~/.lancedb") -tbl = db.create_table("test", schema=Schema, mode="overwrite") - -texts = [{"text": "Capitalism has been dominant in the Western world since the end of feudalism, but most feel[who?] 
that..."}, - {"text": "The disparate impact theory is especially controversial under the Fair Housing Act because the Act..."}, - {"text": "Disparate impact in United States labor law refers to practices in employment, housing, and other areas that.."}] - -tbl.add(texts) -``` \ No newline at end of file diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md deleted file mode 100644 index dc194c5d..00000000 --- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md +++ /dev/null @@ -1,39 +0,0 @@ -# Jina Embeddings - -Jina embeddings are used to generate embeddings for text and image data. -You also need to set the `JINA_API_KEY` environment variable to use the Jina API. - -You can find a list of supported models under [https://jina.ai/embeddings/](https://jina.ai/embeddings/) - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"jina-clip-v1"` | The model ID of the jina model to use | - -Usage Example: - -```python - import os - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import EmbeddingFunctionRegistry - - os.environ['JINA_API_KEY'] = 'jina_*' - - jina_embed = EmbeddingFunctionRegistry.get_instance().get("jina").create(name="jina-embeddings-v2-base-en") - - - class TextModel(LanceModel): - text: str = jina_embed.SourceField() - vector: Vector(jina_embed.ndims()) = jina_embed.VectorField() - - - data = [{"text": "hello world"}, - {"text": "goodbye world"}] - - db = lancedb.connect("~/.lancedb-2") - tbl = db.create_table("test", schema=TextModel, mode="overwrite") - - tbl.add(data) -``` diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md 
b/docs/src/embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md deleted file mode 100644 index 3b8cfcce..00000000 --- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md +++ /dev/null @@ -1,37 +0,0 @@ -# Ollama embeddings - -Generate embeddings via the [ollama](https://github.com/ollama/ollama-python) python library. More details: - -- [Ollama docs on embeddings](https://github.com/ollama/ollama/blob/main/docs/api.md#generate-embeddings) -- [Ollama blog on embeddings](https://ollama.com/blog/embedding-models) - -| Parameter | Type | Default Value | Description | -|------------------------|----------------------------|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| -| `name` | `str` | `nomic-embed-text` | The name of the model. | -| `host` | `str` | `http://localhost:11434` | The Ollama host to connect to. | -| `options` | `ollama.Options` or `dict` | `None` | Additional model parameters listed in the documentation for the Modelfile such as `temperature`. | -| `keep_alive` | `float` or `str` | `"5m"` | Controls how long the model will stay loaded into memory following the request. | -| `ollama_client_kwargs` | `dict` | `{}` | kwargs that can be past to the `ollama.Client`. 
| - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect("/tmp/db") -func = get_registry().get("ollama").create(name="nomic-embed-text") - -class Words(LanceModel): - text: str = func.SourceField() - vector: Vector(func.ndims()) = func.VectorField() - -table = db.create_table("words", schema=Words, mode="overwrite") -table.add([ - {"text": "hello world"}, - {"text": "goodbye world"} -]) - -query = "greetings" -actual = table.search(query).limit(1).to_pydantic(Words)[0] -print(actual.text) -``` diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md deleted file mode 100644 index f08253c0..00000000 --- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md +++ /dev/null @@ -1,35 +0,0 @@ -# OpenAI embeddings - -LanceDB registers the OpenAI embeddings function in the registry by default, as `openai`. Below are the parameters that you can customize when creating the instances: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"text-embedding-ada-002"` | The name of the model. | -| `dim` | `int` | Model default | For OpenAI's newer text-embedding-3 model, we can specify a dimensionality that is smaller than the 1536 size. 
This feature supports it | -| `use_azure` | bool | `False` | Set true to use Azure OpenAPI SDK | - - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect("/tmp/db") -func = get_registry().get("openai").create(name="text-embedding-ada-002") - -class Words(LanceModel): - text: str = func.SourceField() - vector: Vector(func.ndims()) = func.VectorField() - -table = db.create_table("words", schema=Words, mode="overwrite") -table.add( - [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] - ) - -query = "greetings" -actual = table.search(query).limit(1).to_pydantic(Words)[0] -print(actual.text) -``` \ No newline at end of file diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md deleted file mode 100644 index 1adff158..00000000 --- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md +++ /dev/null @@ -1,174 +0,0 @@ -# Sentence transformers -Allows you to set parameters when registering a `sentence-transformers` object. - -!!! info - Sentence transformer embeddings are normalized by default. It is recommended to use normalized embeddings for similarity search. - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `all-MiniLM-L6-v2` | The name of the model | -| `device` | `str` | `cpu` | The device to run the model on (can be `cpu` or `gpu`) | -| `normalize` | `bool` | `True` | Whether to normalize the input text before feeding it to the model | -| `trust_remote_code` | `bool` | `False` | Whether to trust and execute remote code from the model's Huggingface repository | - - -??? "Check out available sentence-transformer models here!" 
- ```markdown - - sentence-transformers/all-MiniLM-L12-v2 - - sentence-transformers/paraphrase-mpnet-base-v2 - - sentence-transformers/gtr-t5-base - - sentence-transformers/LaBSE - - sentence-transformers/all-MiniLM-L6-v2 - - sentence-transformers/bert-base-nli-max-tokens - - sentence-transformers/bert-base-nli-mean-tokens - - sentence-transformers/bert-base-nli-stsb-mean-tokens - - sentence-transformers/bert-base-wikipedia-sections-mean-tokens - - sentence-transformers/bert-large-nli-cls-token - - sentence-transformers/bert-large-nli-max-tokens - - sentence-transformers/bert-large-nli-mean-tokens - - sentence-transformers/bert-large-nli-stsb-mean-tokens - - sentence-transformers/distilbert-base-nli-max-tokens - - sentence-transformers/distilbert-base-nli-mean-tokens - - sentence-transformers/distilbert-base-nli-stsb-mean-tokens - - sentence-transformers/distilroberta-base-msmarco-v1 - - sentence-transformers/distilroberta-base-msmarco-v2 - - sentence-transformers/nli-bert-base-cls-pooling - - sentence-transformers/nli-bert-base-max-pooling - - sentence-transformers/nli-bert-base - - sentence-transformers/nli-bert-large-cls-pooling - - sentence-transformers/nli-bert-large-max-pooling - - sentence-transformers/nli-bert-large - - sentence-transformers/nli-distilbert-base-max-pooling - - sentence-transformers/nli-distilbert-base - - sentence-transformers/nli-roberta-base - - sentence-transformers/nli-roberta-large - - sentence-transformers/roberta-base-nli-mean-tokens - - sentence-transformers/roberta-base-nli-stsb-mean-tokens - - sentence-transformers/roberta-large-nli-mean-tokens - - sentence-transformers/roberta-large-nli-stsb-mean-tokens - - sentence-transformers/stsb-bert-base - - sentence-transformers/stsb-bert-large - - sentence-transformers/stsb-distilbert-base - - sentence-transformers/stsb-roberta-base - - sentence-transformers/stsb-roberta-large - - sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens - - 
sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens - - sentence-transformers/xlm-r-base-en-ko-nli-ststb - - sentence-transformers/xlm-r-bert-base-nli-mean-tokens - - sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens - - sentence-transformers/xlm-r-large-en-ko-nli-ststb - - sentence-transformers/bert-base-nli-cls-token - - sentence-transformers/all-distilroberta-v1 - - sentence-transformers/multi-qa-MiniLM-L6-dot-v1 - - sentence-transformers/multi-qa-distilbert-cos-v1 - - sentence-transformers/multi-qa-distilbert-dot-v1 - - sentence-transformers/multi-qa-mpnet-base-cos-v1 - - sentence-transformers/multi-qa-mpnet-base-dot-v1 - - sentence-transformers/nli-distilroberta-base-v2 - - sentence-transformers/all-MiniLM-L6-v1 - - sentence-transformers/all-mpnet-base-v1 - - sentence-transformers/all-mpnet-base-v2 - - sentence-transformers/all-roberta-large-v1 - - sentence-transformers/allenai-specter - - sentence-transformers/average_word_embeddings_glove.6B.300d - - sentence-transformers/average_word_embeddings_glove.840B.300d - - sentence-transformers/average_word_embeddings_komninos - - sentence-transformers/average_word_embeddings_levy_dependency - - sentence-transformers/clip-ViT-B-32-multilingual-v1 - - sentence-transformers/clip-ViT-B-32 - - sentence-transformers/distilbert-base-nli-stsb-quora-ranking - - sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking - - sentence-transformers/distilroberta-base-paraphrase-v1 - - sentence-transformers/distiluse-base-multilingual-cased-v1 - - sentence-transformers/distiluse-base-multilingual-cased-v2 - - sentence-transformers/distiluse-base-multilingual-cased - - sentence-transformers/facebook-dpr-ctx_encoder-multiset-base - - sentence-transformers/facebook-dpr-ctx_encoder-single-nq-base - - sentence-transformers/facebook-dpr-question_encoder-multiset-base - - sentence-transformers/facebook-dpr-question_encoder-single-nq-base - - sentence-transformers/gtr-t5-large - - 
sentence-transformers/gtr-t5-xl - - sentence-transformers/gtr-t5-xxl - - sentence-transformers/msmarco-MiniLM-L-12-v3 - - sentence-transformers/msmarco-MiniLM-L-6-v3 - - sentence-transformers/msmarco-MiniLM-L12-cos-v5 - - sentence-transformers/msmarco-MiniLM-L6-cos-v5 - - sentence-transformers/msmarco-bert-base-dot-v5 - - sentence-transformers/msmarco-bert-co-condensor - - sentence-transformers/msmarco-distilbert-base-dot-prod-v3 - - sentence-transformers/msmarco-distilbert-base-tas-b - - sentence-transformers/msmarco-distilbert-base-v2 - - sentence-transformers/msmarco-distilbert-base-v3 - - sentence-transformers/msmarco-distilbert-base-v4 - - sentence-transformers/msmarco-distilbert-cos-v5 - - sentence-transformers/msmarco-distilbert-dot-v5 - - sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-lng-aligned - - sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-trained-scratch - - sentence-transformers/msmarco-distilroberta-base-v2 - - sentence-transformers/msmarco-roberta-base-ance-firstp - - sentence-transformers/msmarco-roberta-base-v2 - - sentence-transformers/msmarco-roberta-base-v3 - - sentence-transformers/multi-qa-MiniLM-L6-cos-v1 - - sentence-transformers/nli-mpnet-base-v2 - - sentence-transformers/nli-roberta-base-v2 - - sentence-transformers/nq-distilbert-base-v1 - - sentence-transformers/paraphrase-MiniLM-L12-v2 - - sentence-transformers/paraphrase-MiniLM-L3-v2 - - sentence-transformers/paraphrase-MiniLM-L6-v2 - - sentence-transformers/paraphrase-TinyBERT-L6-v2 - - sentence-transformers/paraphrase-albert-base-v2 - - sentence-transformers/paraphrase-albert-small-v2 - - sentence-transformers/paraphrase-distilroberta-base-v1 - - sentence-transformers/paraphrase-distilroberta-base-v2 - - sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 - - sentence-transformers/paraphrase-multilingual-mpnet-base-v2 - - sentence-transformers/paraphrase-xlm-r-multilingual-v1 - - sentence-transformers/quora-distilbert-base - - 
sentence-transformers/quora-distilbert-multilingual - - sentence-transformers/sentence-t5-base - - sentence-transformers/sentence-t5-large - - sentence-transformers/sentence-t5-xxl - - sentence-transformers/sentence-t5-xl - - sentence-transformers/stsb-distilroberta-base-v2 - - sentence-transformers/stsb-mpnet-base-v2 - - sentence-transformers/stsb-roberta-base-v2 - - sentence-transformers/stsb-xlm-r-multilingual - - sentence-transformers/xlm-r-distilroberta-base-paraphrase-v1 - - sentence-transformers/clip-ViT-L-14 - - sentence-transformers/clip-ViT-B-16 - - sentence-transformers/use-cmlm-multilingual - - sentence-transformers/all-MiniLM-L12-v1 - ``` - -!!! info - You can also load many other model architectures from the library. For example models from sources such as BAAI, nomic, salesforce research, etc. - See this HF hub page for all [supported models](https://huggingface.co/models?library=sentence-transformers). - -!!! note "BAAI Embeddings example" - Here is an example that uses BAAI embedding model from the HuggingFace Hub [supported models](https://huggingface.co/models?library=sentence-transformers) - ```python - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import get_registry - - db = lancedb.connect("/tmp/db") - model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") - - class Words(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - - table = db.create_table("words", schema=Words) - table.add( - [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] - ) - - query = "greetings" - actual = table.search(query).limit(1).to_pydantic(Words)[0] - print(actual.text) - ``` -Visit sentence-transformers [HuggingFace HUB](https://huggingface.co/sentence-transformers) page for more information on the available models. 
- diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md deleted file mode 100644 index beb0b7f7..00000000 --- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md +++ /dev/null @@ -1,51 +0,0 @@ -# VoyageAI Embeddings - -Voyage AI provides cutting-edge embedding and rerankers. - - -Using voyageai API requires voyageai package, which can be installed using `pip install voyageai`. Voyage AI embeddings are used to generate embeddings for text data. The embeddings can be used for various tasks like semantic search, clustering, and classification. -You also need to set the `VOYAGE_API_KEY` environment variable to use the VoyageAI API. - -Supported models are: - -- voyage-3 -- voyage-3-lite -- voyage-finance-2 -- voyage-multilingual-2 -- voyage-law-2 -- voyage-code-2 - - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|--------|---------| -| `name` | `str` | `None` | The model ID of the model to use. Supported base models for Text Embeddings: voyage-3, voyage-3-lite, voyage-finance-2, voyage-multilingual-2, voyage-law-2, voyage-code-2 | -| `input_type` | `str` | `None` | Type of the input text. Default to None. Other options: query, document. | -| `truncation` | `bool` | `True` | Whether to truncate the input texts to fit within the context length. 
| - - -Usage Example: - -```python - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import EmbeddingFunctionRegistry - - voyageai = EmbeddingFunctionRegistry - .get_instance() - .get("voyageai") - .create(name="voyage-3") - - class TextModel(LanceModel): - text: str = voyageai.SourceField() - vector: Vector(voyageai.ndims()) = voyageai.VectorField() - - data = [ { "text": "hello world" }, - { "text": "goodbye world" }] - - db = lancedb.connect("~/.lancedb") - tbl = db.create_table("test", schema=TextModel, mode="overwrite") - - tbl.add(data) -``` \ No newline at end of file diff --git a/docs/src/embeddings/custom_embedding_function.md b/docs/src/embeddings/custom_embedding_function.md deleted file mode 100644 index 655c6904..00000000 --- a/docs/src/embeddings/custom_embedding_function.md +++ /dev/null @@ -1,248 +0,0 @@ -To use your own custom embedding function, you can follow these 2 simple steps: - -1. Create your embedding function by implementing the `EmbeddingFunction` interface -2. Register your embedding function in the global `EmbeddingFunctionRegistry`. - -Let us see how this looks like in action. - -![](../assets/embeddings_api.png) - -`EmbeddingFunction` and `EmbeddingFunctionRegistry` handle low-level details for serializing schema and model information as metadata. To build a custom embedding function, you don't have to worry about the finer details - simply focus on setting up the model and leave the rest to LanceDB. - -## `TextEmbeddingFunction` interface - -There is another optional layer of abstraction available: `TextEmbeddingFunction`. You can use this abstraction if your model isn't multi-modal in nature and only needs to operate on text. In such cases, both the source and vector fields will have the same work for vectorization, so you simply just need to setup the model and rest is handled by `TextEmbeddingFunction`. You can read more about the class and its attributes in the class reference. 
- -Let's implement `SentenceTransformerEmbeddings` class. All you need to do is implement the `generate_embeddings()` and `ndims` function to handle the input types you expect and register the class in the global `EmbeddingFunctionRegistry` - - -=== "Python" - - ```python - from lancedb.embeddings import register - from lancedb.util import attempt_import_or_raise - - @register("sentence-transformers") - class SentenceTransformerEmbeddings(TextEmbeddingFunction): - name: str = "all-MiniLM-L6-v2" - # set more default instance vars like device, etc. - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._ndims = None - - def generate_embeddings(self, texts): - return self._embedding_model().encode(list(texts), ...).tolist() - - def ndims(self): - if self._ndims is None: - self._ndims = len(self.generate_embeddings("foo")[0]) - return self._ndims - - @cached(cache={}) - def _embedding_model(self): - return sentence_transformers.SentenceTransformer(name) - ``` - -=== "TypeScript" - - ```ts - --8<--- "nodejs/examples/custom_embedding_function.test.ts:imports" - - --8<--- "nodejs/examples/custom_embedding_function.test.ts:embedding_impl" - ``` - - -This is a stripped down version of our implementation of `SentenceTransformerEmbeddings` that removes certain optimizations and default settings. - -!!! danger "Use sensitive keys to prevent leaking secrets" - To prevent leaking secrets, such as API keys, you should add any sensitive - parameters of an embedding function to the output of the - [sensitive_keys()][lancedb.embeddings.base.EmbeddingFunction.sensitive_keys] / - [getSensitiveKeys()](../../js/namespaces/embedding/classes/EmbeddingFunction/#getsensitivekeys) - method. This prevents users from accidentally instantiating the embedding - function with hard-coded secrets. - -Now you can use this embedding function to create your table schema and that's it! you can then ingest data and run queries without manually vectorizing the inputs. 
- -=== "Python" - - ```python - from lancedb.pydantic import LanceModel, Vector - - registry = EmbeddingFunctionRegistry.get_instance() - stransformer = registry.get("sentence-transformers").create() - - class TextModelSchema(LanceModel): - vector: Vector(stransformer.ndims()) = stransformer.VectorField() - text: str = stransformer.SourceField() - - tbl = db.create_table("table", schema=TextModelSchema) - - tbl.add(pd.DataFrame({"text": ["halo", "world"]})) - result = tbl.search("world").limit(5) - ``` - -=== "TypeScript" - - ```ts - --8<--- "nodejs/examples/custom_embedding_function.test.ts:call_custom_function" - ``` - -!!! note - - You can always implement the `EmbeddingFunction` interface directly if you want or need to; `TextEmbeddingFunction` just makes it much simpler and faster for you to do so, by setting up the boilerplate for text-specific use cases. - -## Multi-modal embedding function example -You can also use the `EmbeddingFunction` interface to implement more complex workflows such as multi-modal embedding function support. - -=== "Python" - - LanceDB implements the `OpenClipEmbeddings` class, which supports multi-modal search. Here's the implementation that you can use as a reference to build your own multi-modal embedding functions.
- - ```python - @register("open-clip") - class OpenClipEmbeddings(EmbeddingFunction): - name: str = "ViT-B-32" - pretrained: str = "laion2b_s34b_b79k" - device: str = "cpu" - batch_size: int = 64 - normalize: bool = True - _model = PrivateAttr() - _preprocess = PrivateAttr() - _tokenizer = PrivateAttr() - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - open_clip = attempt_import_or_raise("open_clip", "open-clip") # EmbeddingFunction util to import external libs and raise if not found - model, _, preprocess = open_clip.create_model_and_transforms( - self.name, pretrained=self.pretrained - ) - model.to(self.device) - self._model, self._preprocess = model, preprocess - self._tokenizer = open_clip.get_tokenizer(self.name) - self._ndims = None - - def ndims(self): - if self._ndims is None: - self._ndims = self.generate_text_embeddings("foo").shape[0] - return self._ndims - - def compute_query_embeddings( - self, query: Union[str, "PIL.Image.Image"], *args, **kwargs - ) -> List[np.ndarray]: - """ - Compute the embeddings for a given user query - - Parameters - ---------- - query : Union[str, PIL.Image.Image] - The query to embed. A query can be either text or an image. 
- """ - if isinstance(query, str): - return [self.generate_text_embeddings(query)] - else: - PIL = attempt_import_or_raise("PIL", "pillow") - if isinstance(query, PIL.Image.Image): - return [self.generate_image_embedding(query)] - else: - raise TypeError("OpenClip supports str or PIL Image as query") - - def generate_text_embeddings(self, text: str) -> np.ndarray: - torch = attempt_import_or_raise("torch") - text = self.sanitize_input(text) - text = self._tokenizer(text) - text.to(self.device) - with torch.no_grad(): - text_features = self._model.encode_text(text.to(self.device)) - if self.normalize: - text_features /= text_features.norm(dim=-1, keepdim=True) - return text_features.cpu().numpy().squeeze() - - def sanitize_input(self, images: IMAGES) -> Union[List[bytes], np.ndarray]: - """ - Sanitize the input to the embedding function. - """ - if isinstance(images, (str, bytes)): - images = [images] - elif isinstance(images, pa.Array): - images = images.to_pylist() - elif isinstance(images, pa.ChunkedArray): - images = images.combine_chunks().to_pylist() - return images - - def compute_source_embeddings( - self, images: IMAGES, *args, **kwargs - ) -> List[np.array]: - """ - Get the embeddings for the given images - """ - images = self.sanitize_input(images) - embeddings = [] - for i in range(0, len(images), self.batch_size): - j = min(i + self.batch_size, len(images)) - batch = images[i:j] - embeddings.extend(self._parallel_get(batch)) - return embeddings - - def _parallel_get(self, images: Union[List[str], List[bytes]]) -> List[np.ndarray]: - """ - Issue concurrent requests to retrieve the image data - """ - with concurrent.futures.ThreadPoolExecutor() as executor: - futures = [ - executor.submit(self.generate_image_embedding, image) - for image in images - ] - return [future.result() for future in futures] - - def generate_image_embedding( - self, image: Union[str, bytes, "PIL.Image.Image"] - ) -> np.ndarray: - """ - Generate the embedding for a single image - - 
Parameters - ---------- - image : Union[str, bytes, PIL.Image.Image] - The image to embed. If the image is a str, it is treated as a uri. - If the image is bytes, it is treated as the raw image bytes. - """ - torch = attempt_import_or_raise("torch") - # TODO handle retry and errors for https - image = self._to_pil(image) - image = self._preprocess(image).unsqueeze(0) - with torch.no_grad(): - return self._encode_and_normalize_image(image) - - def _to_pil(self, image: Union[str, bytes]): - PIL = attempt_import_or_raise("PIL", "pillow") - if isinstance(image, bytes): - return PIL.Image.open(io.BytesIO(image)) - if isinstance(image, PIL.Image.Image): - return image - elif isinstance(image, str): - parsed = urlparse.urlparse(image) - # TODO handle drive letter on windows. - if parsed.scheme == "file": - return PIL.Image.open(parsed.path) - elif parsed.scheme == "": - return PIL.Image.open(image if os.name == "nt" else parsed.path) - elif parsed.scheme.startswith("http"): - return PIL.Image.open(io.BytesIO(url_retrieve(image))) - else: - raise NotImplementedError("Only local and http(s) urls are supported") - - def _encode_and_normalize_image(self, image_tensor: "torch.Tensor"): - """ - encode a single image tensor and optionally normalize the output - """ - image_features = self._model.encode_image(image_tensor) - if self.normalize: - image_features /= image_features.norm(dim=-1, keepdim=True) - return image_features.cpu().numpy().squeeze() - ``` - -=== "TypeScript" - - Coming Soon! See this [issue](https://github.com/lancedb/lancedb/issues/1482) to track the status! diff --git a/docs/src/embeddings/default_embedding_functions.md b/docs/src/embeddings/default_embedding_functions.md deleted file mode 100644 index 5d99ec7e..00000000 --- a/docs/src/embeddings/default_embedding_functions.md +++ /dev/null @@ -1,86 +0,0 @@ -# 📚 Available Embedding Models - -There are various embedding functions available out of the box with LanceDB to manage your embeddings implicitly. 
We're actively working on adding other popular embedding APIs and models. 🚀 - -Before jumping on the list of available models, let's understand how to get an embedding model initialized and configured to use in our code: - -!!! example "Example usage" - ```python - model = get_registry() - .get("openai") - .create(name="text-embedding-ada-002") - ``` - -Now let's understand the above syntax: -```python -model = get_registry().get("model_id").create(...params) -``` -**This👆 line effectively creates a configured instance of an `embedding function` with `model` of choice that is ready for use.** - -- `get_registry()` : This function call returns an instance of a `EmbeddingFunctionRegistry` object. This registry manages the registration and retrieval of embedding functions. - -- `.get("model_id")` : This method call on the registry object and retrieves the **embedding models functions** associated with the `"model_id"` (1) . - { .annotate } - - 1. Hover over the names in table below to find out the `model_id` of different embedding functions. - -- `.create(...params)` : This method call is on the object returned by the `get` method. It instantiates an embedding model function using the **specified parameters**. - -??? question "What parameters does the `.create(...params)` method accepts?" - **Checkout the documentation of specific embedding models (links in the table below👇) to know what parameters it takes**. - -!!! tip "Moving on" - Now that we know how to get the **desired embedding model** and use it in our code, let's explore the comprehensive **list** of embedding models **supported by LanceDB**, in the tables below. - -## Text Embedding Functions 📝 -These functions are registered by default to handle text embeddings. - -- 🔄 **Embedding functions** have an inbuilt rate limit handler wrapper for source and query embedding function calls that retry with **exponential backoff**. 
- -- 🌕 Each `EmbeddingFunction` implementation automatically takes `max_retries` as an argument which has the default value of 7. - -🌟 **Available Text Embeddings** - -| **Embedding** :material-information-outline:{ title="Hover over the name to find out the model_id" } | **Description** | **Documentation** | -|-----------|-------------|---------------| -| [**Sentence Transformers**](available_embedding_models/text_embedding_functions/sentence_transformers.md "sentence-transformers") | 🧠 **SentenceTransformers** is a Python framework for state-of-the-art sentence, text, and image embeddings. | [Sentence Transformers Icon](available_embedding_models/text_embedding_functions/sentence_transformers.md)| -| [**Huggingface Models**](available_embedding_models/text_embedding_functions/huggingface_embedding.md "huggingface") |🤗 We offer support for all **Huggingface** models. The default model is `colbert-ir/colbertv2.0`. | [Huggingface Icon](available_embedding_models/text_embedding_functions/huggingface_embedding.md) | -| [**Ollama Embeddings**](available_embedding_models/text_embedding_functions/ollama_embedding.md "ollama") | 🔍 Generate embeddings via the **Ollama** python library. Ollama supports embedding models, making it possible to build RAG apps. | [Ollama Icon](available_embedding_models/text_embedding_functions/ollama_embedding.md)| -| [**OpenAI Embeddings**](available_embedding_models/text_embedding_functions/openai_embedding.md "openai")| 🔑 **OpenAI’s** text embeddings measure the relatedness of text strings. **LanceDB** supports state-of-the-art embeddings from OpenAI. 
| [OpenAI Icon](available_embedding_models/text_embedding_functions/openai_embedding.md)| -| [**Instructor Embeddings**](available_embedding_models/text_embedding_functions/instructor_embedding.md "instructor") | 📚 **Instructor**: An instruction-finetuned text embedding model that can generate text embeddings tailored to any task and domains by simply providing the task instruction, without any finetuning. | [Instructor Embedding Icon](available_embedding_models/text_embedding_functions/instructor_embedding.md) | -| [**Gemini Embeddings**](available_embedding_models/text_embedding_functions/gemini_embedding.md "gemini-text") | 🌌 Google’s Gemini API generates state-of-the-art embeddings for words, phrases, and sentences. | [Gemini Icon](available_embedding_models/text_embedding_functions/gemini_embedding.md) | -| [**Cohere Embeddings**](available_embedding_models/text_embedding_functions/cohere_embedding.md "cohere") | 💬 This will help you get started with **Cohere** embedding models using LanceDB. Using cohere API requires cohere package. Install it via `pip`. | [Cohere Icon](available_embedding_models/text_embedding_functions/cohere_embedding.md) | -| [**Jina Embeddings**](available_embedding_models/text_embedding_functions/jina_embedding.md "jina") | 🔗 World-class embedding models to improve your search and RAG systems. You will need **jina api key**. | [Jina Icon](available_embedding_models/text_embedding_functions/jina_embedding.md) | -| [ **AWS Bedrock Functions**](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md "bedrock-text") | ☁️ AWS Bedrock supports multiple base models for generating text embeddings. You need to setup the AWS credentials to use this embedding function. 
| [AWS Bedrock Icon](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md) | -| [**IBM Watsonx.ai**](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md "watsonx") | 💡 Generate text embeddings using IBM's watsonx.ai platform. **Note**: watsonx.ai library is an optional dependency. | [Watsonx Icon](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md) | -| [**VoyageAI Embeddings**](available_embedding_models/text_embedding_functions/voyageai_embedding.md "voyageai") | 🌕 Voyage AI provides cutting-edge embedding and rerankers. This will help you get started with **VoyageAI** embedding models using LanceDB. Using voyageai API requires voyageai package. Install it via `pip`. | [VoyageAI Icon](available_embedding_models/text_embedding_functions/voyageai_embedding.md) | - - - -[st-key]: "sentence-transformers" -[hf-key]: "huggingface" -[ollama-key]: "ollama" -[openai-key]: "openai" -[instructor-key]: "instructor" -[gemini-key]: "gemini-text" -[cohere-key]: "cohere" -[jina-key]: "jina" -[aws-key]: "bedrock-text" -[watsonx-key]: "watsonx" -[voyageai-key]: "voyageai" - - -## Multi-modal Embedding Functions🖼️ - -Multi-modal embedding functions allow you to query your table using both images and text. 💬🖼️ - -🌐 **Available Multi-modal Embeddings** - -| Embedding :material-information-outline:{ title="Hover over the name to find out the model_id" } | Description | Documentation | -|-----------|-------------|---------------| -| [**OpenClip Embeddings**](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md "open-clip") | 🎨 We support CLIP model embeddings using the open source alternative, **open-clip** which supports various customizations. 
| [openclip Icon](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md) | -| [**Imagebind Embeddings**](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md "imagebind") | 🌌 We have support for **imagebind model embeddings**. You can download our version of the packaged model via - `pip install imagebind-packaged==0.1.2`. | [imagebind Icon](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md)| -| [**Jina Multi-modal Embeddings**](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md "jina") | 🔗 **Jina embeddings** can also be used to embed both **text** and **image** data, only some of the models support image data and you can check the detailed documentation. 👉 | [jina Icon](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md) | - -!!! note - If you'd like to request support for additional **embedding functions**, please feel free to open an issue on our LanceDB [GitHub issue page](https://github.com/lancedb/lancedb/issues). \ No newline at end of file diff --git a/docs/src/embeddings/embedding_functions.md b/docs/src/embeddings/embedding_functions.md deleted file mode 100644 index de9d29e7..00000000 --- a/docs/src/embeddings/embedding_functions.md +++ /dev/null @@ -1,206 +0,0 @@ -Representing multi-modal data as vector embeddings is becoming a standard practice. Embedding functions can themselves be thought of as a key part of the data processing pipeline that each request has to be passed through. The assumption here is: after initial setup, these components and the underlying methodology are not expected to change for a particular project. - -For this purpose, LanceDB introduces an **embedding functions API** that allows you to set them up just once, during the configuration stage of your project.
After this, the table remembers it, effectively making the embedding functions *disappear in the background* so you don't have to worry about manually passing callables, and instead, simply focus on the rest of your data engineering pipeline. - -!!! Note "Embedding functions on LanceDB cloud" - When using embedding functions with LanceDB cloud, the embeddings will be generated on the source device and sent to the cloud. This means that the source device must have the necessary resources to generate the embeddings. - -!!! warning - Using the embedding function registry means that you don't have to explicitly generate the embeddings yourself. - However, if your embedding function changes, you'll have to re-configure your table with the new embedding function - and regenerate the embeddings. In the future, we plan to support the ability to change the embedding function via - table metadata and have LanceDB automatically take care of regenerating the embeddings. - - -## 1. Define the embedding function - -=== "Python" - In the LanceDB python SDK, we define a global embedding function registry with - many different embedding models and even more coming soon. - Let's look at an implementation of CLIP as an example. - - ```python - from lancedb.embeddings import get_registry - - registry = get_registry() - clip = registry.get("open-clip").create() - ``` - - You can also define your own embedding function by implementing the `EmbeddingFunction` - abstract base interface. It subclasses Pydantic Model which can be utilized to write complex schemas simply as we'll see next! - -=== "TypeScript" - In the TypeScript SDK, the choices are more limited. For now, only the OpenAI - embedding function is available. - - ```javascript - import * as lancedb from '@lancedb/lancedb' - import { getRegistry } from '@lancedb/lancedb/embeddings' - - // You need to provide an OpenAI API key - const apiKey = "sk-..."
- // The embedding function will create embeddings for the 'text' column - const func = getRegistry().get("openai").create({apiKey}) - ``` -=== "Rust" - In the Rust SDK, the choices are more limited. For now, only the OpenAI - embedding function is available. But unlike the Python and TypeScript SDKs, you need to manually register the OpenAI embedding function. - - ```toml - # Make sure to include the `openai` feature - [dependencies] - lancedb = {version = "*", features = ["openai"]} - ``` - - ```rust - --8<-- "rust/lancedb/examples/openai.rs:imports" - --8<-- "rust/lancedb/examples/openai.rs:openai_embeddings" - ``` - -## 2. Define the data model or schema - -=== "Python" - The embedding function defined above abstracts away all the details about the models and dimensions required to define the schema. You can simply set a field as **source** or **vector** column. Here's how: - - ```python - class Pets(LanceModel): - vector: Vector(clip.ndims()) = clip.VectorField() - image_uri: str = clip.SourceField() - ``` - - `VectorField` tells LanceDB to use the clip embedding function to generate query embeddings for the `vector` column and `SourceField` ensures that when adding data, we automatically use the specified embedding function to encode `image_uri`. - -=== "TypeScript" - - For the TypeScript SDK, a schema can be inferred from input data, or an explicit - Arrow schema can be provided. - -## 3.
Create table and add data - -Now that we have chosen/defined our embedding function and the schema, -we can create the table and ingest data without needing to explicitly generate -the embeddings at all: - -=== "Python" - ```python - db = lancedb.connect("~/lancedb") - table = db.create_table("pets", schema=Pets) - - table.add([{"image_uri": u} for u in uris]) - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - --8<-- "nodejs/examples/embedding.test.ts:imports" - --8<-- "nodejs/examples/embedding.test.ts:embedding_function" - ``` - - === "vectordb (deprecated)" - - ```ts - const db = await lancedb.connect("data/sample-lancedb"); - const data = [ - { text: "pepperoni"}, - { text: "pineapple"} - ] - - const table = await db.createTable("vectors", data, embedding) - ``` - -## 4. Querying your table -Not only can you forget about the embeddings during ingestion, you also don't -need to worry about them when you query the table: - -=== "Python" - - Our OpenCLIP query embedding function supports querying via both text and images: - - ```python - results = ( - table.search("dog") - .limit(10) - .to_pandas() - ) - ``` - - Or we can search using an image: - - ```python - p = Path("path/to/images/samoyed_100.jpg") - query_image = Image.open(p) - results = ( - table.search(query_image) - .limit(10) - .to_pandas() - ) - ``` - - Both of the above snippets return a pandas DataFrame with the 10 closest vectors to the query. - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - const results = await table.search("What's the best pizza topping?") - .limit(10) - .toArray() - ``` - - === "vectordb (deprecated)" - - ```ts - const results = await table - .search("What's the best pizza topping?") - .limit(10) - .execute() - ``` - - The above snippet returns an array of records with the top 10 nearest neighbors to the query.
- ---- - -## Rate limit Handling -`EmbeddingFunction` class wraps the calls for source and query embedding generation inside a rate limit handler that retries the requests with exponential backoff after successive failures. By default, the maximum number of retries is set to 7. You can tune it by setting it to a different number, or disable it by setting it to 0. - -An example of how to do this is shown below: - -```python -clip = registry.get("open-clip").create() # Defaults to 7 max retries -clip = registry.get("open-clip").create(max_retries=10) # Increase max retries to 10 -clip = registry.get("open-clip").create(max_retries=0) # Retries disabled -``` - -!!! note - Embedding functions can also fail due to other errors that have nothing to do with rate limits. - This is why the error is also logged. - -## Some fun with Pydantic - -LanceDB is integrated with Pydantic, which was used in the example above to define the schema in Python. It's also used behind the scenes by the embedding function API to ingest useful information as table metadata. - -You can also use the integration for adding utility operations in the schema. For example, in our multi-modal example, you can search images using text or another image. Let's define a utility function to plot the image. - -```python -class Pets(LanceModel): - vector: Vector(clip.ndims()) = clip.VectorField() - image_uri: str = clip.SourceField() - - @property - def image(self): - return Image.open(self.image_uri) -``` -Now, you can convert your search results to a Pydantic model and use this property. - -```python -rs = table.search(query_image).limit(3).to_pydantic(Pets) -rs[2].image -``` - -![](../assets/dog_clip_output.png) - -Now that you have the basic idea about LanceDB embedding functions and the embedding function registry, -let's dive deeper into defining your own [custom functions](./custom_embedding_function.md).
diff --git a/docs/src/embeddings/index.md b/docs/src/embeddings/index.md deleted file mode 100644 index 28185491..00000000 --- a/docs/src/embeddings/index.md +++ /dev/null @@ -1,132 +0,0 @@ -Due to the nature of vector embeddings, they can be used to represent any kind of data, from text to images to audio. -This makes them a very powerful tool for machine learning practitioners. -However, there's no one-size-fits-all solution for generating embeddings - there are many different libraries and APIs -(both commercial and open source) that can be used to generate embeddings from structured/unstructured data. - -LanceDB supports 3 methods of working with embeddings. - -1. You can manually generate embeddings for the data and queries. This is done outside of LanceDB. -2. You can use the built-in [embedding functions](./embedding_functions.md) to embed the data and queries in the background. -3. You can define your own [custom embedding function](./custom_embedding_function.md) - that extends the default embedding functions. - -For python users, there is also a legacy [with_embeddings API](./legacy.md). -It is retained for compatibility and will be removed in a future version. - -## Quickstart - -To get started with embeddings, you can use the built-in embedding functions. - -### OpenAI Embedding function - -LanceDB registers the OpenAI embeddings function in the registry as `openai`. You can pass any supported model name to the `create`. By default it uses `"text-embedding-ada-002"`. 
- -=== "Python" - - ```python - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import get_registry - - db = lancedb.connect("/tmp/db") - func = get_registry().get("openai").create(name="text-embedding-ada-002") - - class Words(LanceModel): - text: str = func.SourceField() - vector: Vector(func.ndims()) = func.VectorField() - - table = db.create_table("words", schema=Words, mode="overwrite") - table.add( - [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] - ) - - query = "greetings" - actual = table.search(query).limit(1).to_pydantic(Words)[0] - print(actual.text) - ``` - -=== "TypeScript" - - ```typescript - --8<--- "nodejs/examples/embedding.test.ts:imports" - --8<--- "nodejs/examples/embedding.test.ts:openai_embeddings" - ``` - -=== "Rust" - - ```rust - --8<--- "rust/lancedb/examples/openai.rs:imports" - --8<--- "rust/lancedb/examples/openai.rs:openai_embeddings" - ``` - -### Sentence Transformers Embedding function -LanceDB registers the Sentence Transformers embeddings function in the registry as `sentence-transformers`. You can pass any supported model name to the `create`. By default it uses `"sentence-transformers/paraphrase-MiniLM-L6-v2"`. - -=== "Python" - ```python - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import get_registry - - db = lancedb.connect("/tmp/db") - model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") - - class Words(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - - table = db.create_table("words", schema=Words) - table.add( - [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] - ) - - query = "greetings" - actual = table.search(query).limit(1).to_pydantic(Words)[0] - print(actual.text) - ``` - -=== "TypeScript" - - Coming Soon! - -=== "Rust" - - Coming Soon! 
- -### Embedding function with LanceDB cloud -Embedding functions are now supported on LanceDB cloud. The embeddings will be generated on the source device and sent to the cloud. This means that the source device must have the necessary resources to generate the embeddings. Here's an example using the OpenAI embedding function: - -```python -import os -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry -os.environ['OPENAI_API_KEY'] = "..." - -db = lancedb.connect( - uri="db://....", - api_key="sk_...", - region="us-east-1" -) -func = get_registry().get("openai").create() - -class Words(LanceModel): - text: str = func.SourceField() - vector: Vector(func.ndims()) = func.VectorField() - -table = db.create_table("words", schema=Words) -table.add([ - {"text": "hello world"}, - {"text": "goodbye world"} -]) - -query = "greetings" -actual = table.search(query).limit(1).to_pydantic(Words)[0] -print(actual.text) -``` diff --git a/docs/src/embeddings/legacy.md b/docs/src/embeddings/legacy.md deleted file mode 100644 index a22ab0f4..00000000 --- a/docs/src/embeddings/legacy.md +++ /dev/null @@ -1,99 +0,0 @@ -The legacy `with_embeddings` API is for Python only and is deprecated. - -### Hugging Face - -The most popular open source option is to use the [sentence-transformers](https://www.sbert.net/) -library, which can be installed via pip. - -```bash -pip install sentence-transformers -``` - -The example below shows how to use the `paraphrase-albert-small-v2` model to generate embeddings -for a given document. 
- -```python -from sentence_transformers import SentenceTransformer - -name="paraphrase-albert-small-v2" -model = SentenceTransformer(name) - -# used for both training and querying -def embed_func(batch): - return [model.encode(sentence) for sentence in batch] -``` - - -### OpenAI - -Another popular alternative is to use an external API like OpenAI's [embeddings API](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings). - -```python -import openai -import os - -# Configuring the environment variable OPENAI_API_KEY -if "OPENAI_API_KEY" not in os.environ: -# OR set the key here as a variable -openai.api_key = "sk-..." - -client = openai.OpenAI() - -def embed_func(c): - rs = client.embeddings.create(input=c, model="text-embedding-ada-002") - return [record.embedding for record in rs["data"]] -``` - - -## Applying an embedding function to data - -Using an embedding function, you can apply it to raw data -to generate embeddings for each record. - -Say you have a pandas DataFrame with a `text` column that you want embedded, -you can use the `with_embeddings` function to generate embeddings and add them to -an existing table. - -```python - import pandas as pd - from lancedb.embeddings import with_embeddings - - df = pd.DataFrame( - [ - {"text": "pepperoni"}, - {"text": "pineapple"} - ] - ) - data = with_embeddings(embed_func, df) - - # The output is used to create / append to a table - tbl = db.create_table("my_table", data=data) -``` - -If your data is in a different column, you can specify the `column` kwarg to `with_embeddings`. - -By default, LanceDB calls the function with batches of 1000 rows. This can be configured -using the `batch_size` parameter to `with_embeddings`. - -LanceDB automatically wraps the function with retry and rate-limit logic to ensure the OpenAI -API call is reliable. - -## Querying using an embedding function - -!!! warning - At query time, you **must** use the same embedding function you used to vectorize your data. 
- If you use a different embedding function, the embeddings will not reside in the same vector - space and the results will be nonsensical. - -=== "Python" - ```python - query = "What's the best pizza topping?" - query_vector = embed_func([query])[0] - results = ( - tbl.search(query_vector) - .limit(10) - .to_pandas() - ) - ``` - - The above snippet returns a pandas DataFrame with the 10 closest vectors to the query. diff --git a/docs/src/embeddings/understanding_embeddings.md b/docs/src/embeddings/understanding_embeddings.md deleted file mode 100644 index bd12db1b..00000000 --- a/docs/src/embeddings/understanding_embeddings.md +++ /dev/null @@ -1,133 +0,0 @@ -# Understand Embeddings - -The term **dimension** is a synonym for the number of elements in a feature vector. Each feature can be thought of as a different axis in a geometric space. - -High-dimensional data means there are many features(or attributes) in the data. - -!!! example - 1. An image is a data point and it might have thousands of dimensions because each pixel could be considered as a feature. - - 2. Text data, when represented by each word or character, can also lead to high dimensions, especially when considering all possible words in a language. - -Embedding captures **meaning and relationships** within data by mapping high-dimensional data into a lower-dimensional space. It captures it by placing inputs that are more **similar in meaning** closer together in the **embedding space**. - -## What are Vector Embeddings? - -Vector embeddings is a way to convert complex data, like text, images, or audio into numerical coordinates (called vectors) that can be plotted in an n-dimensional space(embedding space). - -The closer these data points are related in the real world, the closer their corresponding numerical coordinates (vectors) will be to each other in the embedding space. 
This proximity in the embedding space reflects their semantic similarities, allowing machines to intuitively understand and process the data in a way that mirrors human perception of relationships and meaning. - -In a way, it captures the most important aspects of the data while ignoring the less important ones. As a result, tasks like searching for related content or identifying patterns become more efficient and accurate, as the embeddings make it possible to quantify how **closely related** different **data points** are and **reduce** the **computational complexity**. - -??? question "Are vectors and embeddings the same thing?" - - When we say “vectors” we mean - **list of numbers** that **represents the data**. - When we say “embeddings” we mean - **list of numbers** that **capture important details and relationships**. - - Although the terms are often used interchangeably, “embeddings” highlight how the data is represented with meaning and structure, while “vector” simply refers to the numerical form of that representation. - -## Embedding vs Indexing - -We already saw that creating **embeddings** on data is a method of creating **vectors** for a **n-dimensional embedding space** that captures the meaning and relationships inherent in the data. - -Once we have these **vectors**, indexing comes into play. Indexing is a method of organizing these vector embeddings, that allows us to quickly and efficiently locate and retrieve them from the entire dataset of vector embeddings. - -## What types of data/objects can be embedded? - -The following are common types of data that can be embedded: - -1. **Text**: Text data includes sentences, paragraphs, documents, or any written content. -2. **Images**: Image data encompasses photographs, illustrations, or any visual content. -3. **Audio**: Audio data includes sounds, music, speech, or any auditory content. -4. **Video**: Video data consists of moving images and sound, which can convey complex information. 
- -Large datasets of multi-modal data (text, audio, images, etc.) can be converted into embeddings with the appropriate model. - -!!! tip "LanceDB vs Other traditional Vector DBs" - While many vector databases primarily focus on the storage and retrieval of vector embeddings, **LanceDB** uses **Lance file format** (operates on a disk-based architecture), which allows for the storage and management of not just embeddings but also **raw file data (bytes)**. This capability means that users can integrate various types of data, including images and text, alongside their vector embeddings in a unified system. - - With the ability to store both vectors and associated file data, LanceDB enhances the querying process. Users can perform semantic searches that not only retrieve similar embeddings but also access related files and metadata, thus streamlining the workflow. - -## How does embedding work? - -As mentioned, after creating an embedding, each data point is represented as a vector in an n-dimensional space (embedding space). The dimensionality of this space can vary depending on the complexity of the data and the specific embedding technique used. - -Points that are close to each other in vector space are considered similar (or appear in similar contexts), and points that are far away are considered dissimilar. To quantify this closeness, we use distance as a metric which can be measured in the following way - - -1. **Euclidean Distance (l2)**: It calculates the straight-line distance between two points (vectors) in a multidimensional space. -2. **Cosine Similarity**: It measures the cosine of the angle between two vectors, providing a normalized measure of similarity based on their direction. -3. **Dot product**: It is calculated as the sum of the products of their corresponding components. To measure relatedness it considers both the magnitude and direction of the vectors. - -## How do you create and store vector embeddings for your data? - -1. 
**Creating embeddings**: Choose an embedding model, it can be a pre-trained model (open-source or commercial) or you can train a custom embedding model for your scenario. Then feed your preprocessed data into the chosen model to obtain embeddings. - -??? question "Popular choices for embedding models" - For text data, popular choices are OpenAI’s text-embedding models, Google Gemini text-embedding models, Cohere’s Embed models, and SentenceTransformers, etc. - - For image data, popular choices are CLIP (Contrastive Language–Image Pretraining), Imagebind embeddings by meta (supports audio, video, and image), and Jina multi-modal embeddings, etc. - -2. **Storing vector embeddings**: This effectively requires **specialized databases** that can handle the complexity of vector data, as traditional databases often struggle with this task. Vector databases are designed specifically for storing and querying vector embeddings. They optimize for efficient nearest-neighbor searches and provide built-in indexing mechanisms. - -!!! tip "Why LanceDB" - LanceDB **automates** the entire process of creating and storing embeddings for your data. LanceDB allows you to define and use **embedding functions**, which can be **pre-trained models** or **custom models**. - - This enables you to **generate** embeddings tailored to the nature of your data (e.g., text, images) and **store** both the **original data** and **embeddings** in a **structured schema** thus providing efficient querying capabilities for similarity searches. - -Let's quickly [get started](./index.md) and learn how to manage embeddings in LanceDB. - -## Bonus: As a developer, what you can create using embeddings? - -As a developer, you can create a variety of innovative applications using vector embeddings. Check out the following - - -
- -- __Chatbots__ - - --- - - Develop chatbots that utilize embeddings to retrieve relevant context and generate coherent, contextually aware responses to user queries. - - [:octicons-arrow-right-24: Check out examples](../examples/python_examples/chatbot.md) - -- __Recommendation Systems__ - - --- - - Develop systems that recommend content (such as articles, movies, or products) based on the similarity of keywords and descriptions, enhancing user experience. - - [:octicons-arrow-right-24: Check out examples](../examples/python_examples/recommendersystem.md) - -- __Vector Search__ - - --- - - Build powerful applications that harness the full potential of semantic search, enabling them to retrieve relevant data quickly and effectively. - - [:octicons-arrow-right-24: Check out examples](../examples/python_examples/vector_search.md) - -- __RAG Applications__ - - --- - - Combine the strengths of large language models (LLMs) with retrieval-based approaches to create more useful applications. - - [:octicons-arrow-right-24: Check out examples](../examples/python_examples/rag.md) - -- __Many more examples__ - - --- - - Explore applied examples available as Colab notebooks or Python scripts to integrate into your applications. - - [:octicons-arrow-right-24: More](../examples/examples_python.md) - -
- - - - - - - - diff --git a/docs/src/embeddings/variables_and_secrets.md b/docs/src/embeddings/variables_and_secrets.md deleted file mode 100644 index 72388b24..00000000 --- a/docs/src/embeddings/variables_and_secrets.md +++ /dev/null @@ -1,53 +0,0 @@ -# Variable and Secrets - -Most embedding configuration options are saved in the table's metadata. However, -this isn't always appropriate. For example, API keys should never be stored in the -metadata. Additionally, other configuration options might be best set at runtime, -such as the `device` configuration that controls whether to use GPU or CPU for -inference. If you hardcoded this to GPU, you wouldn't be able to run the code on -a server without one. - -To handle these cases, you can set variables on the embedding registry and -reference them in the embedding configuration. These variables will be available -during the runtime of your program, but not saved in the table's metadata. When -the table is loaded from a different process, the variables must be set again. - -To set a variable, use the `set_var()` / `setVar()` method on the embedding registry. -To reference a variable, use the syntax `$env:VARIABLE_NAME`. If there is a default -value, you can use the syntax `$env:VARIABLE_NAME:DEFAULT_VALUE`. - -## Using variables to set secrets - -Sensitive configuration, such as API keys, must either be set as environment -variables or using variables on the embedding registry. If you pass in a hardcoded -value, LanceDB will raise an error. Instead, if you want to set an API key via -configuration, use a variable: - -=== "Python" - - ```python - --8<-- "python/python/tests/docs/test_embeddings_optional.py:register_secret" - ``` - -=== "Typescript" - - ```typescript - --8<-- "nodejs/examples/embedding.test.ts:register_secret" - ``` - -## Using variables to set the device parameter - -Many embedding functions that run locally have a `device` parameter that controls -whether to use GPU or CPU for inference. 
Because not all computers have a GPU, -it's helpful to be able to set the `device` parameter at runtime, rather than -have it hard coded in the embedding configuration. To make it work even if the -variable isn't set, you could provide a default value of `cpu` in the embedding -configuration. - -Some embedding libraries even have a method to detect which devices are available, -which could be used to dynamically set the device at runtime. For example, in Python -you can check if a CUDA GPU is available using `torch.cuda.is_available()`. - -```python ---8<-- "python/python/tests/docs/test_embeddings_optional.py:register_device" -``` diff --git a/docs/src/examples/code_documentation_qa_bot_with_langchain.md b/docs/src/examples/code_documentation_qa_bot_with_langchain.md deleted file mode 100644 index 955d7fd9..00000000 --- a/docs/src/examples/code_documentation_qa_bot_with_langchain.md +++ /dev/null @@ -1,7 +0,0 @@ -# Code documentation Q&A bot with LangChain - -## use LanceDB's LangChain integration to build a Q&A bot for your documentation - -langchain - -This example is in a [notebook](https://github.com/lancedb/lancedb/blob/main/docs/src/notebooks/code_qa_bot.ipynb) diff --git a/docs/src/examples/examples_js.md b/docs/src/examples/examples_js.md deleted file mode 100644 index b7614472..00000000 --- a/docs/src/examples/examples_js.md +++ /dev/null @@ -1,11 +0,0 @@ -# Examples: JavaScript - -To help you get started, we provide some examples, projects and applications that use the LanceDB JavaScript API. You can always find the latest examples in our [VectorDB Recipes](https://github.com/lancedb/vectordb-recipes) repository. 
- -| Example | Scripts | -|-------- | ------ | -| | | -| [Youtube transcript search bot](https://github.com/lancedb/vectordb-recipes/tree/main/examples/youtube_bot/) | [![JavaScript](https://img.shields.io/badge/javascript-%23323330.svg?style=for-the-badge&logo=javascript&logoColor=%23F7DF1E)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/youtube_bot/index.js)| -| [Langchain: Code Docs QA bot](https://github.com/lancedb/vectordb-recipes/tree/main/examples/Code-Documentation-QA-Bot/) | [![JavaScript](https://img.shields.io/badge/javascript-%23323330.svg?style=for-the-badge&logo=javascript&logoColor=%23F7DF1E)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/Code-Documentation-QA-Bot/index.js)| -| [AI Agents: Reducing Hallucination](https://github.com/lancedb/vectordb-recipes/tree/main/examples/reducing_hallucinations_ai_agents/) | [![JavaScript](https://img.shields.io/badge/javascript-%23323330.svg?style=for-the-badge&logo=javascript&logoColor=%23F7DF1E)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/reducing_hallucinations_ai_agents/index.js)| -| [TransformersJS Embedding example](https://github.com/lancedb/vectordb-recipes/tree/main/examples/js-transformers/) | [![JavaScript](https://img.shields.io/badge/javascript-%23323330.svg?style=for-the-badge&logo=javascript&logoColor=%23F7DF1E)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/js-transformers/index.js) | diff --git a/docs/src/examples/examples_python.md b/docs/src/examples/examples_python.md deleted file mode 100644 index 6ffe972f..00000000 --- a/docs/src/examples/examples_python.md +++ /dev/null @@ -1,22 +0,0 @@ -# Overview : Python Examples - -To help you get started, we provide some examples, projects, and applications that use the LanceDB Python API. These examples are designed to get you right into the code with minimal introduction, enabling you to move from an idea to a proof of concept in minutes. 
- -You can find the latest examples in our [VectorDB Recipes](https://github.com/lancedb/vectordb-recipes) repository. - -**Introduction** - -Explore applied examples available as Colab notebooks or Python scripts to integrate into your applications. You can also checkout our blog posts related to the particular example for deeper understanding. - -| Explore | Description | -|----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [**Build from Scratch with LanceDB** 🛠️🚀](python_examples/build_from_scratch.md) | Start building your **GenAI applications** from the **ground up** using **LanceDB's** efficient vector-based document retrieval capabilities! Get started quickly with a solid foundation. | -| [**Multimodal Search with LanceDB** 🤹‍♂️🔍](python_examples/multimodal.md) | Combine **text** and **image queries** to find the most relevant results using **LanceDB’s multimodal** capabilities. Leverage the efficient vector-based similarity search. | -| [**RAG (Retrieval-Augmented Generation) with LanceDB** 🔓🧐](python_examples/rag.md) | Build RAG (Retrieval-Augmented Generation) with **LanceDB** for efficient **vector-based information retrieval** and more accurate responses from AI. | -| [**Vector Search: Efficient Retrieval** 🔓👀](python_examples/vector_search.md) | Use **LanceDB's** vector search capabilities to perform efficient and accurate **similarity searches**, enabling rapid discovery and retrieval of relevant documents in Large datasets. | -| [**Chatbot applications with LanceDB** 🤖](python_examples/chatbot.md) | Create **chatbots** that retrieves relevant context for **coherent and context-aware replies**, enhancing user experience through advanced conversational AI. 
| -| [**Evaluation: Assessing Text Performance with Precision** 📊💡](python_examples/evaluations.md) | Develop **evaluation** applications that allows you to input reference and candidate texts to **measure** their performance across various metrics. | -| [**AI Agents: Intelligent Collaboration** 🤖](python_examples/aiagent.md) | Enable **AI agents** to communicate and collaborate efficiently through dense vector representations, achieving shared goals seamlessly. | -| [**Recommender Systems: Personalized Discovery** 🍿📺](python_examples/recommendersystem.md) | Deliver **personalized experiences** by efficiently storing and querying item embeddings with **LanceDB's** powerful vector database capabilities. | -| **Miscellaneous Examples🌟** | Find other **unique examples** and **creative solutions** using **LanceDB**, showcasing the flexibility and broad applicability of the platform. | - diff --git a/docs/src/examples/examples_rust.md b/docs/src/examples/examples_rust.md deleted file mode 100644 index aa1ae0df..00000000 --- a/docs/src/examples/examples_rust.md +++ /dev/null @@ -1,3 +0,0 @@ -# Examples: Rust - -Our Rust SDK is now stable. Examples are coming soon. diff --git a/docs/src/examples/image_embeddings_roboflow.md b/docs/src/examples/image_embeddings_roboflow.md deleted file mode 100644 index f4056af8..00000000 --- a/docs/src/examples/image_embeddings_roboflow.md +++ /dev/null @@ -1,165 +0,0 @@ -# How to Load Image Embeddings into LanceDB - -With the rise of Large Multimodal Models (LMMs) such as [GPT-4 Vision](https://blog.roboflow.com/gpt-4-vision/), the need for storing image embeddings is growing. The most effective way to store text and image embeddings is in a vector database such as LanceDB. Vector databases are a special kind of data store that enables efficient search over stored embeddings. - -[CLIP](https://blog.roboflow.com/openai-clip/), a multimodal model developed by OpenAI, is commonly used to calculate image embeddings. 
These embeddings can then be used with a vector database to build a semantic search engine that you can query using images or text. For example, you could use LanceDB and CLIP embeddings to build a search engine for a database of folders. - -In this guide, we are going to show you how to use Roboflow Inference to load image embeddings into LanceDB. Without further ado, let’s get started! - -## Step #1: Install Roboflow Inference - -[Roboflow Inference](https://inference.roboflow.com) enables you to run state-of-the-art computer vision models with minimal configuration. Inference supports a range of models, from fine-tuned object detection, classification, and segmentation models to foundation models like CLIP. We will use Inference to calculate CLIP image embeddings. - -Inference provides a HTTP API through which you can run vision models. - -Inference powers the Roboflow hosted API, and is available as an open source utility. In this guide, we are going to run Inference locally, which enables you to calculate CLIP embeddings on your own hardware. We will also show you how to use the hosted Roboflow CLIP API, which is ideal if you need to scale and do not want to manage a system for calculating embeddings. - -To get started, first install the Inference CLI: - -``` -pip install inference-cli -``` - -Next, install Docker. Refer to the official Docker installation instructions for your operating system to get Docker set up. Once Docker is ready, you can start Inference using the following command: - -``` -inference server start -``` - -An Inference server will start running at ‘http://localhost:9001’. - -## Step #2: Set Up a LanceDB Vector Database - -Now that we have Inference running, we can set up a LanceDB vector database. You can run LanceDB in JavaScript and Python. For this guide, we will use the Python API. But, you can take the HTTP requests we make below and change them to JavaScript if required. 
- -For this guide, we are going to search the [COCO 128 dataset](https://universe.roboflow.com/team-roboflow/coco-128), which contains a wide range of objects. The variability in objects present in this dataset makes it a good dataset to demonstrate the capabilities of vector search. If you want to use this dataset, you can download [COCO 128 from Roboflow Universe](https://universe.roboflow.com/team-roboflow/coco-128). With that said, you can search whatever folder of images you want. - -Once you have a dataset ready, install LanceDB with the following command: - -``` -pip install lancedb -``` - -We also need to install a specific commit of `tantivy`, a dependency of the LanceDB full text search engine we will use later in this guide: - -``` -pip install tantivy -``` - -Create a new Python file and add the following code: - -```python -import cv2 -import supervision as sv -import requests - -import lancedb - -db = lancedb.connect("./embeddings") - -IMAGE_DIR = "images/" -API_KEY = os.environ.get("ROBOFLOW_API_KEY") -SERVER_URL = "http://localhost:9001" - -results = [] - -for i, image in enumerate(os.listdir(IMAGE_DIR)): - infer_clip_payload = { - #Images can be provided as urls or as base64 encoded strings - "image": { - "type": "base64", - "value": base64.b64encode(open(IMAGE_DIR + image, "rb").read()).decode("utf-8"), - }, - } - - res = requests.post( - f"{SERVER_URL}/clip/embed_image?api_key={API_KEY}", - json=infer_clip_payload, - ) - - embeddings = res.json()['embeddings'] - - print("Calculated embedding for image: ", image) - - image = {"vector": embeddings[0], "name": os.path.join(IMAGE_DIR, image)} - - results.append(image) - -tbl = db.create_table("images", data=results) - -tbl.create_fts_index("name") -``` - -To use the code above, you will need a Roboflow API key. [Learn how to retrieve a Roboflow API key](https://docs.roboflow.com/api-reference/authentication#retrieve-an-api-key). 
Run the following command to set up your API key in your environment: - -``` -export ROBOFLOW_API_KEY="" -``` - -Replace the `IMAGE_DIR` value with the folder in which you are storing the images for which you want to calculate embeddings. If you want to use the Roboflow CLIP API to calculate embeddings, replace the `SERVER_URL` value with `https://infer.roboflow.com`. - -Run the script above to create a new LanceDB database. This database will be stored on your local machine. The database will be called `embeddings` and the table will be called `images`. - -The script above calculates all embeddings for a folder then creates a new table. To add additional images, use the following code: - -```python -def make_batches(): - for i in range(5): - yield [ - {"vector": [3.1, 4.1], "name": "image1.png"}, - {"vector": [5.9, 26.5], "name": "image2.png"} - ] - -tbl = db.open_table("images") -tbl.add(make_batches()) -``` - -Replace the `make_batches()` function with code that loads the embeddings for your own images. - -## Step #3: Run a Search Query - -We are now ready to run a search query. To run a search query, we need a text embedding that represents a text query. We can use this embedding to search our LanceDB database for an entry. - -Let’s calculate a text embedding for the query “cat”, then run a search query: - -```python -infer_clip_payload = { - "text": "cat", -} - -res = requests.post( - f"{SERVER_URL}/clip/embed_text?api_key={API_KEY}", - json=infer_clip_payload, -) - -embeddings = res.json()['embeddings'] - -df = tbl.search(embeddings[0]).limit(3).to_list() - -print("Results:") - -for i in df: - print(i["name"]) -``` - -This code will search for the three images most closely related to the prompt “cat”. The names of the most similar three images will be printed to the console. 
Here are the three top results: - -``` -dataset/images/train/000000000650_jpg.rf.1b74ba165c5a3513a3211d4a80b69e1c.jpg -dataset/images/train/000000000138_jpg.rf.af439ef1c55dd8a4e4b142d186b9c957.jpg -dataset/images/train/000000000165_jpg.rf.eae14d5509bf0c9ceccddbb53a5f0c66.jpg -``` - -Let’s open the top image: - -![Cat](https://media.roboflow.com/cat_lancedb.jpg) - -The top image was a cat. Our search was successful. - -## Conclusion - -LanceDB is a vector database that you can use to store and efficiently search your image embeddings. You can use Roboflow Inference, a scalable computer vision inference server, to calculate CLIP embeddings that you can store in LanceDB. - -You can use Inference and LanceDB together to build a range of applications with image embeddings, from a media search engine to a retrieval-augmented generation pipeline for use with LMMs. - -To learn more about Inference and its capabilities, refer to the Inference documentation. \ No newline at end of file diff --git a/docs/src/examples/index.md b/docs/src/examples/index.md deleted file mode 100644 index 6e505a2f..00000000 --- a/docs/src/examples/index.md +++ /dev/null @@ -1,12 +0,0 @@ -# Example projects and recipes - -## Recipes and example code - -LanceDB provides language APIs, allowing you to embed a database in your language of choice. - -* 🐍 [Python](examples_python.md) examples -* 👾 [JavaScript](examples_js.md) examples -* 🦀 Rust examples (coming soon) - -!!! tip "Hosted LanceDB" - If you want S3 cost-efficiency and local performance via a simple serverless API, checkout **LanceDB Cloud**. For private deployments, high performance at extreme scale, or if you have strict security requirements, talk to us about **LanceDB Enterprise**. 
[Learn more](https://docs.lancedb.com/) \ No newline at end of file diff --git a/docs/src/examples/modal_langchain.py b/docs/src/examples/modal_langchain.py deleted file mode 100644 index c664547e..00000000 --- a/docs/src/examples/modal_langchain.py +++ /dev/null @@ -1,119 +0,0 @@ -import pickle -import re -import zipfile -from pathlib import Path - -import requests -from langchain.chains import RetrievalQA -from langchain.document_loaders import UnstructuredHTMLLoader -from langchain.embeddings import OpenAIEmbeddings -from langchain.llms import OpenAI -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain.vectorstores import LanceDB -from modal import Image, Secret, Stub, web_endpoint - -import lancedb - -lancedb_image = Image.debian_slim().pip_install( - "lancedb", "langchain", "openai", "pandas", "tiktoken", "unstructured", "tabulate" -) - -stub = Stub( - name="example-langchain-lancedb", - image=lancedb_image, - secrets=[Secret.from_name("my-openai-secret")], -) - -docsearch = None -docs_path = Path("docs.pkl") -db_path = Path("lancedb") - - -def get_document_title(document): - m = str(document.metadata["source"]) - title = re.findall("pandas.documentation(.*).html", m) - if title[0] is not None: - return title[0] - return "" - - -def download_docs(): - pandas_docs = requests.get( - "https://eto-public.s3.us-west-2.amazonaws.com/datasets/pandas_docs/pandas.documentation.zip" - ) - with open(Path("pandas.documentation.zip"), "wb") as f: - f.write(pandas_docs.content) - - file = zipfile.ZipFile(Path("pandas.documentation.zip")) - file.extractall(path=Path("pandas_docs")) - - -def store_docs(): - docs = [] - - if not docs_path.exists(): - for p in Path("pandas_docs/pandas.documentation").rglob("*.html"): - if p.is_dir(): - continue - loader = UnstructuredHTMLLoader(p) - raw_document = loader.load() - - m = {} - m["title"] = get_document_title(raw_document[0]) - m["version"] = "2.0rc0" - raw_document[0].metadata = 
raw_document[0].metadata | m - raw_document[0].metadata["source"] = str(raw_document[0].metadata["source"]) - docs = docs + raw_document - - with docs_path.open("wb") as fh: - pickle.dump(docs, fh) - else: - with docs_path.open("rb") as fh: - docs = pickle.load(fh) - - return docs - - -def qanda_langchain(query): - download_docs() - docs = store_docs() - - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=1000, - chunk_overlap=200, - ) - documents = text_splitter.split_documents(docs) - embeddings = OpenAIEmbeddings() - - db = lancedb.connect(db_path) - table = db.create_table( - "pandas_docs", - data=[ - { - "vector": embeddings.embed_query("Hello World"), - "text": "Hello World", - "id": "1", - } - ], - mode="overwrite", - ) - docsearch = LanceDB.from_documents(documents, embeddings, connection=table) - qa = RetrievalQA.from_chain_type( - llm=OpenAI(), chain_type="stuff", retriever=docsearch.as_retriever() - ) - return qa.run(query) - - -@stub.function() -@web_endpoint(method="GET") -def web(query: str): - answer = qanda_langchain(query) - return { - "answer": answer, - } - - -@stub.function() -def cli(query: str): - answer = qanda_langchain(query) - print(answer) diff --git a/docs/src/examples/multimodal_search.md b/docs/src/examples/multimodal_search.md deleted file mode 100644 index 1665d802..00000000 --- a/docs/src/examples/multimodal_search.md +++ /dev/null @@ -1,7 +0,0 @@ -# Image multimodal search - -## Search through an image dataset using natural language, full text and SQL - -multimodal search - -This example is in a [notebook](https://github.com/lancedb/lancedb/blob/main/docs/src/notebooks/multimodal_search.ipynb) diff --git a/docs/src/examples/python_examples/aiagent.md b/docs/src/examples/python_examples/aiagent.md deleted file mode 100644 index bcb2eb20..00000000 --- a/docs/src/examples/python_examples/aiagent.md +++ /dev/null @@ -1,27 +0,0 @@ -# AI Agents: Intelligent Collaboration🤖 - -Think of a platform where AI Agents can seamlessly 
exchange information, coordinate over tasks, and achieve shared targets with great efficiency💻📈. - -## Vector-Based Coordination: The Technical Advantage -Leveraging LanceDB's vector-based capabilities, we can enable **AI agents 🤖** to communicate and collaborate through dense vector representations. AI agents can exchange information, coordinate on a task or work towards a common goal, just by giving queries📝. - -| **AI Agents** | **Description** | **Links** | -|:--------------|:----------------|:----------| -| **AI Agents: Reducing Hallucination📊** | 🤖💡 **Reduce AI hallucinations** using Critique-Based Contexting! Learn to simplify and automate tedious workflows by going through a fitness trainer agent example.💪 | [![Github](../../assets/github.svg)][hullucination_github]
[![Open In Colab](../../assets/colab.svg)][hullucination_colab]
[![Python](../../assets/python.svg)][hullucination_python]
[![Ghost](../../assets/ghost.svg)][hullucination_ghost] | -| **AI Trends Searcher: CrewAI🔍️** | 🔍️ Learn about **CrewAI Agents** ! Utilize the features of CrewAI - Role-based Agents, Task Management, and Inter-agent Delegation ! Make AI agents work together to do tricky stuff 😺| [![Github](../../assets/github.svg)][trend_github]
[![Open In Colab](../../assets/colab.svg)][trend_colab]
[![Ghost](../../assets/ghost.svg)][trend_ghost] | -| **SuperAgent Autogen🤖** | 💻 AI interactions with the Super Agent! Integrating **Autogen**, **LanceDB**, **LangChain**, **LiteLLM**, and **Ollama** to create AI agent that excels in understanding and processing complex queries.🤖 | [![Github](../../assets/github.svg)][superagent_github]
[![Open In Collab](../../assets/colab.svg)][superagent_colab] | - - -[hullucination_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/reducing_hallucinations_ai_agents -[hullucination_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/reducing_hallucinations_ai_agents/main.ipynb -[hullucination_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/reducing_hallucinations_ai_agents/main.py -[hullucination_ghost]: https://blog.lancedb.com/how-to-reduce-hallucinations-from-llm-powered-agents-using-long-term-memory-72f262c3cc1f/ - -[trend_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/AI-Trends-with-CrewAI -[trend_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/AI-Trends-with-CrewAI/CrewAI_AI_Trends.ipynb -[trend_ghost]: https://blog.lancedb.com/track-ai-trends-crewai-agents-rag/ - -[superagent_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/SuperAgent_Autogen -[superagent_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/SuperAgent_Autogen/main.ipynb - - diff --git a/docs/src/examples/python_examples/build_from_scratch.md b/docs/src/examples/python_examples/build_from_scratch.md deleted file mode 100644 index 7019a810..00000000 --- a/docs/src/examples/python_examples/build_from_scratch.md +++ /dev/null @@ -1,13 +0,0 @@ -# **Build from Scratch with LanceDB 🛠️🚀** - -Start building your GenAI applications from the ground up using **LanceDB's** efficient vector-based document retrieval capabilities! 📑 - -**Get Started in Minutes ⏱️** - -These examples provide a solid foundation for building your own GenAI applications using LanceDB. Jump from idea to **proof of concept** quickly with applied examples. Get started and see what you can create! 
💻 - -| **Build From Scratch** | **Description** | **Links** | -|:-------------------------------------------|:-------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| **Build RAG from Scratch🚀💻** | 📝 Create a **Retrieval-Augmented Generation** (RAG) model from scratch using LanceDB. | [![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/lancedb/vectordb-recipes/tree/main/tutorials/RAG-from-Scratch)
[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)]() | -| **Local RAG from Scratch with Llama3🔥💡** | 🐫 Build a local RAG model using **Llama3** and **LanceDB** for fast and efficient text generation. | [![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/lancedb/vectordb-recipes/tree/main/tutorials/Local-RAG-from-Scratch)
[![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](https://github.com/lancedb/vectordb-recipes/blob/main/tutorials/Local-RAG-from-Scratch/rag.py) | -| **Multi-Head RAG from Scratch📚💻** | 🤯 Develop a **Multi-Head RAG model** from scratch, enabling generation of text based on multiple documents. | [![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/lancedb/vectordb-recipes/tree/main/tutorials/Multi-Head-RAG-from-Scratch)
[![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](https://github.com/lancedb/vectordb-recipes/tree/main/tutorials/Multi-Head-RAG-from-Scratch) | diff --git a/docs/src/examples/python_examples/chatbot.md b/docs/src/examples/python_examples/chatbot.md deleted file mode 100644 index 52d4e404..00000000 --- a/docs/src/examples/python_examples/chatbot.md +++ /dev/null @@ -1,41 +0,0 @@ -**Chatbot applications with LanceDB 🤖** -==================================================================== - - Create innovative chatbot applications that utilize LanceDB for efficient vector-based response generation! 🌐✨ - -**Introduction 👋✨** - - Users can input their queries, allowing the chatbot to retrieve relevant context seamlessly. 🔍📚 This enables the generation of coherent and context-aware replies that enhance user experience. 🌟🤝 Dive into the world of advanced conversational AI and streamline interactions with powerful data management! 🚀💡 - - -| **Chatbot** | **Description** | **Links** | -|:----------------|:-----------------|:-----------| -| **Databricks DBRX Website Bot ⚡️** | Engage with the **Hogwarts chatbot**, that uses Open-source RAG with **DBRX**, **LanceDB** and **LLama-index with Hugging Face Embeddings**, to provide interactive and engaging user experiences. ✨ | [![GitHub](../../assets/github.svg)][databricks_github]
[![Python](../../assets/python.svg)][databricks_python] | -| **CLI SDK Manual Chatbot Locally 💻** | CLI chatbot for SDK/hardware documents using **Local RAG** with **LLama3**, **Ollama**, **LanceDB**, and **Openhermes Embeddings**, built with **Phidata** Assistant and Knowledge Base 🤖 | [![GitHub](../../assets/github.svg)][clisdk_github]
[![Python](../../assets/python.svg)][clisdk_python] | -| **Youtube Transcript Search QA Bot 📹** | Search through **youtube transcripts** using natural language with a Q&A bot, leveraging **LanceDB** for effortless data storage and management 💬 | [![GitHub](../../assets/github.svg)][youtube_github]
[![Open In Collab](../../assets/colab.svg)][youtube_colab]
[![Python](../../assets/python.svg)][youtube_python] | -| **Code Documentation Q&A Bot with LangChain 🤖** | Query your own documentation easily using questions in natural language with a Q&A bot, powered by **LangChain** and **LanceDB**, demonstrated with **Numpy 1.26 docs** 📚 | [![GitHub](../../assets/github.svg)][docs_github]
[![Open In Collab](../../assets/colab.svg)][docs_colab]
[![Python](../../assets/python.svg)][docs_python] | -| **Context-aware Chatbot using Llama 2 & LanceDB 🤖** | Build **conversational AI** with a **context-aware chatbot**, powered by **Llama 2**, **LanceDB**, and **LangChain**, that enables intuitive and meaningful conversations with your data 📚💬 | [![GitHub](../../assets/github.svg)][aware_github]
[![Open In Collab](../../assets/colab.svg)][aware_colab]
[![Ghost](../../assets/ghost.svg)][aware_ghost] | -| **Chat with csv using Hybrid Search 📊** | **Chat** application that interacts with **CSV** and **Excel files** using **LanceDB’s** hybrid search capabilities, performing direct operations on large-scale columnar data efficiently 🚀 | [![GitHub](../../assets/github.svg)][csv_github]
[![Open In Collab](../../assets/colab.svg)][csv_colab]
[![Ghost](../../assets/ghost.svg)][csv_ghost] | - - -[databricks_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/databricks_DBRX_website_bot -[databricks_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/databricks_DBRX_website_bot/main.py - -[clisdk_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/CLI-SDK-Manual-Chatbot-Locally -[clisdk_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/CLI-SDK-Manual-Chatbot-Locally/assistant.py - -[youtube_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Youtube-Search-QA-Bot -[youtube_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Youtube-Search-QA-Bot/main.ipynb -[youtube_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Youtube-Search-QA-Bot/main.py - -[docs_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Code-Documentation-QA-Bot -[docs_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Code-Documentation-QA-Bot/main.ipynb -[docs_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Code-Documentation-QA-Bot/main.py - -[aware_github]: https://github.com/lancedb/vectordb-recipes/blob/main/tutorials/chatbot_using_Llama2_&_lanceDB -[aware_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/chatbot_using_Llama2_&_lanceDB/main.ipynb -[aware_ghost]: https://blog.lancedb.com/context-aware-chatbot-using-llama-2-lancedb-as-vector-database-4d771d95c755 - -[csv_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/Chat_with_csv_file -[csv_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/Chat_with_csv_file/main.ipynb -[csv_ghost]: https://blog.lancedb.com/p/d8c71df4-e55f-479a-819e-cde13354a6a3/ diff --git 
a/docs/src/examples/python_examples/evaluations.md b/docs/src/examples/python_examples/evaluations.md deleted file mode 100644 index 18d0ccba..00000000 --- a/docs/src/examples/python_examples/evaluations.md +++ /dev/null @@ -1,21 +0,0 @@ -**Evaluation: Assessing Text Performance with Precision 📊💡** -==================================================================== - -Evaluation is a comprehensive tool designed to measure the performance of text-based inputs, enabling data-driven optimization and improvement 📈. - -**Text Evaluation 101 📚** - -Using a robust framework for assessing reference and candidate texts across various metrics📊, ensure that the text outputs are high-quality and meet specific requirements and standards📝. - -| **Evaluation** | **Description** | **Links** | -| -------------- | --------------- | --------- | -| **Evaluating Prompts with Prompttools 🤖** | Compare, visualize & evaluate **embedding functions** (incl. OpenAI) across metrics like latency & custom evaluation 📈📊 | [![Github](../../assets/github.svg)][prompttools_github]
[![Open In Collab](../../assets/colab.svg)][prompttools_colab] | -| **Evaluating RAG with RAGAs and GPT-4o 📊** | Evaluate **RAG pipelines** with cutting-edge metrics and tools, integrate with CI/CD for continuous performance checks, and generate responses with GPT-4o 🤖📈 | [![Github](../../assets/github.svg)][RAGAs_github]
[![Open In Collab](../../assets/colab.svg)][RAGAs_colab] | - - - -[prompttools_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/prompttools-eval-prompts -[prompttools_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/prompttools-eval-prompts/main.ipynb - -[RAGAs_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Evaluating_RAG_with_RAGAs -[RAGAs_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Evaluating_RAG_with_RAGAs/Evaluating_RAG_with_RAGAs.ipynb diff --git a/docs/src/examples/python_examples/multimodal.md b/docs/src/examples/python_examples/multimodal.md deleted file mode 100644 index 69b7b778..00000000 --- a/docs/src/examples/python_examples/multimodal.md +++ /dev/null @@ -1,28 +0,0 @@ -# **Multimodal Search with LanceDB 🤹‍♂️🔍** - -Using LanceDB's multimodal capabilities, combine text and image queries to find the most relevant results in your corpus ! 🔓💡 - -**Explore the Future of Search 🚀** - -LanceDB supports multimodal search by indexing and querying vector representations of text and image data 🤖. This enables efficient retrieval of relevant documents and images using vector-based similarity search 📊. The platform facilitates cross-modal search, allowing for text-image and image-text retrieval, and supports scalable indexing of high-dimensional vector spaces 💻. - - - -| **Multimodal** | **Description** | **Links** | -|:----------------|:-----------------|:-----------| -| **Multimodal CLIP: DiffusionDB 🌐💥** | Multi-Modal Search with **CLIP** and **LanceDB** Using **DiffusionDB** Data for Combined Text and Image Understanding ! 🔓 | [![GitHub](../../assets/github.svg)][Clip_diffusionDB_github]
[![Open In Collab](../../assets/colab.svg)][Clip_diffusionDB_colab]
[![Python](../../assets/python.svg)][Clip_diffusionDB_python]
[![Ghost](../../assets/ghost.svg)][Clip_diffusionDB_ghost] | -| **Multimodal CLIP: Youtube Videos 📹👀** | Search **Youtube videos** using Multimodal CLIP, finding relevant content with ease and accuracy! 🎯 | [![Github](../../assets/github.svg)][Clip_youtube_github]
[![Open In Collab](../../assets/colab.svg)][Clip_youtube_colab]
[![Python](../../assets/python.svg)][Clip_youtube_python]
[![Ghost](../../assets/ghost.svg)][Clip_youtube_ghost] | -| **Multimodal Image + Text Search 📸🔍** | Find **relevant documents** and **images** with a single query using **LanceDB's** multimodal search capabilities, to seamlessly integrate text and visuals ! 🌉 | [![GitHub](../../assets/github.svg)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/multimodal_search)
[![Open In Collab](../../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/multimodal_search/main.ipynb)
[![Python](../../assets/python.svg)](https://github.com/lancedb/vectordb-recipes/blob/main/examples/archived_examples/multimodal_search/main.py)
[![Ghost](../../assets/ghost.svg)](https://blog.lancedb.com/multi-modal-ai-made-easy-with-lancedb-clip-5aaf8801c939/) | -| **Cambrian-1: Vision-Centric Image Exploration 🔍👀** | Learn how **Cambrian-1** works, using an example of **Vision-Centric** exploration on images found through vector search ! Work on **Flickr-8k** dataset 🔎 | [![Kaggle](https://img.shields.io/badge/Kaggle-035a7d?style=for-the-badge&logo=kaggle&logoColor=white)](https://www.kaggle.com/code/prasantdixit/cambrian-1-vision-centric-exploration-of-images/)
[![Ghost](../../assets/ghost.svg)](https://blog.lancedb.com/cambrian-1-vision-centric-exploration/) | - - -[Clip_diffusionDB_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_clip_diffusiondb -[Clip_diffusionDB_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/multimodal_clip_diffusiondb/main.ipynb -[Clip_diffusionDB_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_clip_diffusiondb/main.py -[Clip_diffusionDB_ghost]: https://blog.lancedb.com/multi-modal-ai-made-easy-with-lancedb-clip-5aaf8801c939/ - - -[Clip_youtube_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_video_search -[Clip_youtube_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/multimodal_video_search/main.ipynb -[Clip_youtube_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_video_search/main.py -[Clip_youtube_ghost]: https://blog.lancedb.com/multi-modal-ai-made-easy-with-lancedb-clip-5aaf8801c939/ \ No newline at end of file diff --git a/docs/src/examples/python_examples/rag.md b/docs/src/examples/python_examples/rag.md deleted file mode 100644 index bed3a1ab..00000000 --- a/docs/src/examples/python_examples/rag.md +++ /dev/null @@ -1,83 +0,0 @@ -**RAG (Retrieval-Augmented Generation) with LanceDB 🔓🧐** -==================================================================== - -Build RAG (Retrieval-Augmented Generation) with LanceDB, a powerful solution for efficient vector-based information retrieval 📊. - -**Experience the Future of Search 🔄** - -🤖 RAG enables AI to **retrieve** relevant information from external sources and use it to **generate** more accurate and context-specific responses. 💻 LanceDB provides a robust framework for integrating LLMs with external knowledge sources 📝. 
- -| **RAG** | **Description** | **Links** | -|----------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------| -| **RAG with Matryoshka Embeddings and LlamaIndex** 🪆🔗 | Utilize **Matryoshka embeddings** and **LlamaIndex** to improve the efficiency and accuracy of your RAG models. 📈✨ | [![Github](../../assets/github.svg)][matryoshka_github]
[![Open In Collab](../../assets/colab.svg)][matryoshka_colab] | -| **Improve RAG with Re-ranking** 📈🔄 | Enhance your RAG applications by implementing **re-ranking strategies** for more relevant document retrieval. 📚🔍 | [![Github](../../assets/github.svg)][rag_reranking_github]
[![Open In Collab](../../assets/colab.svg)][rag_reranking_colab]
[![Ghost](../../assets/ghost.svg)][rag_reranking_ghost] | -| **Instruct-Multitask** 🧠🎯 | Integrate the **Instruct Embedding Model** with LanceDB to streamline your embedding API, reducing redundant code and overhead. 🌐📊 | [![Github](../../assets/github.svg)][instruct_multitask_github]
[![Open In Collab](../../assets/colab.svg)][instruct_multitask_colab]
[![Python](../../assets/python.svg)][instruct_multitask_python]
[![Ghost](../../assets/ghost.svg)][instruct_multitask_ghost] | -| **Improve RAG with HyDE** 🌌🔍 | Use **Hypothetical Document Embeddings** for efficient, accurate, and unsupervised dense retrieval. 📄🔍 | [![Github](../../assets/github.svg)][hyde_github]
[![Open In Collab](../../assets/colab.svg)][hyde_colab]
[![Ghost](../../assets/ghost.svg)][hyde_ghost] | -| **Improve RAG with LOTR** 🧙‍♂️📜 | Enhance RAG with **Lord of the Retriever (LOTR)** to address 'Lost in the Middle' challenges, especially in medical data. 🌟📜 | [![Github](../../assets/github.svg)][lotr_github]
[![Open In Collab](../../assets/colab.svg)][lotr_colab]
[![Ghost](../../assets/ghost.svg)][lotr_ghost] | -| **Advanced RAG: Parent Document Retriever** 📑🔗 | Use **Parent Document & Bigger Chunk Retriever** to maintain context and relevance when generating related content. 🎵📄 | [![Github](../../assets/github.svg)][parent_doc_retriever_github]
[![Open In Collab](../../assets/colab.svg)][parent_doc_retriever_colab]
[![Ghost](../../assets/ghost.svg)][parent_doc_retriever_ghost] | -| **Corrective RAG with Langgraph** 🔧📊 | Enhance RAG reliability with **Corrective RAG (CRAG)** by self-reflecting and fact-checking for accurate and trustworthy results. ✅🔍 |[![Github](../../assets/github.svg)][corrective_rag_github]
[![Open In Collab](../../assets/colab.svg)][corrective_rag_colab]
[![Ghost](../../assets/ghost.svg)][corrective_rag_ghost] | -| **Contextual Compression with RAG** 🗜️🧠 | Apply **contextual compression techniques** to condense large documents while retaining essential information. 📄🗜️ | [![Github](../../assets/github.svg)][compression_rag_github]
[![Open In Collab](../../assets/colab.svg)][compression_rag_colab]
[![Ghost](../../assets/ghost.svg)][compression_rag_ghost] | -| **Improve RAG with FLARE** 🔥| Enable users to ask questions directly to **academic papers**, focusing on **ArXiv papers**, with **F**orward-**L**ooking **A**ctive **RE**trieval augmented generation.🚀🌟 | [![Github](../../assets/github.svg)][flare_github]
[![Open In Collab](../../assets/colab.svg)][flare_colab]
[![Ghost](../../assets/ghost.svg)][flare_ghost] | -| **Query Expansion and Reranker** 🔍🔄 | Enhance RAG with query expansion using Large Language Models and advanced **reranking methods** like **Cross Encoders**, **ColBERT v2**, and **FlashRank** for improved document retrieval precision and recall 🔍📈 | [![Github](../../assets/github.svg)][query_github]
[![Open In Collab](../../assets/colab.svg)][query_colab] | -| **RAG Fusion** ⚡🌐 | Build RAG Fusion, utilize the **RRF algorithm** to rerank documents based on user queries ! Use **LanceDB** as vector database to store and retrieve documents related to queries via **OPENAI Embeddings**⚡🌐 | [![Github](../../assets/github.svg)][fusion_github]
[![Open In Collab](../../assets/colab.svg)][fusion_colab] | -| **Agentic RAG** 🤖📚 | Build autonomous information retrieval with **Agentic RAG**, a framework of **intelligent agents** that collaborate to synthesize, summarize, and compare data across sources, that enables proactive and informed decision-making 🤖📚 | [![Github](../../assets/github.svg)][agentic_github]
[![Open In Collab](../../assets/colab.svg)][agentic_colab] | - - - - - - - - - - - - -[matryoshka_github]: https://github.com/lancedb/vectordb-recipes/blob/main/tutorials/RAG-with_MatryoshkaEmbed-Llamaindex -[matryoshka_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/RAG-with_MatryoshkaEmbed-Llamaindex/RAG_with_MatryoshkaEmbedding_and_Llamaindex.ipynb - -[rag_reranking_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/RAG_Reranking -[rag_reranking_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/RAG_Reranking/main.ipynb -[rag_reranking_ghost]: https://blog.lancedb.com/simplest-method-to-improve-rag-pipeline-re-ranking-cf6eaec6d544 - - -[instruct_multitask_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/instruct-multitask -[instruct_multitask_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/instruct-multitask/main.ipynb -[instruct_multitask_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/instruct-multitask/main.py -[instruct_multitask_ghost]: https://blog.lancedb.com/multitask-embedding-with-lancedb-be18ec397543 - -[hyde_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Advance-RAG-with-HyDE -[hyde_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Advance-RAG-with-HyDE/main.ipynb -[hyde_ghost]: https://blog.lancedb.com/advanced-rag-precise-zero-shot-dense-retrieval-with-hyde-0946c54dfdcb - -[lotr_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Advance_RAG_LOTR -[lotr_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Advance_RAG_LOTR/main.ipynb -[lotr_ghost]: https://blog.lancedb.com/better-rag-with-lotr-lord-of-retriever-23c8336b9a35 - -[parent_doc_retriever_github]: 
https://github.com/lancedb/vectordb-recipes/blob/main/examples/parent_document_retriever -[parent_doc_retriever_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/parent_document_retriever/main.ipynb -[parent_doc_retriever_ghost]: https://blog.lancedb.com/modified-rag-parent-document-bigger-chunk-retriever-62b3d1e79bc6 - -[corrective_rag_github]: https://github.com/lancedb/vectordb-recipes/blob/main/tutorials/Corrective-RAG-with_Langgraph -[corrective_rag_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/Corrective-RAG-with_Langgraph/CRAG_with_Langgraph.ipynb -[corrective_rag_ghost]: https://blog.lancedb.com/implementing-corrective-rag-in-the-easiest-way-2/ - -[compression_rag_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Contextual-Compression-with-RAG -[compression_rag_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Contextual-Compression-with-RAG/main.ipynb -[compression_rag_ghost]: https://blog.lancedb.com/enhance-rag-integrate-contextual-compression-and-filtering-for-precision-a29d4a810301/ - -[flare_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/better-rag-FLAIR -[flare_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/better-rag-FLAIR/main.ipynb -[flare_ghost]: https://blog.lancedb.com/better-rag-with-active-retrieval-augmented-generation-flare-3b66646e2a9f/ - -[query_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/QueryExpansion%26Reranker -[query_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/QueryExpansion&Reranker/main.ipynb - - -[fusion_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/RAG_Fusion -[fusion_colab]: 
https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/RAG_Fusion/main.ipynb - -[agentic_github]: https://github.com/lancedb/vectordb-recipes/blob/main/tutorials/Agentic_RAG -[agentic_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/Agentic_RAG/main.ipynb - - diff --git a/docs/src/examples/python_examples/recommendersystem.md b/docs/src/examples/python_examples/recommendersystem.md deleted file mode 100644 index a7d91143..00000000 --- a/docs/src/examples/python_examples/recommendersystem.md +++ /dev/null @@ -1,37 +0,0 @@ -**Recommender Systems: Personalized Discovery🍿📺** -============================================================== -Deliver personalized experiences with Recommender Systems. 🎁 - -**Technical Overview📜** - -🔍️ LanceDB's powerful vector database capabilities can efficiently store and query item embeddings. Recommender Systems can utilize it and provide personalized recommendations based on user preferences 🤝 and item features 📊 and therefore enhance the user experience.🗂️ - -| **Recommender System** | **Description** | **Links** | -| ---------------------- | --------------- | --------- | -| **Movie Recommender System🎬** | 🤝 Use **collaborative filtering** to predict user preferences, assuming similar users will like similar movies, and leverage **Singular Value Decomposition** (SVD) from Numpy for precise matrix factorization and accurate recommendations📊 | [![Github](../../assets/github.svg)][movie_github]
[![Open In Collab](../../assets/colab.svg)][movie_colab]
[![Python](../../assets/python.svg)][movie_python] | -| **🎥 Movie Recommendation with Genres** | 🔍 Creates movie embeddings using **Doc2Vec**, capturing genre and characteristic nuances, and leverages VectorDB for efficient storage and querying, enabling accurate genre classification and personalized movie recommendations through **similarity searches**🎥 | [![Github](../../assets/github.svg)][genre_github]
[![Open In Collab](../../assets/colab.svg)][genre_colab]
[![Ghost](../../assets/ghost.svg)][genre_ghost] | -| **🛍️ Product Recommender using Collaborative Filtering and LanceDB** | 📈 Using **Collaborative Filtering** and **LanceDB** to analyze your past purchases, recommends products based on user's past purchases. Demonstrated with the Instacart dataset in our example🛒 | [![Github](../../assets/github.svg)][product_github]
[![Open In Collab](../../assets/colab.svg)][product_colab]
[![Python](../../assets/python.svg)][product_python] | -| **🔍 Arxiv Search with OpenCLIP and LanceDB** | 💡 Build a semantic search engine for **Arxiv papers** using **LanceDB**, and benchmarks its performance against traditional keyword-based search on **Nomic's Atlas**, to demonstrate the power of semantic search in finding relevant research papers📚 | [![Github](../../assets/github.svg)][arxiv_github]
[![Open In Collab](../../assets/colab.svg)][arxiv_colab]
[![Python](../../assets/python.svg)][arxiv_python] | -| **Food Recommendation System🍴** | 🍔 Build a food recommendation system with **LanceDB**, featuring vector-based recommendations, full-text search, hybrid search, and reranking model integration for personalized and accurate food suggestions👌 | [![Github](../../assets/github.svg)][food_github]
[![Open In Collab](../../assets/colab.svg)][food_colab] | - -[movie_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/movie-recommender -[movie_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/movie-recommender/main.ipynb -[movie_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/movie-recommender/main.py - - -[genre_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/movie-recommendation-with-genres -[genre_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/movie-recommendation-with-genres/movie_recommendation_with_doc2vec_and_lancedb.ipynb -[genre_ghost]: https://blog.lancedb.com/movie-recommendation-system-using-lancedb-and-doc2vec/ - -[product_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/product-recommender -[product_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/product-recommender/main.ipynb -[product_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/product-recommender/main.py - - -[arxiv_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/arxiv-recommender -[arxiv_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/arxiv-recommender/main.ipynb -[arxiv_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/arxiv-recommender/main.py - - -[food_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/Food_recommendation -[food_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/Food_recommendation/main.ipynb diff --git a/docs/src/examples/python_examples/vector_search.md b/docs/src/examples/python_examples/vector_search.md deleted file mode 100644 index 1861e6dc..00000000 --- 
a/docs/src/examples/python_examples/vector_search.md +++ /dev/null @@ -1,80 +0,0 @@ -**Vector Search: Efficient Retrieval 🔓👀** -==================================================================== - -Vector search with LanceDB is a solution for efficient and accurate similarity searches in large datasets 📊. - -**Vector Search Capabilities in LanceDB🔝** - -LanceDB implements vector search algorithms for efficient document retrieval and analysis 📊. This enables fast and accurate discovery of relevant documents, leveraging dense vector representations 🤖. The platform supports scalable indexing and querying of high-dimensional vector spaces, facilitating precise document matching and retrieval 📈. - -| **Vector Search** | **Description** | **Links** | -|:-----------------|:---------------|:---------| -| **Inbuilt Hybrid Search 🔄** | Perform hybrid search in **LanceDB** by combining the results of semantic and full-text search via a reranking algorithm of your choice 📊 | [![Github](../../assets/github.svg)][inbuilt_hybrid_search_github]
[![Open In Collab](../../assets/colab.svg)][inbuilt_hybrid_search_colab] | -| **Hybrid Search with BM25 and LanceDB 💡** | **Synergize BM25's** keyword-focused precision (term frequency, document length normalization, bias-free retrieval) with **LanceDB's** semantic understanding (contextual analysis, query intent alignment) for nuanced search results in complex datasets 📈 | [![Github](../../assets/github.svg)][BM25_github]
[![Open In Collab](../../assets/colab.svg)][BM25_colab]
[![Ghost](../../assets/ghost.svg)][BM25_ghost] | -| **NER-powered Semantic Search 🔎** | Extract and identify essential information from text with Named Entity Recognition **(NER)** methods: Dictionary-Based, Rule-Based, and Deep Learning-Based, to accurately extract and categorize entities, enabling precise semantic search results 🗂️ | [![Github](../../assets/github.svg)][NER_github]
[![Open In Collab](../../assets/colab.svg)][NER_colab]
[![Ghost](../../assets/ghost.svg)][NER_ghost]| -| **Audio Similarity Search using Vector Embeddings 🎵** | Create vector **embeddings of audio files** to find similar audio content, enabling efficient audio similarity search and retrieval in **LanceDB's** vector store 📻 |[![Github](../../assets/github.svg)][audio_search_github]
[![Open In Collab](../../assets/colab.svg)][audio_search_colab]
[![Python](../../assets/python.svg)][audio_search_python]| -| **LanceDB Embeddings API: Multi-lingual Semantic Search 🌎** | Build a universal semantic search table with **LanceDB's Embeddings API**, supporting multiple languages (e.g., English, French) using **cohere's** multi-lingual model, for accurate cross-lingual search results 📄 | [![Github](../../assets/github.svg)][mls_github]
[![Open In Collab](../../assets/colab.svg)][mls_colab]
[![Python](../../assets/python.svg)][mls_python] | -| **Facial Recognition: Face Embeddings 🤖** | Detect, crop, and embed faces using Facenet, then store and query face embeddings in **LanceDB** for efficient facial recognition and top-K matching results 👥 | [![Github](../../assets/github.svg)][fr_github]
[![Open In Collab](../../assets/colab.svg)][fr_colab] | -| **Sentiment Analysis: Hotel Reviews 🏨** | Analyze customer sentiments towards the hotel industry using **BERT models**, storing sentiment labels, scores, and embeddings in **LanceDB**, enabling queries on customer opinions and potential areas for improvement 💬 | [![Github](../../assets/github.svg)][sentiment_analysis_github]
[![Open In Collab](../../assets/colab.svg)][sentiment_analysis_colab]
[![Ghost](../../assets/ghost.svg)][sentiment_analysis_ghost] | -| **Vector Arithmetic with LanceDB ⚖️** | Perform **vector arithmetic** on embeddings, enabling complex relationships and nuances in data to be captured, and simplifying the process of retrieving semantically similar results 📊 | [![Github](../../assets/github.svg)][arithmetic_github]
[![Open In Collab](../../assets/colab.svg)][arithmetic_colab]
[![Ghost](../../assets/ghost.svg)][arithmetic_ghost] | -| **Imagebind Demo 🖼️** | Explore the multi-modal capabilities of **Imagebind** through a Gradio app, use **LanceDB API** for seamless image search and retrieval experiences 📸 | [![Github](../../assets/github.svg)][imagebind_github]
[![Open in Spaces](../../assets/open_hf_space.svg)][imagebind_huggingface] | -| **Search Engine using SAM & CLIP 🔍** | Build a search engine within an image using **SAM** and **CLIP** models, enabling object-level search and retrieval, with LanceDB indexing and search capabilities to find the closest match between image embeddings and user queries 📸 | [![Github](../../assets/github.svg)][swi_github]
[![Open In Collab](../../assets/colab.svg)][swi_colab]
[![Ghost](../../assets/ghost.svg)][swi_ghost] | -| **Zero Shot Object Localization and Detection with CLIP 🔎** | Perform object detection on images using **OpenAI's CLIP**, enabling zero-shot localization and detection of objects, with capabilities to split images into patches, parse with CLIP, and plot bounding boxes 📊 | [![Github](../../assets/github.svg)][zsod_github]
[![Open In Collab](../../assets/colab.svg)][zsod_colab] | -| **Accelerate Vector Search with OpenVINO 🚀** | Boost vector search applications using **OpenVINO**, achieving significant speedups with **CLIP** for text-to-image and image-to-image searching, through PyTorch model optimization, FP16 and INT8 format conversion, and quantization with **OpenVINO NNCF** 📈 | [![Github](../../assets/github.svg)][openvino_github]
[![Open In Collab](../../assets/colab.svg)][openvino_colab]
[![Ghost](../../assets/ghost.svg)][openvino_ghost] | -| **Zero-Shot Image Classification with CLIP and LanceDB 📸** | Achieve zero-shot image classification using **CLIP** and **LanceDB**, enabling models to classify images without prior training on specific use cases, unlocking flexible and adaptable image classification capabilities 🔓 | [![Github](../../assets/github.svg)][zsic_github]
[![Open In Collab](../../assets/colab.svg)][zsic_colab]
[![Ghost](../../assets/ghost.svg)][zsic_ghost] | - - - - -[inbuilt_hybrid_search_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Inbuilt-Hybrid-Search -[inbuilt_hybrid_search_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Inbuilt-Hybrid-Search/Inbuilt_Hybrid_Search_with_LanceDB.ipynb - -[BM25_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Hybrid_search_bm25_lancedb -[BM25_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Hybrid_search_bm25_lancedb/main.ipynb -[BM25_ghost]: https://blog.lancedb.com/hybrid-search-combining-bm25-and-semantic-search-for-better-results-with-lan-1358038fe7e6 - -[NER_github]: https://github.com/lancedb/vectordb-recipes/blob/main/tutorials/NER-powered-Semantic-Search -[NER_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/NER-powered-Semantic-Search/NER_powered_Semantic_Search_with_LanceDB.ipynb -[NER_ghost]: https://blog.lancedb.com/ner-powered-semantic-search-using-lancedb-51051dc3e493 - -[audio_search_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/audio_search -[audio_search_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/audio_search/main.ipynb -[audio_search_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/archived_examples/audio_search/main.py - -[mls_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/multi-lingual-wiki-qa -[mls_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/multi-lingual-wiki-qa/main.ipynb -[mls_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/archived_examples/multi-lingual-wiki-qa/main.py - -[fr_github]: 
https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/facial_recognition -[fr_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/facial_recognition/main.ipynb - -[sentiment_analysis_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Sentiment-Analysis-Analyse-Hotel-Reviews -[sentiment_analysis_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Sentiment-Analysis-Analyse-Hotel-Reviews/Sentiment_Analysis_using_LanceDB.ipynb -[sentiment_analysis_ghost]: https://blog.lancedb.com/sentiment-analysis-using-lancedb-2da3cb1e3fa6 - -[arithmetic_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Vector-Arithmetic-with-LanceDB -[arithmetic_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Vector-Arithmetic-with-LanceDB/main.ipynb -[arithmetic_ghost]: https://blog.lancedb.com/vector-arithmetic-with-lancedb-an-intro-to-vector-embeddings/ - -[imagebind_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/imagebind_demo -[imagebind_huggingface]: https://huggingface.co/spaces/raghavd99/imagebind2 - -[swi_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/search-within-images-with-sam-and-clip -[swi_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/search-within-images-with-sam-and-clip/main.ipynb -[swi_ghost]: https://blog.lancedb.com/search-within-an-image-331b54e4285e - -[zsod_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/zero-shot-object-detection-CLIP -[zsod_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/zero-shot-object-detection-CLIP/zero_shot_object_detection_clip.ipynb - -[openvino_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Accelerate-Vector-Search-Applications-Using-OpenVINO 
-[openvino_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Accelerate-Vector-Search-Applications-Using-OpenVINO/clip_text_image_search.ipynb -[openvino_ghost]: https://blog.lancedb.com/accelerate-vector-search-applications-using-openvino-lancedb/ - -[zsic_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/zero-shot-image-classification -[zsic_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/zero-shot-image-classification/main.ipynb -[zsic_ghost]: https://blog.lancedb.com/zero-shot-image-classification-with-vector-search/ - - - - - diff --git a/docs/src/examples/serverless_lancedb_with_s3_and_lambda.md b/docs/src/examples/serverless_lancedb_with_s3_and_lambda.md deleted file mode 100644 index a63b4bae..00000000 --- a/docs/src/examples/serverless_lancedb_with_s3_and_lambda.md +++ /dev/null @@ -1,106 +0,0 @@ -# Serverless LanceDB - -## Store your data on S3 and use Lambda to compute embeddings and retrieve queries in production easily. - -s3-lambda - -This is a great option if you're wanting to scale with your use case and save effort and costs of maintenance. - -Let's walk through how to get a simple Lambda function that queries the SIFT dataset on S3. - -Before we start, you'll need to ensure you create a secure account access to AWS. We recommend using user policies, as this way AWS can share credentials securely without you having to pass around environment variables into Lambda. - -We'll also use a container to ship our Lambda code. This is a good option for Lambda as you don't have the space limits that you would otherwise by building a package yourself. - -# Initial setup: creating a LanceDB Table and storing it remotely on S3 - -We'll use the SIFT vector dataset as an example. 
To make it easier, we've already made a Lance-format SIFT dataset publicly available, which we can access and use to populate our LanceDB Table. - -To do this, download the Lance files locally first from: - -``` -s3://eto-public/datasets/sift/vec_data.lance -``` - -Then, we can write a quick Python script to populate our LanceDB Table: - -```python -import lance -sift_dataset = lance.dataset("/path/to/local/vec_data.lance") -df = sift_dataset.to_table().to_pandas() - -import lancedb -db = lancedb.connect(".") -table = db.create_table("vector_example", df) -``` - -Once we've created our Table, we are free to move this data over to S3 so we can remotely host it. - -# Building our Lambda app: a simple event handler for vector search - -Now that we've got a remotely hosted LanceDB Table, we'll want to be able to query it from Lambda. To do so, let's create a new `Dockerfile` using the AWS python container base: - -```docker -FROM public.ecr.aws/lambda/python:3.10 - -RUN pip3 install --upgrade pip -RUN pip3 install --no-cache-dir -U numpy --target "${LAMBDA_TASK_ROOT}" -RUN pip3 install --no-cache-dir -U lancedb --target "${LAMBDA_TASK_ROOT}" - -COPY app.py ${LAMBDA_TASK_ROOT} - -CMD [ "app.handler" ] -``` - -Now let's make a simple Lambda function that queries the SIFT dataset in `app.py`. 
- -```python -import json -import numpy as np -import lancedb - -db = lancedb.connect("s3://eto-public/tables") -table = db.open_table("vector_example") - -def handler(event, context): - status_code = 200 - - if event['query_vector'] is None: - status_code = 404 - return { - "statusCode": status_code, - "headers": { - "Content-Type": "application/json" - }, - "body": json.dumps({ - "Error ": "No vector to query was issued" - }) - } - - # Shape of SIFT is (128,1M), d=float32 - query_vector = np.array(event['query_vector'], dtype=np.float32) - - rs = table.search(query_vector).limit(2).to_list() - - return { - "statusCode": status_code, - "headers": { - "Content-Type": "application/json" - }, - "body": json.dumps(rs) - } -``` - -# Deploying the container to ECR - -The next step is to build and push the container to ECR, where it can then be used to create a new Lambda function. - -It's best to follow the official AWS documentation for how to do this, which you can view here: - -``` -https://docs.aws.amazon.com/lambda/latest/dg/images-create.html#images-upload -``` - -# Final step: setting up your Lambda function - -Once the container is pushed, you can create a Lambda function by selecting the container. diff --git a/docs/src/examples/serverless_qa_bot_with_modal_and_langchain.md b/docs/src/examples/serverless_qa_bot_with_modal_and_langchain.md deleted file mode 100644 index 82c7396b..00000000 --- a/docs/src/examples/serverless_qa_bot_with_modal_and_langchain.md +++ /dev/null @@ -1,166 +0,0 @@ -# Serverless QA Bot with Modal and LangChain - -## use LanceDB's LangChain integration with Modal to run a serverless app - -modal - -We're going to build a QA bot for your documentation using LanceDB's LangChain integration and use Modal for deployment. - -Modal is an end-to-end compute platform for model inference, batch jobs, task queues, web apps and more. It's a great way to deploy your LanceDB models and apps. 
- -To get started, ensure that you have created an account and logged into [Modal](https://modal.com/). To follow along, the full source code is available on Github [here](https://github.com/lancedb/lancedb/blob/main/docs/src/examples/modal_langchain.py). - -### Setting up Modal - -We'll start by specifying our dependencies and creating a new Modal `Stub`: - -```python -lancedb_image = Image.debian_slim().pip_install( - "lancedb", - "langchain", - "openai", - "pandas", - "tiktoken", - "unstructured", - "tabulate" -) - -stub = Stub( - name="example-langchain-lancedb", - image=lancedb_image, - secrets=[Secret.from_name("my-openai-secret")], -) -``` - -We're using Modal's Secrets injection to secure our OpenAI key. To set your own, you can access the Modal UI and enter your key. - -### Setting up caches for LanceDB and LangChain - -Next, we can setup some globals to cache our LanceDB database, as well as our LangChain docsource: - -```python -docsearch = None -docs_path = Path("docs.pkl") -db_path = Path("lancedb") -``` - -### Downloading our dataset - -We're going use a pregenerated dataset, which stores HTML files of the Pandas 2.0 documentation. -You could switch this out for your own dataset. - -```python -def download_docs(): - pandas_docs = requests.get("https://eto-public.s3.us-west-2.amazonaws.com/datasets/pandas_docs/pandas.documentation.zip") - with open(Path("pandas.documentation.zip"), "wb") as f: - f.write(pandas_docs.content) - - file = zipfile.ZipFile(Path("pandas.documentation.zip")) - file.extractall(path=Path("pandas_docs")) -``` - -### Pre-processing the dataset and generating metadata - -Once we've downloaded it, we want to parse and pre-process them using LangChain, and then vectorize them and store it in LanceDB. -Let's first create a function that uses LangChains `UnstructuredHTMLLoader` to parse them. -We can then add our own metadata to it and store it alongside the data, we'll later be able to use this for filtering metadata. 
- -```python -def store_docs(): - docs = [] - - if not docs_path.exists(): - for p in Path("pandas_docs/pandas.documentation").rglob("*.html"): - if p.is_dir(): - continue - loader = UnstructuredHTMLLoader(p) - raw_document = loader.load() - - m = {} - m["title"] = get_document_title(raw_document[0]) - m["version"] = "2.0rc0" - raw_document[0].metadata = raw_document[0].metadata | m - raw_document[0].metadata["source"] = str(raw_document[0].metadata["source"]) - docs = docs + raw_document - - with docs_path.open("wb") as fh: - pickle.dump(docs, fh) - else: - with docs_path.open("rb") as fh: - docs = pickle.load(fh) - - return docs -``` - -### Simple LangChain chain for a QA bot - -Now we can create a simple LangChain chain for our QA bot. We'll use the `RecursiveCharacterTextSplitter` to split our documents into chunks, and then use the `OpenAIEmbeddings` to vectorize them. - -Lastly, we'll create a LanceDB table and store the vectorized documents in it, then create a `RetrievalQA` model from the chain and return it. 
- -```python -def qanda_langchain(query): - download_docs() - docs = store_docs() - - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=1000, - chunk_overlap=200, - ) - documents = text_splitter.split_documents(docs) - embeddings = OpenAIEmbeddings() - - db = lancedb.connect(db_path) - table = db.create_table("pandas_docs", data=[ - {"vector": embeddings.embed_query("Hello World"), "text": "Hello World", "id": "1"} - ], mode="overwrite") - docsearch = LanceDB.from_documents(documents, embeddings, connection=table) - qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=docsearch.as_retriever()) - return qa.run(query) -``` - -### Creating our Modal entry points - -Now we can create our Modal entry points for our CLI and web endpoint: - -```python -@stub.function() -@web_endpoint(method="GET") -def web(query: str): - answer = qanda_langchain(query) - return { - "answer": answer, - } - -@stub.function() -def cli(query: str): - answer = qanda_langchain(query) - print(answer) -``` - -# Testing it out! - -Testing the CLI: - -```bash -modal run modal_langchain.py --query "What are the major differences in pandas 2.0?" -``` - -Testing the web endpoint: - -```bash -modal serve modal_langchain.py -``` - -In the CLI, Modal will provide you a web endpoint. Copy this endpoint URI for the next step. -Once this is served, then we can hit it with `curl`. - -Note, the first time this runs, it will take a few minutes to download the dataset and vectorize it. -An actual production example would pre-cache/load the dataset and vectorized documents prior - -```bash -curl --get --data-urlencode "query=What are the major differences in pandas 2.0?" 
https://your-modal-endpoint-app.modal.run - -{"answer":" The major differences in pandas 2.0 include the ability to use any numpy numeric dtype in a Index, installing optional dependencies with pip extras, and enhancements, bug fixes, and performance improvements."} -``` - diff --git a/docs/src/examples/serverless_website_chatbot.md b/docs/src/examples/serverless_website_chatbot.md deleted file mode 100644 index 7e763818..00000000 --- a/docs/src/examples/serverless_website_chatbot.md +++ /dev/null @@ -1,61 +0,0 @@ -# LanceDB Chatbot - Vercel Next.js Template -Use an AI chatbot with website context retrieved from a vector store like LanceDB. LanceDB is lightweight and can be embedded directly into Next.js, with data stored on-prem. - -## One click deploy on Vercel -[![Deploy with Vercel](https://vercel.com/button)](https://vercel.com/new/clone?repository-url=https%3A%2F%2Fgithub.com%2Flancedb%2Flancedb-vercel-chatbot&env=OPENAI_API_KEY&envDescription=OpenAI%20API%20Key%20for%20chat%20completion.&project-name=lancedb-vercel-chatbot&repository-name=lancedb-vercel-chatbot&demo-title=LanceDB%20Chatbot%20Demo&demo-description=Demo%20website%20chatbot%20with%20LanceDB.&demo-url=https%3A%2F%2Flancedb.vercel.app&demo-image=https%3A%2F%2Fi.imgur.com%2FazVJtvr.png) - -![Demo website landing page](../assets/vercel-template.gif) - -## Development - -First, rename `.env.example` to `.env.local`, and fill out `OPENAI_API_KEY` with your OpenAI API key. You can get one [here](https://openai.com/blog/openai-api). - -Run the development server: - -```bash -npm run dev -# or -yarn dev -# or -pnpm dev -``` - -Open [http://localhost:3000](http://localhost:3000) with your browser to see the result. - -This project uses [`next/font`](https://nextjs.org/docs/basic-features/font-optimization) to automatically optimize and load Inter, a custom Google Font. 
- -## Learn More - -To learn more about LanceDB or Next.js, take a look at the following resources: - -- [LanceDB Documentation](https://lancedb.github.io/lancedb/) - learn about LanceDB, the developer-friendly serverless vector database. -- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API. -- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial. - -## LanceDB on Next.js and Vercel - -FYI: these configurations have been pre-implemented in this template. - -Since LanceDB contains a prebuilt Node binary, you must configure `next.config.js` to exclude it from webpack. This is required for both using Next.js and deploying on Vercel. -```js -/** @type {import('next').NextConfig} */ -module.exports = ({ - webpack(config) { - config.externals.push({ vectordb: 'vectordb' }) - return config; - } -}) -``` - -To deploy on Vercel, we need to make sure that the NodeJS runtime static file analysis for Vercel can find the binary, since LanceDB uses dynamic imports by default. We can do this by modifying `package.json` in the `scripts` section. -```json -{ - ... - "scripts": { - ... - "vercel-build": "sed -i 's/nativeLib = require(`@lancedb\\/vectordb-\\${currentTarget()}`);/nativeLib = require(`@lancedb\\/vectordb-linux-x64-gnu`);/' node_modules/vectordb/native.js && next build", - ... - }, - ... -} -``` diff --git a/docs/src/examples/transformerjs_embedding_search_nodejs.md b/docs/src/examples/transformerjs_embedding_search_nodejs.md deleted file mode 100644 index e018008f..00000000 --- a/docs/src/examples/transformerjs_embedding_search_nodejs.md +++ /dev/null @@ -1,121 +0,0 @@ -# Vector embedding search using TransformersJS - -## Embed and query data from LanceDB using TransformersJS - -transformersjs - -This example shows how to use the [transformers.js](https://github.com/xenova/transformers.js) library to perform vector embedding search using LanceDB's Javascript API. 
- - -### Setting up -First, install the dependencies: -```bash -npm install vectordb -npm i @xenova/transformers -``` - -We will also be using the [all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2) model to make it compatible with Transformers.js - -Within our `index.js` file we will import the necessary libraries and define our model and database: - -```javascript -const lancedb = require('vectordb') -const { pipeline } = await import('@xenova/transformers') -const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2'); -``` - -### Creating the embedding function - -Next, we will create a function that will take in a string and return the vector embedding of that string. We will use the `pipe` function we defined earlier to get the vector embedding of the string. - -```javascript -// Define the function. `sourceColumn` is required for LanceDB to know -// which column to use as input. -const embed_fun = {} -embed_fun.sourceColumn = 'text' -embed_fun.embed = async function (batch) { - let result = [] - // Given a batch of strings, we will use the `pipe` function to get - // the vector embedding of each string. - for (let text of batch) { - // 'mean' pooling and normalizing allows the embeddings to share the - // same length. - const res = await pipe(text, { pooling: 'mean', normalize: true }) - result.push(Array.from(res['data'])) - } - return (result) -} -``` - -### Creating the database - -Now, we will create the LanceDB database and add the embedding function we defined earlier. - -```javascript -// Link a folder and create a table with data -const db = await lancedb.connect('data/sample-lancedb') - -// You can also import any other data, but make sure that you have a column -// for the embedding function to use. 
-const data = [ - { id: 1, text: 'Cherry', type: 'fruit' }, - { id: 2, text: 'Carrot', type: 'vegetable' }, - { id: 3, text: 'Potato', type: 'vegetable' }, - { id: 4, text: 'Apple', type: 'fruit' }, - { id: 5, text: 'Banana', type: 'fruit' } -] - -// Create the table with the embedding function -const table = await db.createTable('food_table', data, "create", embed_fun) -``` - -### Performing the search - -Now, we can perform the search using the `search` function. LanceDB automatically uses the embedding function we defined earlier to get the vector embedding of the query string. - -```javascript -// Query the table -const results = await table - .search("a sweet fruit to eat") - .metricType("cosine") - .limit(2) - .execute() -console.log(results.map(r => r.text)) -``` -```bash -[ 'Banana', 'Cherry' ] -``` - -Output of `results`: -```bash -[ - { - vector: Float32Array(384) [ - -0.057455405592918396, - 0.03617725893855095, - -0.0367760956287384, - ... 381 more items - ], - id: 5, - text: 'Banana', - type: 'fruit', - _distance: 0.4919965863227844 - }, - { - vector: Float32Array(384) [ - 0.0009714411571621895, - 0.008223623037338257, - 0.009571489877998829, - ... 381 more items - ], - id: 1, - text: 'Cherry', - type: 'fruit', - _distance: 0.5540297031402588 - } -] -``` - -### Wrapping it up - -In this example, we showed how to use the `transformers.js` library to perform vector embedding search using LanceDB's Javascript API. You can find the full code for this example on [Github](https://github.com/lancedb/lancedb/blob/main/node/examples/js-transformers/index.js)! 
diff --git a/docs/src/examples/youtube_transcript_bot.md b/docs/src/examples/youtube_transcript_bot.md deleted file mode 100644 index f000359a..00000000 --- a/docs/src/examples/youtube_transcript_bot.md +++ /dev/null @@ -1,13 +0,0 @@ -# YouTube transcript search - -## Search through youtube transcripts using natural language with LanceDB - -youtube transcript search - - -Open In Colab - -Scripts - [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](https://github.com/lancedb/vectordb-recipesexamples/youtube_bot/main.py) [![JavaScript](https://img.shields.io/badge/javascript-%23323330.svg?style=for-the-badge&logo=javascript&logoColor=%23F7DF1E)](https://github.com/lancedb/vectordb-recipes/examples/youtube_bot/index.js) - - -This example is in a [notebook](https://github.com/lancedb/lancedb/blob/main/docs/src/notebooks/youtube_transcript_search.ipynb) diff --git a/docs/src/examples/youtube_transcript_bot_with_nodejs.md b/docs/src/examples/youtube_transcript_bot_with_nodejs.md deleted file mode 100644 index d85c29eb..00000000 --- a/docs/src/examples/youtube_transcript_bot_with_nodejs.md +++ /dev/null @@ -1,139 +0,0 @@ -# YouTube transcript QA bot with NodeJS - -## use LanceDB's Javascript API and OpenAI to build a QA bot for YouTube transcripts - -nodejs - -This Q&A bot will allow you to search through youtube transcripts using natural language! We'll introduce how to use LanceDB's Javascript API to store and manage your data easily. - -```bash -npm install vectordb -``` - -## Download the data - -For this example, we're using a sample of a HuggingFace dataset that contains YouTube transcriptions: `jamescalam/youtube-transcriptions`. Download and extract this file under the `data` folder: - -```bash -wget -c https://eto-public.s3.us-west-2.amazonaws.com/datasets/youtube_transcript/youtube-transcriptions_sample.jsonl -``` - -## Prepare Context - -Each item in the dataset contains just a short chunk of text. 
We'll need to merge a bunch of these chunks together on a rolling basis. For this demo, we'll look back 20 records to create a more complete context for each sentence. - -First, we need to read and parse the input file. - -```javascript -const lines = (await fs.readFile(INPUT_FILE_NAME, 'utf-8')) - .toString() - .split('\n') - .filter(line => line.length > 0) - .map(line => JSON.parse(line)) - -const data = contextualize(lines, 20, 'video_id') -``` - -The contextualize function groups the transcripts by video_id and then creates the expanded context for each item. - -```javascript -function contextualize (rows, contextSize, groupColumn) { - const grouped = [] - rows.forEach(row => { - if (!grouped[row[groupColumn]]) { - grouped[row[groupColumn]] = [] - } - grouped[row[groupColumn]].push(row) - }) - - const data = [] - Object.keys(grouped).forEach(key => { - for (let i = 0; i < grouped[key].length; i++) { - const start = i - contextSize > 0 ? i - contextSize : 0 - grouped[key][i].context = grouped[key].slice(start, i + 1).map(r => r.text).join(' ') - } - data.push(...grouped[key]) - }) - return data -} -``` - -## Create the LanceDB Table - -To load our data into LanceDB, we need to create embedding (vectors) for each item. For this example, we will use the OpenAI embedding functions, which have a native integration with LanceDB. - -```javascript -// You need to provide an OpenAI API key, here we read it from the OPENAI_API_KEY environment variable -const apiKey = process.env.OPENAI_API_KEY -// The embedding function will create embeddings for the 'context' column -const embedFunction = new lancedb.OpenAIEmbeddingFunction('context', apiKey) -// Connects to LanceDB -const db = await lancedb.connect('data/youtube-lancedb') -const tbl = await db.createTable('vectors', data, embedFunction) -``` - -## Create and answer the prompt - -We will accept questions in natural language and use our corpus stored in LanceDB to answer them. 
First, we need to set up the OpenAI client: - -```javascript -const configuration = new Configuration({ apiKey }) -const openai = new OpenAIApi(configuration) -``` - -Then we can prompt questions and use LanceDB to retrieve the three most relevant transcripts for this prompt. - -```javascript -const query = await rl.question('Prompt: ') -const results = await tbl - .search(query) - .select(['title', 'text', 'context']) - .limit(3) - .execute() -``` - -The query and the transcripts' context are appended together in a single prompt: - -```javascript -function createPrompt (query, context) { - let prompt = - 'Answer the question based on the context below.\n\n' + - 'Context:\n' - - // need to make sure our prompt is not larger than max size - prompt = prompt + context.map(c => c.context).join('\n\n---\n\n').substring(0, 3750) - prompt = prompt + `\n\nQuestion: ${query}\nAnswer:` - return prompt -} -``` - -We can now use the OpenAI Completion API to process our custom prompt and give us an answer. - -```javascript -const response = await openai.createCompletion({ - model: 'text-davinci-003', - prompt: createPrompt(query, results), - max_tokens: 400, - temperature: 0, - top_p: 1, - frequency_penalty: 0, - presence_penalty: 0 -}) -console.log(response.data.choices[0].text) -``` - -## Let's put it all together now - -Now we can provide queries and have them answered based on your local LanceDB data. - -```bash -Prompt: who was the 12th person on the moon and when did they land? - The 12th person on the moon was Harrison Schmitt and he landed on December 11, 1972. -Prompt: Which training method should I use for sentence transformers when I only have pairs of related sentences? - NLI with multiple negative ranking loss. -``` - -## That's a wrap - -In this example, you learned how to use LanceDB to store and query embedding representations of your local data. 
The complete example code is on [GitHub](https://github.com/lancedb/lancedb/tree/main/node/examples), and you can also download the LanceDB dataset using [this link](https://eto-public.s3.us-west-2.amazonaws.com/datasets/youtube_transcript/youtube-lancedb.zip). - diff --git a/docs/src/extra_js/init_ask_ai_widget.js b/docs/src/extra_js/init_ask_ai_widget.js deleted file mode 100644 index 02e7ce37..00000000 --- a/docs/src/extra_js/init_ask_ai_widget.js +++ /dev/null @@ -1,79 +0,0 @@ -// Creates an SVG robot icon (from Lucide) -function robotSVG() { - var svg = document.createElementNS("http://www.w3.org/2000/svg", "svg"); - svg.setAttribute("width", "24"); - svg.setAttribute("height", "24"); - svg.setAttribute("viewBox", "0 0 24 24"); - svg.setAttribute("fill", "none"); - svg.setAttribute("stroke", "currentColor"); - svg.setAttribute("stroke-width", "2"); - svg.setAttribute("stroke-linecap", "round"); - svg.setAttribute("stroke-linejoin", "round"); - svg.setAttribute("class", "lucide lucide-bot-message-square"); - - var path1 = document.createElementNS("http://www.w3.org/2000/svg", "path"); - path1.setAttribute("d", "M12 6V2H8"); - svg.appendChild(path1); - - var path2 = document.createElementNS("http://www.w3.org/2000/svg", "path"); - path2.setAttribute("d", "m8 18-4 4V8a2 2 0 0 1 2-2h12a2 2 0 0 1 2 2v8a2 2 0 0 1-2 2Z"); - svg.appendChild(path2); - - var path3 = document.createElementNS("http://www.w3.org/2000/svg", "path"); - path3.setAttribute("d", "M2 12h2"); - svg.appendChild(path3); - - var path4 = document.createElementNS("http://www.w3.org/2000/svg", "path"); - path4.setAttribute("d", "M9 11v2"); - svg.appendChild(path4); - - var path5 = document.createElementNS("http://www.w3.org/2000/svg", "path"); - path5.setAttribute("d", "M15 11v2"); - svg.appendChild(path5); - - var path6 = document.createElementNS("http://www.w3.org/2000/svg", "path"); - path6.setAttribute("d", "M20 12h2"); - svg.appendChild(path6); - - return svg -} - -// Creates the Fluidic Chatbot 
buttom -function fluidicButton() { - var btn = document.createElement("a"); - btn.href = "https://asklancedb.com"; - btn.target = "_blank"; - btn.style.position = "fixed"; - btn.style.fontWeight = "bold"; - btn.style.fontSize = ".8rem"; - btn.style.right = "10px"; - btn.style.bottom = "10px"; - btn.style.width = "80px"; - btn.style.height = "80px"; - btn.style.background = "linear-gradient(135deg, #7C5EFF 0%, #625eff 100%)"; - btn.style.color = "white"; - btn.style.borderRadius = "5px"; - btn.style.display = "flex"; - btn.style.flexDirection = "column"; - btn.style.justifyContent = "center"; - btn.style.alignItems = "center"; - btn.style.zIndex = "1000"; - btn.style.opacity = "0"; - btn.style.boxShadow = "0 0 0 rgba(0, 0, 0, 0)"; - btn.style.transition = "opacity 0.2s ease-in, box-shadow 0.2s ease-in"; - - setTimeout(function() { - btn.style.opacity = "1"; - btn.style.boxShadow = "0 0 .2rem #0000001a,0 .2rem .4rem #0003" - }, 0); - - return btn -} - -document.addEventListener("DOMContentLoaded", function() { - var btn = fluidicButton() - btn.appendChild(robotSVG()); - var text = document.createTextNode("Ask AI"); - btn.appendChild(text); - document.body.appendChild(btn); -}); diff --git a/docs/src/faq.md b/docs/src/faq.md deleted file mode 100644 index 4eb2583f..00000000 --- a/docs/src/faq.md +++ /dev/null @@ -1,87 +0,0 @@ -This section covers some common questions and issues that you may encounter when using LanceDB. - -### Is LanceDB open source? - -Yes, LanceDB is an open source vector database available under an Apache 2.0 license. We also have a serverless SaaS solution, LanceDB Cloud, available under a commercial license. - -### What is the difference between Lance and LanceDB? - -[Lance](https://github.com/lancedb/lance) is a modern columnar data format for AI, written in Rust 🦀. It’s perfect for building search engines, feature stores and being the foundation of large-scale ML training jobs requiring high performance IO and shuffles. 
It also has native support for storing, querying, and inspecting deeply nested data for robotics or large blobs like images, point clouds, and more. - -LanceDB is the vector database that’s built on top of Lance, and utilizes the underlying optimized storage format to build efficient disk-based indexes that power semantic search & retrieval applications, from RAGs to QA Bots to recommender systems. - -### Why invent another data format instead of using Parquet? - -As we mention in our talk titled “[Lance, a modern columnar data format](https://www.youtube.com/watch?v=ixpbVyrsuL8)”, Parquet and other tabular formats that derive from it are rather dated (Parquet is over 10 years old), especially when it comes to random access on vectors. We needed a format that’s able to handle the complex trade-offs involved in shuffling, scanning, OLAP and filtering large datasets involving vectors, and our extensive experiments with Parquet didn't yield sufficient levels of performance for modern ML. [Our benchmarks](https://blog.lancedb.com/benchmarking-random-access-in-lance-ed690757a826) show that Lance is up to 1000x faster than Parquet for random access, which we believe justifies our decision to create a new data format for AI. - -### Why build in Rust? 🦀 - -We believe that the Rust ecosystem has attained mainstream maturity and that Rust will form the underpinnings of large parts of the data and ML landscape in a few years. Performance, latency and reliability are paramount to a vector DB, and building in Rust allows us to iterate and release updates more rapidly due to Rust’s safety guarantees. Both Lance (the data format) and LanceDB (the database) are written entirely in Rust. We also provide Python, JavaScript, and Rust client libraries to interact with the database. - -### What is the difference between LanceDB OSS and LanceDB Cloud? - -LanceDB OSS is an **embedded** (in-process) solution that can be used as the vector store of choice for your LLM and RAG applications. 
It can be embedded inside an existing application backend, or used in-process alongside existing ML and data engineering pipelines. - -LanceDB Cloud is a **serverless** solution — the database and data sit on the cloud and we manage the scalability of the application side via a remote client, without the need to manage any infrastructure. - -Both flavors of LanceDB benefit from the blazing fast Lance data format and are built on the same open source foundations. - -### What makes LanceDB different? - -LanceDB is among the few embedded vector DBs out there that we believe can unlock a whole new class of LLM-powered applications in the browser or via edge functions. Lance’s multi-modal nature allows you to store the raw data, metadata and the embeddings all at once, unlike other solutions that typically store just the embeddings and metadata. - -The Lance data format that powers our storage system also provides true zero-copy access and seamless interoperability with numerous other data formats (like Pandas, Polars, Pydantic) via Apache Arrow, as well as automatic data versioning and data management without needing extra infrastructure. - -### How large of a dataset can LanceDB handle? - -LanceDB and its underlying data format, Lance, are built to scale to really large amounts of data (hundreds of terabytes). We are currently working with customers who regularly perform operations on 200M+ vectors, and we’re fast approaching billion scale and beyond, which are well-handled by our disk-based indexes, without you having to break the bank. - -### Do I need to build an ANN index to run vector search? - -No. LanceDB is blazing fast (due to its disk-based index) for even brute force kNN search, within reason. In our benchmarks, computing 100K pairs of 1000-dimension vectors takes less than 20ms. For small datasets of ~100K records or applications that can accept ~100ms latency, an ANN index is usually not necessary. 
- -For large-scale (>1M) or higher dimension vectors, it is beneficial to create an ANN index. See the [ANN indexes](ann_indexes.md) section for more details. - -### Does LanceDB support full-text search? - -Yes, LanceDB supports full-text search (FTS) via [Tantivy](https://github.com/quickwit-oss/tantivy). Our current FTS integration is Python-only, and our goal is to push it down to the Rust level in future versions to enable much more powerful search capabilities available to our Python, JavaScript and Rust clients. Follow along in the [Github issue](https://github.com/lancedb/lance/issues/1195) - -### How can I speed up data inserts? - -It's highly recommend to perform bulk inserts via batches (for e.g., Pandas DataFrames or lists of dicts in Python) to speed up inserts for large datasets. Inserting records one at a time is slow and can result in suboptimal performance because each insert creates a new data fragment on disk. Batching inserts allows LanceDB to create larger fragments (and their associated manifests), which are more efficient to read and write. - -### Do I need to set a refine factor when using an index? - -Yes. LanceDB uses PQ, or Product Quantization, to compress vectors and speed up search when using an ANN index. However, because PQ is a lossy compression algorithm, it tends to reduce recall while also reducing the index size. To address this trade-off, we introduce a process called **refinement**. The normal process computes distances by operating on the compressed PQ vectors. The refinement factor (*rf*) is a multiplier that takes the top-k similar PQ vectors to a given query, fetches `rf * k` *full* vectors and computes the raw vector distances between them and the query vector, reordering the top-k results based on these scores instead. 
- -For example, if you're retrieving the top 10 results and set `refine_factor` to 25, LanceDB will fetch the 250 most similar vectors (according to PQ), compute the distances again based on the full vectors for those 250 and then re-rank based on their scores. This can significantly improve recall, with a small added latency cost (typically a few milliseconds), so it's recommended you set a `refine_factor` of anywhere between 5-50 and measure its impact on latency prior to deploying your solution. - -### How can I improve IVF-PQ recall while keeping latency low? - -When using an IVF-PQ index, there's a trade-off between recall and latency at query time. You can improve recall by increasing the number of probes and the `refine_factor`. In our benchmark on the GIST-1M dataset, we show that it's possible to achieve >0.95 recall with a latency of under 10 ms on most systems, using ~50 probes and a `refine_factor` of 50. This is, of course, subject to the dataset at hand and a quick sensitivity study can be performed on your own data. You can find more details on the benchmark in our [blog post](https://blog.lancedb.com/benchmarking-lancedb-92b01032874a). - -![](assets/recall-vs-latency.webp) - -### How do I connect to MinIO? - -MinIO supports an S3 compatible API. In order to connect to a MinIO instance, you need to: - -- Set the envvar `AWS_ENDPOINT` to the URL of your MinIO API -- Set the envvars `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` with your MinIO credential -- Call `lancedb.connect("s3://minio_bucket_name")` - -### Where can I find benchmarks for LanceDB? - -Refer to this [post](https://blog.lancedb.com/benchmarking-lancedb-92b01032874a) for recent benchmarks. - -### How much data can LanceDB practically manage without effecting performance? - -We target good performance on ~10-50 billion rows and ~10-30 TB of data. - -### Does LanceDB support concurrent operations? - -LanceDB can handle concurrent reads very well, and can scale horizontally. 
The main constraint is how well the [storage layer](https://lancedb.github.io/lancedb/concepts/storage/) you've chosen scales. For writes, we support concurrent writing, though too many concurrent writers can lead to failing writes as there is a limited number of times a writer retries a commit - -!!! info "Multiprocessing with LanceDB" - - For multiprocessing you should probably not use ```fork``` as lance is multi-threaded internally and ```fork``` and multi-thread do not work well.[Refer to this discussion](https://discuss.python.org/t/concerns-regarding-deprecation-of-fork-with-alive-threads/33555) diff --git a/docs/src/fts.md b/docs/src/fts.md deleted file mode 100644 index cfc719d6..00000000 --- a/docs/src/fts.md +++ /dev/null @@ -1,258 +0,0 @@ -# Full-text search (Native FTS) - -LanceDB provides support for full-text search via Lance, allowing you to incorporate keyword-based search (based on BM25) in your retrieval solutions. - -!!! note - The Python SDK uses tantivy-based FTS by default, need to pass `use_tantivy=False` to use native FTS. - -## Example - -Consider that we have a LanceDB table named `my_table`, whose string column `text` we want to index and query via keyword search, the FTS index must be created before you can search via keywords. 
- -=== "Python" - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:import-lancedb" - --8<-- "python/python/tests/docs/test_search.py:import-lancedb-fts" - --8<-- "python/python/tests/docs/test_search.py:basic_fts" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:import-lancedb" - --8<-- "python/python/tests/docs/test_search.py:import-lancedb-fts" - --8<-- "python/python/tests/docs/test_search.py:basic_fts_async" - ``` - -=== "TypeScript" - - ```typescript - import * as lancedb from "@lancedb/lancedb"; - const uri = "data/sample-lancedb" - const db = await lancedb.connect(uri); - - const data = [ - { vector: [3.1, 4.1], text: "Frodo was a happy puppy" }, - { vector: [5.9, 26.5], text: "There are several kittens playing" }, - ]; - const tbl = await db.createTable("my_table", data, { mode: "overwrite" }); - await tbl.createIndex("text", { - config: lancedb.Index.fts(), - }); - - await tbl - .search("puppy", "fts") - .select(["text"]) - .limit(10) - .toArray(); - ``` - -=== "Rust" - - ```rust - let uri = "data/sample-lancedb"; - let db = connect(uri).execute().await?; - let initial_data: Box = create_some_records()?; - let tbl = db - .create_table("my_table", initial_data) - .execute() - .await?; - tbl - .create_index(&["text"], Index::FTS(FtsIndexBuilder::default())) - .execute() - .await?; - - tbl - .query() - .full_text_search(FullTextSearchQuery::new("puppy".to_owned())) - .select(lancedb::query::Select::Columns(vec!["text".to_owned()])) - .limit(10) - .execute() - .await?; - ``` - -It would search on all indexed columns by default, so it's useful when there are multiple indexed columns. - -Passing `fts_columns="text"` if you want to specify the columns to search. - -!!! note - LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead. 
- -## Tokenization -By default the text is tokenized by splitting on punctuation and whitespaces, and would filter out words that are with length greater than 40, and lowercase all words. - -Stemming is useful for improving search results by reducing words to their root form, e.g. "running" to "run". LanceDB supports stemming for multiple languages, you can specify the tokenizer name to enable stemming by the pattern `tokenizer_name="{language_code}_stem"`, e.g. `en_stem` for English. - -For example, to enable stemming for English: -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:fts_config_stem" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:fts_config_stem_async" - ``` - -the following [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) are currently supported. - -The tokenizer is customizable, you can specify how the tokenizer splits the text, and how it filters out words, etc. - -For example, for language with accents, you can specify the tokenizer to use `ascii_folding` to remove accents, e.g. 'é' to 'e': -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:fts_config_folding" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:fts_config_folding_async" - ``` - -## Filtering - -LanceDB full text search supports to filter the search results by a condition, both pre-filtering and post-filtering are supported. - -This can be invoked via the familiar `where` syntax. 
- -With pre-filtering: -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:fts_prefiltering" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:fts_prefiltering_async" - ``` - -=== "TypeScript" - - ```typescript - await tbl - .search("puppy") - .select(["id", "doc"]) - .limit(10) - .where("meta='foo'") - .prefilter(true) - .toArray(); - ``` - -=== "Rust" - - ```rust - table - .query() - .full_text_search(FullTextSearchQuery::new("puppy".to_owned())) - .select(lancedb::query::Select::Columns(vec!["doc".to_owned()])) - .limit(10) - .only_if("meta='foo'") - .execute() - .await?; - ``` - -With post-filtering: -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:fts_postfiltering" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:fts_postfiltering_async" - ``` - -=== "TypeScript" - - ```typescript - await tbl - .search("apple") - .select(["id", "doc"]) - .limit(10) - .where("meta='foo'") - .prefilter(false) - .toArray(); - ``` - -=== "Rust" - - ```rust - table - .query() - .full_text_search(FullTextSearchQuery::new(words[0].to_owned())) - .select(lancedb::query::Select::Columns(vec!["doc".to_owned()])) - .postfilter() - .limit(10) - .only_if("meta='foo'") - .execute() - .await?; - ``` - -## Phrase queries vs. terms queries - -!!! warning "Warn" - Lance-based FTS doesn't support queries using boolean operators `OR`, `AND`. - -For full-text search you can specify either a **phrase** query like `"the old man and the sea"`, -or a **terms** search query like `old man sea`. For more details on the terms -query syntax, see Tantivy's [query parser rules](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html). 
- -To search for a phrase, the index must be created with `with_position=True`: -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:fts_with_position" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:fts_with_position_async" - ``` -This will allow you to search for phrases, but it will also significantly increase the index size and indexing time. - - -## Incremental indexing - -LanceDB supports incremental indexing, which means you can add new records to the table without reindexing the entire table. - -This can make the query more efficient, especially when the table is large and the new records are relatively small. - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:fts_incremental_index" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:fts_incremental_index_async" - ``` - -=== "TypeScript" - - ```typescript - await tbl.add([{ vector: [3.1, 4.1], text: "Frodo was a happy puppy" }]); - await tbl.optimize(); - ``` - -=== "Rust" - - ```rust - let more_data: Box = create_some_records()?; - tbl.add(more_data).execute().await?; - tbl.optimize(OptimizeAction::All).execute().await?; - ``` -!!! note - - New data added after creating the FTS index will appear in search results while incremental index is still progress, but with increased latency due to a flat search on the unindexed portion. LanceDB Cloud automates this merging process, minimizing the impact on search speed. \ No newline at end of file diff --git a/docs/src/fts_tantivy.md b/docs/src/fts_tantivy.md deleted file mode 100644 index 1c65db30..00000000 --- a/docs/src/fts_tantivy.md +++ /dev/null @@ -1,160 +0,0 @@ -# Full-text search (Tantivy-based FTS) - -LanceDB also provides support for full-text search via [Tantivy](https://github.com/quickwit-oss/tantivy), allowing you to incorporate keyword-based search (based on BM25) in your retrieval solutions. 
- -The tantivy-based FTS is only available in Python synchronous APIs and does not support building indexes on object storage or incremental indexing. If you need these features, try native FTS [native FTS](fts.md). - -## Installation - -To use full-text search, install the dependency [`tantivy-py`](https://github.com/quickwit-oss/tantivy-py): - -```sh -# Say you want to use tantivy==0.20.1 -pip install tantivy==0.20.1 -``` - -## Example - -Consider that we have a LanceDB table named `my_table`, whose string column `content` we want to index and query via keyword search, the FTS index must be created before you can search via keywords. - -```python -import lancedb - -uri = "data/sample-lancedb" -db = lancedb.connect(uri) - -table = db.create_table( - "my_table", - data=[ - {"id": 1, "vector": [3.1, 4.1], "title": "happy puppy", "content": "Frodo was a happy puppy", "meta": "foo"}, - {"id": 2, "vector": [5.9, 26.5], "title": "playing kittens", "content": "There are several kittens playing around the puppy", "meta": "bar"}, - ], -) - -# passing `use_tantivy=False` to use lance FTS index -# `use_tantivy=True` by default -table.create_fts_index("content", use_tantivy=True) -table.search("puppy").limit(10).select(["content"]).to_list() -# [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}] -# ... -``` - -It would search on all indexed columns by default, so it's useful when there are multiple indexed columns. - -!!! note - LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead. - -## Tokenization -By default the text is tokenized by splitting on punctuation and whitespaces and then removing tokens that are longer than 40 chars. For more language specific tokenization then provide the argument tokenizer_name with the 2 letter language code followed by "_stem". So for english it would be "en_stem". 
- -```python -table.create_fts_index("content", use_tantivy=True, tokenizer_name="en_stem", replace=True) -``` - -the following [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) are currently supported. - -## Index multiple columns - -If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`: - -```python -table.create_fts_index(["title", "content"], use_tantivy=True, replace=True) -``` - -Note that the search API call does not change - you can search over all indexed columns at once. - -## Filtering - -Currently the LanceDB full text search feature supports *post-filtering*, meaning filters are -applied on top of the full text search results (see [native FTS](fts.md) if you need pre-filtering). This can be invoked via the familiar -`where` syntax: - -```python -table.search("puppy").limit(10).where("meta='foo'").to_list() -``` - -## Sorting - -You can pre-sort the documents by specifying `ordering_field_names` when -creating the full-text search index. Once pre-sorted, you can then specify -`ordering_field_name` while searching to return results sorted by the given -field. For example, - -```python -table.create_fts_index(["content"], use_tantivy=True, ordering_field_names=["id"], replace=True) - -(table.search("puppy", ordering_field_name="id") - .limit(20) - .to_list()) -``` - -!!! note - If you wish to specify an ordering field at query time, you must also - have specified it during indexing time. Otherwise at query time, an - error will be raised that looks like `ValueError: The field does not exist: xxx` - -!!! note - The fields to sort on must be of typed unsigned integer, or else you will see - an error during indexing that looks like - `TypeError: argument 'value': 'float' object cannot be interpreted as an integer`. - -!!! note - You can specify multiple fields for ordering at indexing time. 
- But at query time only one ordering field is supported. - - -## Phrase queries vs. terms queries - -For full-text search you can specify either a **phrase** query like `"the old man and the sea"`, -or a **terms** search query like `"(Old AND Man) AND Sea"`. For more details on the terms -query syntax, see Tantivy's [query parser rules](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html). - -!!! tip "Note" - The query parser will raise an exception on queries that are ambiguous. For example, in the query `they could have been dogs OR cats`, `OR` is capitalized so it's considered a keyword query operator. But it's ambiguous how the left part should be treated. So if you submit this search query as is, you'll get `Syntax Error: they could have been dogs OR cats`. - - ```py - # This raises a syntax error - table.search("they could have been dogs OR cats") - ``` - - On the other hand, lowercasing `OR` to `or` will work, because there are no capitalized logical operators and - the query is treated as a phrase query. - - ```py - # This works! - table.search("they could have been dogs or cats") - ``` - -It can be cumbersome to have to remember what will cause a syntax error depending on the type of -query you want to perform. To make this simpler, when you want to perform a phrase query, you can -enforce it in one of two ways: - -1. Place the double-quoted query inside single quotes. For example, `table.search('"they could have been dogs OR cats"')` is treated as -a phrase query. -1. Explicitly declare the `phrase_query()` method. This is useful when you have a phrase query that -itself contains double quotes. For example, `table.search('the cats OR dogs were not really "pets" at all').phrase_query()` -is treated as a phrase query. - -In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested -double quotes replaced by single quotes. 
- - -## Configurations - -By default, LanceDB configures a 1GB heap size limit for creating the index. You can -reduce this if running on a smaller node, or increase this for faster performance while -indexing a larger corpus. - -```python -# configure a 512MB heap size -heap = 1024 * 1024 * 512 -table.create_fts_index(["title", "content"], use_tantivy=True, writer_heap_size=heap, replace=True) -``` - -## Current limitations - -1. New data added after creating the FTS index will appear in search results, but with increased latency due to a flat search on the unindexed portion. Re-indexing with `create_fts_index` will reduce latency. LanceDB Cloud automates this merging process, minimizing the impact on search speed. - -2. We currently only support local filesystem paths for the FTS index. - This is a tantivy limitation. We've implemented an object store plugin - but there's no way in tantivy-py to specify to use it. diff --git a/docs/src/guides/multi-vector.md b/docs/src/guides/multi-vector.md deleted file mode 100644 index b51f5dc3..00000000 --- a/docs/src/guides/multi-vector.md +++ /dev/null @@ -1,85 +0,0 @@ -# Late interaction & MultiVector embedding type -Late interaction is a technique used in retrieval that calculates the relevance of a query to a document by comparing their multi-vector representations. The key difference between late interaction and other popular methods: - -![late interaction vs other methods](https://raw.githubusercontent.com/lancedb/assets/b035a0ceb2c237734e0d393054c146d289792339/docs/assets/integration/colbert-blog-interaction.svg) - - -[ Illustration from https://jina.ai/news/what-is-colbert-and-late-interaction-and-why-they-matter-in-search/] - -No interaction: Refers to independently embedding the query and document, that are compared to calcualte similarity without any interaction between them. This is typically used in vector search operations. 
- -Partial interaction Refers to a specific approach where the similarity computation happens primarily between query vectors and document vectors, without extensive interaction between individual components of each. An example of this is dual-encoder models like BERT. - -Early full interaction Refers to techniques like cross-encoders that process query and docs in pairs with full interaction across various stages of encoding. This is a powerful, but relatively slower technique. Because it requires processing query and docs in pairs, doc embeddings can't be pre-computed for fast retrieval. This is why cross encoders are typically used as reranking models combined with vector search. Learn more about [LanceDB Reranking support](https://lancedb.github.io/lancedb/reranking/). - -Late interaction Late interaction is a technique that calculates the doc and query similarity independently and then the interaction or evaluation happens during the retrieval process. This is typically used in retrieval models like ColBERT. Unlike early interaction, It allows speeding up the retrieval process without compromising the depth of semantic analysis. - -## Internals of ColBERT -Let's take a look at the steps involved in performing late interaction based retrieval using ColBERT: - -• ColBERT employs BERT-based encoders for both queries `(fQ)` and documents `(fD)` -• A single BERT model is shared between query and document encoders and special tokens distinguish input types: `[Q]` for queries and `[D]` for documents - -**Query Encoder (fQ):** -• Query q is tokenized into WordPiece tokens: `q1, q2, ..., ql`. `[Q]` token is prepended right after BERT's `[CLS]` token -• If query length < Nq, it's padded with [MASK] tokens up to Nq. -• The padded sequence goes through BERT's transformer architecture -• Final embeddings are L2-normalized. - -**Document Encoder (fD):** -• Document d is tokenized into tokens `d1, d2, ..., dm`. 
`[D]` token is prepended after `[CLS]` token -• Unlike queries, documents are NOT padded with `[MASK]` tokens -• Document tokens are processed through BERT and the same linear layer - -**Late Interaction:** -• Late interaction estimates relevance score `S(q,d)` using embedding `Eq` and `Ed`. Late interaction happens after independent encoding -• For each query embedding, maximum similarity is computed against all document embeddings -• The similarity measure can be cosine similarity or squared L2 distance - -**MaxSim Calculation:** -``` -S(q,d) := Σ max(Eqi⋅EdjT) - i∈|Eq| j∈|Ed| -``` -• This finds the best matching document embedding for each query embedding -• Captures relevance based on strongest local matches between contextual embeddings - -## LanceDB MultiVector type -LanceDB supports multivector type, this is useful when you have multiple vectors for a single item (e.g. with ColBert and ColPali). - -You can index on a column with multivector type and search on it, the query can be single vector or multiple vectors. For now, only cosine metric is supported for multivector search. The vector value type can be float16, float32 or float64. LanceDB integrateds [ConteXtualized Token Retriever(XTR)](https://arxiv.org/abs/2304.01982), which introduces a simple, yet novel, objective function that encourages the model to retrieve the most important document tokens first. 
- -```python -import lancedb -import numpy as np -import pyarrow as pa - -db = lancedb.connect("data/multivector_demo") -schema = pa.schema( - [ - pa.field("id", pa.int64()), - # float16, float32, and float64 are supported - pa.field("vector", pa.list_(pa.list_(pa.float32(), 256))), - ] -) -data = [ - { - "id": i, - "vector": np.random.random(size=(2, 256)).tolist(), - } - for i in range(1024) -] -tbl = db.create_table("my_table", data=data, schema=schema) - -# only cosine similarity is supported for multi-vectors -tbl.create_index(metric="cosine") - -# query with single vector -query = np.random.random(256).astype(np.float16) -tbl.search(query).to_arrow() - -# query with multiple vectors -query = np.random.random(size=(2, 256)) -tbl.search(query).to_arrow() -``` -Find more about vector search in LanceDB [here](https://lancedb.github.io/lancedb/search/#multivector-type). diff --git a/docs/src/guides/scalar_index.md b/docs/src/guides/scalar_index.md deleted file mode 100644 index 97835394..00000000 --- a/docs/src/guides/scalar_index.md +++ /dev/null @@ -1,156 +0,0 @@ -# Building a Scalar Index - -Scalar indices organize data by scalar attributes (e.g. numbers, categorical values), enabling fast filtering of vector data. In vector databases, scalar indices accelerate the retrieval of scalar data associated with vectors, thus enhancing the query performance when searching for vectors that meet certain scalar criteria. - -Similar to many SQL databases, LanceDB supports several types of scalar indices to accelerate search -over scalar columns. - -- `BTREE`: The most common type is BTREE. The index stores a copy of the - column in sorted order. This sorted copy allows a binary search to be used to - satisfy queries. -- `BITMAP`: this index stores a bitmap for each unique value in the column. 
It - uses a series of bits to indicate whether a value is present in a row of a table -- `LABEL_LIST`: a special index that can be used on `List` columns to - support queries with `array_contains_all` and `array_contains_any` - using an underlying bitmap index. - For example, a column that contains lists of tags (e.g. `["tag1", "tag2", "tag3"]`) can be indexed with a `LABEL_LIST` index. - -!!! tips "How to choose the right scalar index type" - - `BTREE`: This index is good for scalar columns with mostly distinct values and does best when the query is highly selective. - - `BITMAP`: This index works best for low-cardinality numeric or string columns, where the number of unique values is small (i.e., less than a few thousands). - - `LABEL_LIST`: This index should be used for columns containing list-type data. - -| Data Type | Filter | Index Type | -| --------------------------------------------------------------- | ----------------------------------------- | ------------ | -| Numeric, String, Temporal | `<`, `=`, `>`, `in`, `between`, `is null` | `BTREE` | -| Boolean, numbers or strings with fewer than 1,000 unique values | `<`, `=`, `>`, `in`, `between`, `is null` | `BITMAP` | -| List of low cardinality of numbers or strings | `array_has_any`, `array_has_all` | `LABEL_LIST` | - -### Create a scalar index -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_index.py:import-lancedb-btree-bitmap" - --8<-- "python/python/tests/docs/test_guide_index.py:basic_scalar_index" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_index.py:import-lancedb-btree-bitmap" - --8<-- "python/python/tests/docs/test_guide_index.py:basic_scalar_index_async" - ``` - -=== "Typescript" - - === "@lancedb/lancedb" - - ```js - const db = await lancedb.connect("data"); - const tbl = 
await db.openTable("my_vectors"); - - await tbl.create_index("book_id"); - await tlb.create_index("publisher", { config: lancedb.Index.bitmap() }) - ``` - -The following scan will be faster if the column `book_id` has a scalar index: - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_index.py:search_with_scalar_index" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_index.py:search_with_scalar_index_async" - ``` - -=== "Typescript" - - === "@lancedb/lancedb" - - ```js - const db = await lancedb.connect("data"); - const tbl = await db.openTable("books"); - - await tbl - .query() - .where("book_id = 2") - .limit(10) - .toArray(); - ``` - -Scalar indices can also speed up scans containing a vector search or full text search, and a prefilter: - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_index.py:vector_search_with_scalar_index" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_index.py:vector_search_with_scalar_index_async" - ``` - -=== "Typescript" - - === "@lancedb/lancedb" - - ```js - const db = await lancedb.connect("data/lance"); - const tbl = await db.openTable("book_with_embeddings"); - - await tbl.search(Array(1536).fill(1.2)) - .where("book_id != 3") // prefilter is default behavior. - .limit(10) - .toArray(); - ``` -### Update a scalar index -Updating the table data (adding, deleting, or modifying records) requires that you also update the scalar index. This can be done by calling `optimize`, which will trigger an update to the existing scalar index. 
-=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:update_scalar_index" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_index.py:update_scalar_index_async" - ``` - -=== "TypeScript" - - ```typescript - await tbl.add([{ vector: [7, 8], book_id: 4 }]); - await tbl.optimize(); - ``` - -=== "Rust" - - ```rust - let more_data: Box = create_some_records()?; - tbl.add(more_data).execute().await?; - tbl.optimize(OptimizeAction::All).execute().await?; - ``` - -!!! note - - New data added after creating the scalar index will still appear in search results if optimize is not used, but with increased latency due to a flat search on the unindexed portion. LanceDB Cloud automates the optimize process, minimizing the impact on search speed. \ No newline at end of file diff --git a/docs/src/guides/sql_querying.md b/docs/src/guides/sql_querying.md deleted file mode 100644 index 30ca2ffe..00000000 --- a/docs/src/guides/sql_querying.md +++ /dev/null @@ -1,60 +0,0 @@ -# SQL Querying - -You can use DuckDB and Apache Datafusion to query your LanceDB tables using SQL. -This guide will show how to query Lance tables them using both. - -We will re-use the dataset [created previously](./tables.md): - -```python -import lancedb - -db = lancedb.connect("data/sample-lancedb") -data = [ - {"vector": [3.1, 4.1], "item": "foo", "price": 10.0}, - {"vector": [5.9, 26.5], "item": "bar", "price": 20.0} -] -table = db.create_table("pd_table", data=data) -``` - -## Querying a LanceDB Table with DuckDb - -The `to_lance` method converts the LanceDB table to a `LanceDataset`, which is accessible to DuckDB through the Arrow compatibility layer. -To query the resulting Lance dataset in DuckDB, all you need to do is reference the dataset by the same name in your SQL query. 
- -```python -import duckdb - -arrow_table = table.to_lance() - -duckdb.query("SELECT * FROM arrow_table") -``` - -| vector | item | price | -| ----------- | ---- | ----- | -| [3.1, 4.1] | foo | 10.0 | -| [5.9, 26.5] | bar | 20.0 | - -## Querying a LanceDB Table with Apache Datafusion - -Have the required imports before doing any querying. - -=== "Python" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-session-context" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-ffi-dataset" - ``` - -Register the table created with the Datafusion session context. - -=== "Python" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic" - ``` - -| vector | item | price | -| ----------- | ---- | ----- | -| [3.1, 4.1] | foo | 10.0 | -| [5.9, 26.5] | bar | 20.0 | diff --git a/docs/src/guides/storage.md b/docs/src/guides/storage.md deleted file mode 100644 index e4f73721..00000000 --- a/docs/src/guides/storage.md +++ /dev/null @@ -1,701 +0,0 @@ -# Configuring cloud storage - - - -When using LanceDB OSS, you can choose where to store your data. The tradeoffs between different storage options are discussed in the [storage concepts guide](../concepts/storage.md). This guide shows how to configure LanceDB to use different storage options. - -## Object Stores - -LanceDB OSS supports object stores such as AWS S3 (and compatible stores), Azure Blob Store, and Google Cloud Storage. Which object store to use is determined by the URI scheme of the dataset path. `s3://` is used for AWS S3, `az://` is used for Azure Blob Storage, and `gs://` is used for Google Cloud Storage. 
These URIs are passed to the `connect` function: - -=== "Python" - - AWS S3: - === "Sync API" - - ```python - import lancedb - db = lancedb.connect("s3://bucket/path") - ``` - === "Async API" - - ```python - import lancedb - async_db = await lancedb.connect_async("s3://bucket/path") - ``` - - Google Cloud Storage: - - === "Sync API" - - ```python - import lancedb - db = lancedb.connect("gs://bucket/path") - ``` - === "Async API" - - ```python - import lancedb - async_db = await lancedb.connect_async("gs://bucket/path") - ``` - - Azure Blob Storage: - - - === "Sync API" - - ```python - import lancedb - db = lancedb.connect("az://bucket/path") - ``` - - === "Async API" - - ```python - import lancedb - async_db = await lancedb.connect_async("az://bucket/path") - ``` - Note that for Azure, storage credentials must be configured. See [below](#azure-blob-storage) for more details. - - -=== "TypeScript" - - === "@lancedb/lancedb" - - AWS S3: - - ```ts - import * as lancedb from "@lancedb/lancedb"; - const db = await lancedb.connect("s3://bucket/path"); - ``` - - Google Cloud Storage: - - ```ts - import * as lancedb from "@lancedb/lancedb"; - const db = await lancedb.connect("gs://bucket/path"); - ``` - - Azure Blob Storage: - - ```ts - import * as lancedb from "@lancedb/lancedb"; - const db = await lancedb.connect("az://bucket/path"); - ``` - - - === "vectordb (deprecated)" - - AWS S3: - - ```ts - const lancedb = require("lancedb"); - const db = await lancedb.connect("s3://bucket/path"); - ``` - - Google Cloud Storage: - - ```ts - const lancedb = require("lancedb"); - const db = await lancedb.connect("gs://bucket/path"); - ``` - - Azure Blob Storage: - - ```ts - const lancedb = require("lancedb"); - const db = await lancedb.connect("az://bucket/path"); - ``` - -In most cases, when running in the respective cloud and permissions are set up correctly, no additional configuration is required. 
When running outside of the respective cloud, authentication credentials must be provided. Credentials and other configuration options can be set in two ways: first, by setting environment variables. And second, by passing a `storage_options` object to the `connect` function. For example, to increase the request timeout to 60 seconds, you can set the `TIMEOUT` environment variable to `60s`: - -```bash -export TIMEOUT=60s -``` - -If you only want this to apply to one particular connection, you can pass the `storage_options` argument when opening the connection: - -=== "Python" - - === "Sync API" - - ```python - import lancedb - db = lancedb.connect( - "s3://bucket/path", - storage_options={"timeout": "60s"} - ) - ``` - === "Async API" - - ```python - import lancedb - async_db = await lancedb.connect_async( - "s3://bucket/path", - storage_options={"timeout": "60s"} - ) - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - import * as lancedb from "@lancedb/lancedb"; - - const db = await lancedb.connect("s3://bucket/path", { - storageOptions: {timeout: "60s"} - }); - ``` - - === "vectordb (deprecated)" - - ```ts - const lancedb = require("lancedb"); - const db = await lancedb.connect("s3://bucket/path", { - storageOptions: {timeout: "60s"} - }); - ``` - -Getting even more specific, you can set the `timeout` for only a particular table: - -=== "Python" - - - === "Sync API" - - ```python - import lancedb - db = lancedb.connect("s3://bucket/path") - table = db.create_table( - "table", - [{"a": 1, "b": 2}], - storage_options={"timeout": "60s"} - ) - ``` - - === "Async API" - - ```python - import lancedb - async_db = await lancedb.connect_async("s3://bucket/path") - async_table = await async_db.create_table( - "table", - [{"a": 1, "b": 2}], - storage_options={"timeout": "60s"} - ) - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - - ```ts - import * as lancedb from "@lancedb/lancedb"; - const db = await lancedb.connect("s3://bucket/path"); - const table = 
db.createTable( - "table", - [{ a: 1, b: 2}], - {storageOptions: {timeout: "60s"}} - ); - ``` - - === "vectordb (deprecated)" - - - ```ts - const lancedb = require("lancedb"); - const db = await lancedb.connect("s3://bucket/path"); - const table = db.createTable( - "table", - [{ a: 1, b: 2}], - {storageOptions: {timeout: "60s"}} - ); - ``` - -!!! info "Storage option casing" - - The storage option keys are case-insensitive. So `connect_timeout` and `CONNECT_TIMEOUT` are the same setting. Usually lowercase is used in the `storage_options` argument and uppercase is used for environment variables. In the `lancedb` Node package, the keys can also be provided in `camelCase` capitalization. For example, `connectTimeout` is equivalent to `connect_timeout`. - -### General configuration - -There are several options that can be set for all object stores, mostly related to network client configuration. - - - -| Key | Description | -|----------------------------|--------------------------------------------------------------------------------------------------| -| `allow_http` | Allow non-TLS, i.e. non-HTTPS connections. Default: `False`. | -| `allow_invalid_certificates`| Skip certificate validation on HTTPS connections. Default: `False`. | -| `connect_timeout` | Timeout for only the connect phase of a Client. Default: `5s`. | -| `timeout` | Timeout for the entire request, from connection until the response body has finished. Default: `30s`. | -| `user_agent` | User agent string to use in requests. | -| `proxy_url` | URL of a proxy server to use for requests. Default: `None`. | -| `proxy_ca_certificate` | PEM-formatted CA certificate for proxy connections. | -| `proxy_excludes` | List of hosts that bypass the proxy. This is a comma-separated list of domains and IP masks. Any subdomain of the provided domain will be bypassed. For example, `example.com, 192.168.1.0/24` would bypass `https://api.example.com`, `https://www.example.com`, and any IP in the range `192.168.1.0/24`. 
| - -### AWS S3 - -To configure credentials for AWS S3, you can use the `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN` keys. Region can also be set, but it is not mandatory when using AWS. -These can be set as environment variables or passed in the `storage_options` parameter: - -=== "Python" - - === "Sync API" - - ```python - import lancedb - db = lancedb.connect( - "s3://bucket/path", - storage_options={ - "aws_access_key_id": "my-access-key", - "aws_secret_access_key": "my-secret-key", - "aws_session_token": "my-session-token", - } - ) - ``` - === "Async API" - - ```python - import lancedb - async_db = await lancedb.connect_async( - "s3://bucket/path", - storage_options={ - "aws_access_key_id": "my-access-key", - "aws_secret_access_key": "my-secret-key", - "aws_session_token": "my-session-token", - } - ) - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - import * as lancedb from "@lancedb/lancedb"; - const db = await lancedb.connect( - "s3://bucket/path", - { - storageOptions: { - awsAccessKeyId: "my-access-key", - awsSecretAccessKey: "my-secret-key", - awsSessionToken: "my-session-token", - } - } - ); - ``` - - === "vectordb (deprecated)" - - ```ts - const lancedb = require("lancedb"); - const db = await lancedb.connect( - "s3://bucket/path", - { - storageOptions: { - awsAccessKeyId: "my-access-key", - awsSecretAccessKey: "my-secret-key", - awsSessionToken: "my-session-token", - } - } - ); - ``` - -Alternatively, if you are using AWS SSO, you can use the `AWS_PROFILE` and `AWS_DEFAULT_REGION` environment variables. - -The following keys can be used as both environment variables or keys in the `storage_options` parameter: - -| Key | Description | -|------------------------------------|------------------------------------------------------------------------------------------------------| -| `aws_region` / `region` | The AWS region the bucket is in. 
This can be automatically detected when using AWS S3, but must be specified for S3-compatible stores. | -| `aws_access_key_id` / `access_key_id` | The AWS access key ID to use. | -| `aws_secret_access_key` / `secret_access_key` | The AWS secret access key to use. | -| `aws_session_token` / `session_token` | The AWS session token to use. | -| `aws_endpoint` / `endpoint` | The endpoint to use for S3-compatible stores. | -| `aws_virtual_hosted_style_request` / `virtual_hosted_style_request` | Whether to use virtual hosted-style requests, where the bucket name is part of the endpoint. Meant to be used with `aws_endpoint`. Default: `False`. | -| `aws_s3_express` / `s3_express` | Whether to use S3 Express One Zone endpoints. Default: `False`. See more details below. | -| `aws_server_side_encryption` | The server-side encryption algorithm to use. Must be one of `"AES256"`, `"aws:kms"`, or `"aws:kms:dsse"`. Default: `None`. | -| `aws_sse_kms_key_id` | The KMS key ID to use for server-side encryption. If set, `aws_server_side_encryption` must be `"aws:kms"` or `"aws:kms:dsse"`. | -| `aws_sse_bucket_key_enabled` | Whether to use bucket keys for server-side encryption. | - -!!! tip "Automatic cleanup for failed writes" - - LanceDB uses [multi-part uploads](https://docs.aws.amazon.com/AmazonS3/latest/userguide/mpuoverview.html) when writing data to S3 in order to maximize write speed. LanceDB will abort these uploads when it shuts down gracefully, such as when cancelled by keyboard interrupt. However, in the rare case that LanceDB crashes, it is possible that some data will be left lingering in your account. To cleanup this data, we recommend (as AWS themselves do) that you setup a lifecycle rule to delete in-progress uploads after 7 days. 
See the AWS guide: - - **[Configuring a bucket lifecycle configuration to delete incomplete multipart uploads](https://docs.aws.amazon.com/AmazonS3/latest/userguide/mpu-abort-incomplete-mpu-lifecycle-config.html)** - -#### AWS IAM Permissions - -If a bucket is private, then an IAM policy must be specified to allow access to it. For many development scenarios, using broad permissions such as a PowerUser account is more than sufficient for working with LanceDB. However, in many production scenarios, you may wish to grant permissions that are as narrow as possible. - -For **read and write access**, LanceDB will need a policy such as: - -```json -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "s3:PutObject", - "s3:GetObject", - "s3:DeleteObject" - ], - "Resource": "arn:aws:s3:::<bucket>/<prefix>/*" - }, - { - "Effect": "Allow", - "Action": [ - "s3:ListBucket", - "s3:GetBucketLocation" - ], - "Resource": "arn:aws:s3:::<bucket>", - "Condition": { - "StringLike": { - "s3:prefix": [ - "<prefix>/*" - ] - } - } - } - ] -} -``` - -For **read-only access**, LanceDB will need a policy such as: - -```json -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "s3:GetObject" - ], - "Resource": "arn:aws:s3:::<bucket>/<prefix>/*" - }, - { - "Effect": "Allow", - "Action": [ - "s3:ListBucket", - "s3:GetBucketLocation" - ], - "Resource": "arn:aws:s3:::<bucket>", - "Condition": { - "StringLike": { - "s3:prefix": [ - "<prefix>/*" - ] - } - } - } - ] -} -``` - - -#### S3-compatible stores - -LanceDB can also connect to S3-compatible stores, such as MinIO. 
To do so, you must specify both region and endpoint: - -=== "Python" - - === "Sync API" - - ```python - import lancedb - db = lancedb.connect( - "s3://bucket/path", - storage_options={ - "region": "us-east-1", - "endpoint": "http://minio:9000", - } - ) - ``` - === "Async API" - - ```python - import lancedb - async_db = await lancedb.connect_async( - "s3://bucket/path", - storage_options={ - "region": "us-east-1", - "endpoint": "http://minio:9000", - } - ) - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - import * as lancedb from "@lancedb/lancedb"; - const db = await lancedb.connect( - "s3://bucket/path", - { - storageOptions: { - region: "us-east-1", - endpoint: "http://minio:9000", - } - } - ); - ``` - - === "vectordb (deprecated)" - - ```ts - const lancedb = require("lancedb"); - const db = await lancedb.connect( - "s3://bucket/path", - { - storageOptions: { - region: "us-east-1", - endpoint: "http://minio:9000", - } - } - ); - ``` - -This can also be done with the ``AWS_ENDPOINT`` and ``AWS_DEFAULT_REGION`` environment variables. - -!!! tip "Local servers" - - For local development, the server often has a `http` endpoint rather than a - secure `https` endpoint. In this case, you must also set the `ALLOW_HTTP` - environment variable to `true` to allow non-TLS connections, or pass the - storage option `allow_http` as `true`. If you do not do this, you will get - an error like `URL scheme is not allowed`. - -#### S3 Express - -LanceDB supports [S3 Express One Zone](https://aws.amazon.com/s3/storage-classes/express-one-zone/) endpoints, but requires additional infrastructure configuration for the compute service, such as EC2 or Lambda. Please refer to [Networking requirements for S3 Express One Zone](https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-express-networking.html). - -To configure LanceDB to use an S3 Express endpoint, you must set the storage option `s3_express`. The bucket name in your table URI should **include the suffix**. 
- -=== "Python" - - === "Sync API" - - ```python - import lancedb - db = lancedb.connect( - "s3://my-bucket--use1-az4--x-s3/path", - storage_options={ - "region": "us-east-1", - "s3_express": "true", - } - ) - ``` - === "Async API" - - ```python - import lancedb - async_db = await lancedb.connect_async( - "s3://my-bucket--use1-az4--x-s3/path", - storage_options={ - "region": "us-east-1", - "s3_express": "true", - } - ) - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - import * as lancedb from "@lancedb/lancedb"; - const db = await lancedb.connect( - "s3://my-bucket--use1-az4--x-s3/path", - { - storageOptions: { - region: "us-east-1", - s3Express: "true", - } - } - ); - ``` - - === "vectordb (deprecated)" - - ```ts - const lancedb = require("lancedb"); - const db = await lancedb.connect( - "s3://my-bucket--use1-az4--x-s3/path", - { - storageOptions: { - region: "us-east-1", - s3Express: "true", - } - } - ); - ``` - -### Google Cloud Storage - -GCS credentials are configured by setting the `GOOGLE_SERVICE_ACCOUNT` environment variable to the path of a JSON file containing the service account credentials. 
Alternatively, you can pass the path to the JSON file in the `storage_options`: - -=== "Python" - - - === "Sync API" - - ```python - import lancedb - db = lancedb.connect( - "gs://my-bucket/my-database", - storage_options={ - "service_account": "path/to/service-account.json", - } - ) - ``` - - === "Async API" - - ```python - import lancedb - async_db = await lancedb.connect_async( - "gs://my-bucket/my-database", - storage_options={ - "service_account": "path/to/service-account.json", - } - ) - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - import * as lancedb from "@lancedb/lancedb"; - const db = await lancedb.connect( - "gs://my-bucket/my-database", - { - storageOptions: { - serviceAccount: "path/to/service-account.json", - } - } - ); - ``` - - === "vectordb (deprecated)" - - ```ts - const lancedb = require("lancedb"); - const db = await lancedb.connect( - "gs://my-bucket/my-database", - { - storageOptions: { - serviceAccount: "path/to/service-account.json", - } - } - ); - ``` - -!!! info "HTTP/2 support" - - By default, GCS uses HTTP/1 for communication, as opposed to HTTP/2. This improves maximum throughput significantly. However, if you wish to use HTTP/2 for some reason, you can set the environment variable `HTTP1_ONLY` to `false`. - -The following keys can be used as both environment variables or keys in the `storage_options` parameter: - - -| Key | Description | -|---------------------------------------|----------------------------------------------| -| ``google_service_account`` / `service_account` | Path to the service account JSON file. | -| ``google_service_account_key`` | The serialized service account key. | -| ``google_application_credentials`` | Path to the application credentials. | - -### Azure Blob Storage - -Azure Blob Storage credentials can be configured by setting the `AZURE_STORAGE_ACCOUNT_NAME`and `AZURE_STORAGE_ACCOUNT_KEY` environment variables. 
Alternatively, you can pass the account name and key in the `storage_options` parameter: - -=== "Python" - - - === "Sync API" - - ```python - import lancedb - db = lancedb.connect( - "az://my-container/my-database", - storage_options={ - "account_name": "some-account", - "account_key": "some-key", - } - ) - ``` - - === "Async API" - - ```python - import lancedb - async_db = await lancedb.connect_async( - "az://my-container/my-database", - storage_options={ - "account_name": "some-account", - "account_key": "some-key", - } - ) - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - import * as lancedb from "@lancedb/lancedb"; - const db = await lancedb.connect( - "az://my-container/my-database", - { - storageOptions: { - accountName: "some-account", - accountKey: "some-key", - } - } - ); - ``` - - === "vectordb (deprecated)" - - ```ts - const lancedb = require("lancedb"); - const db = await lancedb.connect( - "az://my-container/my-database", - { - storageOptions: { - accountName: "some-account", - accountKey: "some-key", - } - } - ); - ``` - -These keys can be used either as environment variables or as keys in the `storage_options` parameter: - - - -| Key | Description | -|---------------------------------------|--------------------------------------------------------------------------------------------------| -| ``azure_storage_account_name`` | The name of the azure storage account. | -| ``azure_storage_account_key`` | The serialized service account key. | -| ``azure_client_id`` | Service principal client id for authorizing requests. | -| ``azure_client_secret`` | Service principal client secret for authorizing requests. | -| ``azure_tenant_id`` | Tenant id used in oauth flows. | -| ``azure_storage_sas_key`` | Shared access signature. The signature is expected to be percent-encoded, much like they are provided in the azure storage explorer or azure portal. | -| ``azure_storage_token`` | Bearer token. 
| -| ``azure_storage_use_emulator`` | Use object store with azurite storage emulator. | -| ``azure_endpoint`` | Override the endpoint used to communicate with blob storage. | -| ``azure_use_fabric_endpoint`` | Use object store with url scheme account.dfs.fabric.microsoft.com. | -| ``azure_msi_endpoint`` | Endpoint to request an IMDS managed identity token. | -| ``azure_object_id`` | Object id for use with managed identity authentication. | -| ``azure_msi_resource_id`` | MSI resource id for use with managed identity authentication. | -| ``azure_federated_token_file`` | File containing token for Azure AD workload identity federation. | -| ``azure_use_azure_cli`` | Use the Azure CLI for acquiring an access token. | -| ``azure_disable_tagging`` | Disables tagging objects. This can be desirable if not supported by the backing store. | - - diff --git a/docs/src/guides/tables.md b/docs/src/guides/tables.md deleted file mode 100644 index 26e7ca9d..00000000 --- a/docs/src/guides/tables.md +++ /dev/null @@ -1,1087 +0,0 @@ - -Open In Colab
- -A Table is a collection of Records in a LanceDB Database. Tables in Lance have a schema that defines the columns and their types. These schemas can include nested columns and can evolve over time. - -This guide will show how to create tables, insert data into them, and update the data. - - -## Creating a LanceDB Table - -Initialize a LanceDB connection and create a table. - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_tables.py:connect" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_tables.py:connect_async" - ``` - - LanceDB allows ingesting data from various sources - `dict`, `list[dict]`, `pd.DataFrame`, `pa.Table` or an `Iterator[pa.RecordBatch]`. Let's take a look at some of these. - -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - ```typescript - import * as lancedb from "@lancedb/lancedb"; - import * as arrow from "apache-arrow"; - - const uri = "data/sample-lancedb"; - const db = await lancedb.connect(uri); - ``` - - === "vectordb (deprecated)" - - ```typescript - const lancedb = require("vectordb"); - const arrow = require("apache-arrow"); - - const uri = "data/sample-lancedb"; - const db = await lancedb.connect(uri); - ``` - - - -### From list of tuples or dictionaries - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_async" - ``` - - !!! info "Note" - If the table already exists, LanceDB will raise an error by default. - - `create_table` supports an optional `exist_ok` parameter. When set to True - and the table exists, then it simply opens the existing table. The data you - passed in will NOT be appended to the table in that case. 
- - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_exist_ok" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_async_exist_ok" - ``` - - Sometimes you want to make sure that you start fresh. If you want to - overwrite the table, you can pass in mode="overwrite" to the createTable function. - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_overwrite" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_async_overwrite" - ``` - -=== "Typescript[^1]" - You can create a LanceDB table in JavaScript using an array of records as follows. - - === "@lancedb/lancedb" - - - ```ts - --8<-- "nodejs/examples/basic.test.ts:create_table" - ``` - - This will infer the schema from the provided data. If you want to explicitly provide a schema, you can use `apache-arrow` to declare a schema - - ```ts - --8<-- "nodejs/examples/basic.test.ts:create_table_with_schema" - ``` - - !!! info "Note" - `createTable` supports an optional `existsOk` parameter. When set to true - and the table exists, then it simply opens the existing table. The data you - passed in will NOT be appended to the table in that case. - - ```ts - --8<-- "nodejs/examples/basic.test.ts:create_table_exists_ok" - ``` - - Sometimes you want to make sure that you start fresh. If you want to - overwrite the table, you can pass in mode: "overwrite" to the createTable function. - - ```ts - --8<-- "nodejs/examples/basic.test.ts:create_table_overwrite" - ``` - - === "vectordb (deprecated)" - - ```ts - --8<-- "docs/src/basic_legacy.ts:create_table" - ``` - - This will infer the schema from the provided data. If you want to explicitly provide a schema, you can use apache-arrow to declare a schema - - - - ```ts - --8<-- "docs/src/basic_legacy.ts:create_table_with_schema" - ``` - - !!! 
warning - `existsOk` is not available in `vectordb` - - - - If the table already exists, vectordb will raise an error by default. - You can use `writeMode: WriteMode.Overwrite` to overwrite the table. - But this will delete the existing table and create a new one with the same name. - - - Sometimes you want to make sure that you start fresh. - - If you want to overwrite the table, you can pass in `writeMode: lancedb.WriteMode.Overwrite` to the createTable function. - - ```ts - const table = await con.createTable(tableName, data, { - writeMode: WriteMode.Overwrite - }) - ``` - -### From a Pandas DataFrame - - -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pandas" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_from_pandas" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pandas" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_async_from_pandas" - ``` - -!!! info "Note" - Data is converted to Arrow before being written to disk. For maximum control over how data is saved, either provide the PyArrow schema to convert to or else provide a PyArrow Table directly. - -The **`vector`** column needs to be a [Vector](../python/pydantic.md#vector-field) (defined as [pyarrow.FixedSizeList](https://arrow.apache.org/docs/python/generated/pyarrow.list_.html)) type. - -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pyarrow" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_custom_schema" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pyarrow" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_async_custom_schema" - ``` - -### From a Polars DataFrame - -LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library -written in Rust. 
Just like in Pandas, the Polars integration is enabled by PyArrow -under the hood. A deeper integration between LanceDB Tables and Polars DataFrames -is on the way. - -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-polars" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_from_polars" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-polars" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_async_from_polars" - ``` - -### From an Arrow Table -You can also create LanceDB tables directly from Arrow tables. -LanceDB supports float16 data type! - -=== "Python" - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pyarrow" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-numpy" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_from_arrow_table" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-polars" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-numpy" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_async_from_arrow_table" - ``` - -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/basic.test.ts:create_f16_table" - ``` - - === "vectordb (deprecated)" - - ```typescript - --8<-- "docs/src/basic_legacy.ts:create_f16_table" - ``` - -### From Pydantic Models - -When you create an empty table without data, you must specify the table schema. -LanceDB supports creating tables by specifying a PyArrow schema or a specialized -Pydantic model called `LanceModel`. - -For example, the following Content model specifies a table with 5 columns: -`movie_id`, `vector`, `genres`, `title`, and `imdb_id`. When you create a table, you can -pass the class as the value of the `schema` parameter to `create_table`. 
-The `vector` column is a `Vector` type, which is a specialized Pydantic type that -can be configured with the vector dimensions. It is also important to note that -LanceDB only understands subclasses of `lancedb.pydantic.LanceModel` -(which itself derives from `pydantic.BaseModel`). - -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb-pydantic" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pyarrow" - --8<-- "python/python/tests/docs/test_guide_tables.py:class-Content" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_from_pydantic" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb-pydantic" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pyarrow" - --8<-- "python/python/tests/docs/test_guide_tables.py:class-Content" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_async_from_pydantic" - ``` - -#### Nested schemas - -Sometimes your data model may contain nested objects. 
-For example, you may want to store the document string -and the document source name as a nested Document object: - -```python ---8<-- "python/python/tests/docs/test_guide_tables.py:import-pydantic-basemodel" ---8<-- "python/python/tests/docs/test_guide_tables.py:class-Document" -``` - -This can be used as the type of a LanceDB table column: - -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:class-NestedSchema" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_nested_schema" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:class-NestedSchema" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_async_nested_schema" - ``` -This creates a struct column called "document" that has two subfields -called "content" and "source": - -``` -In [28]: tbl.schema -Out[28]: -id: string not null -vector: fixed_size_list[1536] not null - child 0, item: float -document: struct not null - child 0, content: string not null - child 1, source: string not null -``` - -#### Validators - -Note that neither Pydantic nor PyArrow automatically validates that input data -is of the correct timezone, but this is easy to add as a custom field validator: - -```python -from datetime import datetime -from zoneinfo import ZoneInfo - -from lancedb.pydantic import LanceModel -from pydantic import Field, field_validator, ValidationError, ValidationInfo - -tzname = "America/New_York" -tz = ZoneInfo(tzname) - -class TestModel(LanceModel): - dt_with_tz: datetime = Field(json_schema_extra={"tz": tzname}) - - @field_validator('dt_with_tz') - @classmethod - def tz_must_match(cls, dt: datetime) -> datetime: - assert dt.tzinfo == tz - return dt - -ok = TestModel(dt_with_tz=datetime.now(tz)) - -try: - TestModel(dt_with_tz=datetime.now(ZoneInfo("Asia/Shanghai"))) - assert 0 == 1, "this should raise ValidationError" -except ValidationError: - print("A ValidationError was raised.") - pass -``` - 
-When you run this code it should print "A ValidationError was raised." - -#### Pydantic custom types - -LanceDB does NOT yet support converting pydantic custom types. If this is something you need, -please file a feature request on the [LanceDB GitHub repo](https://github.com/lancedb/lancedb/issues/new). - -### Using Iterators / Writing Large Datasets - -It is recommended to use iterators to add large datasets in batches when creating your table in one go. This does not create multiple versions of your dataset, unlike manually adding batches using `table.add()`. - -LanceDB additionally supports PyArrow's `RecordBatch` Iterators or other generators producing supported data types. - -Here's an example using a `RecordBatch` iterator to create a table. - -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pyarrow" - --8<-- "python/python/tests/docs/test_guide_tables.py:make_batches" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_from_batch" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pyarrow" - --8<-- "python/python/tests/docs/test_guide_tables.py:make_batches" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_async_from_batch" - ``` - -You can also use iterators of other types like Pandas DataFrame or Pylists directly in the above example. - -## Open existing tables - -=== "Python" - If you forget the name of your table, you can always get a listing of all table names. - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:list_tables" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:list_tables_async" - ``` - - Then, you can open any existing tables. 
- - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:open_table" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:open_table_async" - ``` - -=== "Typescript[^1]" - - If you forget the name of your table, you can always get a listing of all table names. - - ```typescript - console.log(await db.tableNames()); - ``` - - Then, you can open any existing tables. - - ```typescript - const tbl = await db.openTable("my_table"); - ``` - -## Creating empty table -You can create an empty table for scenarios where you want to add data to the table later. An example would be when you want to collect data from a stream/external file and then add it to a table in batches. - -=== "Python" - - - An empty table can be initialized via a PyArrow schema. - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pyarrow" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_empty_table" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pyarrow" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_empty_table_async" - ``` - - Alternatively, you can also use Pydantic to specify the schema for the empty table. Note that we do not - directly import `pydantic` but instead use `lancedb.pydantic` which is a subclass of `pydantic.BaseModel` - that has been extended to support LanceDB specific types like `Vector`. 
- - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb-pydantic" - --8<-- "python/python/tests/docs/test_guide_tables.py:class-Item" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_empty_table_pydantic" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb-pydantic" - --8<-- "python/python/tests/docs/test_guide_tables.py:class-Item" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_empty_table_async_pydantic" - ``` - - Once the empty table has been created, you can add data to it via the various methods listed in the [Adding to a table](#adding-to-a-table) section. - -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/basic.test.ts:create_empty_table" - ``` - - === "vectordb (deprecated)" - - ```typescript - --8<-- "docs/src/basic_legacy.ts:create_empty_table" - ``` - -## Adding to a table - -After a table has been created, you can always add more data to it using the `add` method - -=== "Python" - You can add any of the valid data structures accepted by LanceDB table, i.e, `dict`, `list[dict]`, `pd.DataFrame`, or `Iterator[pa.RecordBatch]`. Below are some examples. 
- - ### Add a Pandas DataFrame - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:add_table_from_pandas" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:add_table_async_from_pandas" - ``` - - ### Add a Polars DataFrame - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:add_table_from_polars" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:add_table_async_from_polars" - ``` - - ### Add an Iterator - - You can also add a large dataset batch in one go using Iterator of any supported data types. - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:make_batches_for_add" - --8<-- "python/python/tests/docs/test_guide_tables.py:add_table_from_batch" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:make_batches_for_add" - --8<-- "python/python/tests/docs/test_guide_tables.py:add_table_async_from_batch" - ``` - - ### Add a PyArrow table - - If you have data coming in as a PyArrow table, you can add it directly to the LanceDB table. - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:add_table_from_pyarrow" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:add_table_async_from_pyarrow" - ``` - - ### Add a Pydantic Model - - Assuming that a table has been created with the correct schema as shown [above](#creating-empty-table), you can add data items that are valid Pydantic models to the table. - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:add_table_from_pydantic" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:add_table_async_from_pydantic" - ``` - - ??? 
"Ingesting Pydantic models with LanceDB embedding API" - When using LanceDB's embedding API, you can add Pydantic models directly to the table. LanceDB will automatically convert the `vector` field to a vector before adding it to the table. You need to specify the default value of `vector` field as None to allow LanceDB to automatically vectorize the data. - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb-pydantic" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-embeddings" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_with_embedding" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb-pydantic" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-embeddings" - --8<-- "python/python/tests/docs/test_guide_tables.py:create_table_async_with_embedding" - ``` - -=== "Typescript[^1]" - - ```javascript - await tbl.add( - [ - {vector: [1.3, 1.4], item: "fizz", price: 100.0}, - {vector: [9.5, 56.2], item: "buzz", price: 200.0} - ] - ) - ``` - -## Upserting into a table - -Upserting lets you insert new rows or update existing rows in a table. To upsert -in LanceDB, use the merge insert API. 
- -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_merge_insert.py:upsert_basic" - ``` - **API Reference**: [lancedb.table.Table.merge_insert][] - - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_merge_insert.py:upsert_basic_async" - ``` - **API Reference**: [lancedb.table.AsyncTable.merge_insert][] - -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/merge_insert.test.ts:upsert_basic" - ``` - **API Reference**: [lancedb.Table.mergeInsert](../js/classes/Table.md/#mergeInsert) - -Read more in the guide on [merge insert](tables/merge_insert.md). - -## Deleting from a table - -Use the `delete()` method on tables to delete rows from a table. To choose which rows to delete, provide a filter that matches on the metadata columns. This can delete any number of rows that match the filter. - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:delete_row" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:delete_row_async" - ``` - - ### Deleting row with specific column value - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:delete_specific_row" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:delete_specific_row_async" - ``` - - ### Delete from a list of values - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:delete_list_values" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:delete_list_values_async" - ``` - -=== "Typescript[^1]" - - ```ts - await tbl.delete('item = "fizz"') - ``` - - ### Deleting row with specific column value - - ```ts - const con = await lancedb.connect("./.lancedb") - const data = [ - {id: 1, vector: [1, 2]}, - {id: 2, vector: [3, 4]}, - {id: 3, vector: [5, 6]}, - ]; - const tbl = await 
con.createTable("my_table", data) - await tbl.delete("id = 2") - await tbl.countRows() // Returns 2 - ``` - - ### Delete from a list of values - - ```ts - const to_remove = [1, 5]; - await tbl.delete(`id IN (${to_remove.join(",")})`) - await tbl.countRows() // Returns 1 - ``` - -## Updating a table - -This can be used to update zero to all rows depending on how many rows match the where clause. The update queries follow the form of a SQL UPDATE statement. The `where` parameter is a SQL filter that matches on the metadata columns. The `values` or `values_sql` parameters are used to provide the new values for the columns. - -| Parameter | Type | Description | -|---|---|---| -| `where` | `str` | The SQL where clause to use when updating rows. For example, `'x = 2'` or `'x IN (1, 2, 3)'`. The filter must not be empty, or it will error. | -| `values` | `dict` | The values to update. The keys are the column names and the values are the values to set. | -| `values_sql` | `dict` | The values to update. The keys are the column names and the values are the SQL expressions to set. For example, `{'x': 'x + 1'}` will increment the value of the `x` column by 1. | - -!!! info "SQL syntax" - - See [SQL filters](../sql.md) for more information on the supported SQL syntax. - -!!! warning "Warning" - - Updating nested columns is not yet supported. 
- -=== "Python" - - API Reference: [lancedb.table.Table.update][] - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pandas" - --8<-- "python/python/tests/docs/test_guide_tables.py:update_table" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb" - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pandas" - --8<-- "python/python/tests/docs/test_guide_tables.py:update_table_async" - ``` - - Output - ```shell - x vector - 0 1 [1.0, 2.0] - 1 3 [5.0, 6.0] - 2 2 [10.0, 10.0] - ``` - -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - API Reference: [lancedb.Table.update](../js/classes/Table.md/#update) - - ```ts - import * as lancedb from "@lancedb/lancedb"; - - const db = await lancedb.connect("./.lancedb"); - - const data = [ - {x: 1, vector: [1, 2]}, - {x: 2, vector: [3, 4]}, - {x: 3, vector: [5, 6]}, - ]; - const tbl = await db.createTable("my_table", data) - - await tbl.update({ - values: { vector: [10, 10] }, - where: "x = 2" - }); - ``` - - === "vectordb (deprecated)" - - API Reference: [vectordb.Table.update](../javascript/interfaces/Table.md/#update) - - ```ts - const lancedb = require("vectordb"); - - const db = await lancedb.connect("./.lancedb"); - - const data = [ - {x: 1, vector: [1, 2]}, - {x: 2, vector: [3, 4]}, - {x: 3, vector: [5, 6]}, - ]; - const tbl = await db.createTable("my_table", data) - - await tbl.update({ - where: "x = 2", - values: { vector: [10, 10] } - }); - ``` - -#### Updating using a sql query - - The `values` parameter is used to provide the new values for the columns as literal values. You can also use the `values_sql` / `valuesSql` parameter to provide SQL expressions for the new values. For example, you can use `values_sql="x + 1"` to increment the value of the `x` column by 1. 
- -=== "Python" - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:update_table_sql" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:update_table_sql_async" - ``` - - Output - ```shell - x vector - 0 2 [1.0, 2.0] - 1 4 [5.0, 6.0] - 2 3 [10.0, 10.0] - ``` - -=== "Typescript[^1]" - - === "@lancedb/lancedb" - - Coming Soon! - - === "vectordb (deprecated)" - - ```ts - await tbl.update({ valuesSql: { x: "x + 1" } }) - ``` - -!!! info "Note" - - When rows are updated, they are moved out of the index. The row will still show up in ANN queries, but the query will not be as fast as it would be if the row was in the index. If you update a large proportion of rows, consider rebuilding the index afterwards. - -## Drop a table - -Use the `drop_table()` method on the database to remove a table. - -=== "Python" - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:drop_table" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:drop_table_async" - ``` - - This permanently removes the table and is not recoverable, unlike deleting rows. - By default, if the table does not exist an exception is raised. To suppress this, - you can pass in `ignore_missing=True`. - -=== "TypeScript" - - ```typescript - --8<-- "docs/src/basic_legacy.ts:drop_table" - ``` - - This permanently removes the table and is not recoverable, unlike deleting rows. - If the table does not exist an exception is raised. - -## Changing schemas - -While tables must have a schema specified when they are created, you can -change the schema over time. There's three methods to alter the schema of -a table: - -* `add_columns`: Add new columns to the table -* `alter_columns`: Alter the name, nullability, or data type of a column -* `drop_columns`: Drop columns from the table - -### Adding new columns - -You can add new columns to the table with the `add_columns` method. 
New columns -are filled with values based on a SQL expression. For example, you can add a new -column `y` to the table, fill it with the value of `x * 2` and set the expected -data type for it. - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:add_columns" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:add_columns_async" - ``` - **API Reference:** [lancedb.table.Table.add_columns][] - -=== "Typescript" - - ```typescript - --8<-- "nodejs/examples/basic.test.ts:add_columns" - ``` - **API Reference:** [lancedb.Table.addColumns](../js/classes/Table.md/#addcolumns) - -If you want to fill it with null, you can use `cast(NULL as <data_type>)` as -the SQL expression to fill the column with nulls, while controlling the data -type of the column. Available data types are based on the -[DataFusion data types](https://datafusion.apache.org/user-guide/sql/data_types.html). -You can use any of the SQL types, such as `BIGINT`: - -```sql -cast(NULL as BIGINT) -``` - -Using Arrow data types and the `arrow_typeof` function is not yet supported. - - - -### Altering existing columns - -You can alter the name, nullability, or data type of a column with the `alter_columns` -method. - -Changing the name or nullability of a column just updates the metadata. Because -of this, it's a fast operation. Changing the data type of a column requires -rewriting the column, which can be a heavy operation. 
- -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pyarrow" - --8<-- "python/python/tests/docs/test_basic.py:alter_columns" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-pyarrow" - --8<-- "python/python/tests/docs/test_basic.py:alter_columns_async" - ``` - **API Reference:** [lancedb.table.Table.alter_columns][] - -=== "Typescript" - - ```typescript - --8<-- "nodejs/examples/basic.test.ts:alter_columns" - ``` - **API Reference:** [lancedb.Table.alterColumns](../js/classes/Table.md/#altercolumns) - -### Dropping columns - -You can drop columns from the table with the `drop_columns` method. This -will remove the column from the schema. - - - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:drop_columns" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_basic.py:drop_columns_async" - ``` - **API Reference:** [lancedb.table.Table.drop_columns][] - -=== "Typescript" - - ```typescript - --8<-- "nodejs/examples/basic.test.ts:drop_columns" - ``` - **API Reference:** [lancedb.Table.dropColumns](../js/classes/Table.md/#dropcolumns) - - -## Handling bad vectors - -In LanceDB Python, you can use the `on_bad_vectors` parameter to choose how -invalid vector values are handled. Invalid vectors are vectors that are not valid -because: - -1. They are the wrong dimension -2. They contain NaN values -3. They are null but are on a non-nullable field - -By default, LanceDB will raise an error if it encounters a bad vector. You can -also choose one of the following options: - -* `drop`: Ignore rows with bad vectors -* `fill`: Replace bad values (NaNs) or missing values (too few dimensions) with - the fill value specified in the `fill_value` parameter. An input like - `[1.0, NaN, 3.0]` will be replaced with `[1.0, 0.0, 3.0]` if `fill_value=0.0`. 
-* `null`: Replace bad vectors with null (only works if the column is nullable). - A bad vector `[1.0, NaN, 3.0]` will be replaced with `null` if the column is - nullable. If the vector column is non-nullable, then bad vectors will cause an - error - -## Consistency - -In LanceDB OSS, users can set the `read_consistency_interval` parameter on connections to achieve different levels of read consistency. This parameter determines how frequently the database synchronizes with the underlying storage system to check for updates made by other processes. If another process updates a table, the database will not see the changes until the next synchronization. - -There are three possible settings for `read_consistency_interval`: - -1. **Unset (default)**: The database does not check for updates to tables made by other processes. This provides the best query performance, but means that clients may not see the most up-to-date data. This setting is suitable for applications where the data does not change during the lifetime of the table reference. -2. **Zero seconds (Strong consistency)**: The database checks for updates on every read. This provides the strongest consistency guarantees, ensuring that all clients see the latest committed data. However, it has the most overhead. This setting is suitable when consistency matters more than having high QPS. -3. **Custom interval (Eventual consistency)**: The database checks for updates at a custom interval, such as every 5 seconds. This provides eventual consistency, allowing for some lag between write and read operations. Performance wise, this is a middle ground between strong consistency and no consistency check. This setting is suitable for applications where immediate consistency is not critical, but clients should see updated data eventually. - -!!! tip "Consistency in LanceDB Cloud" - - This is only tune-able in LanceDB OSS. In LanceDB Cloud, readers are always eventually consistent. 
- -=== "Python" - - To set strong consistency, use `timedelta(0)`: - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-datetime" - --8<-- "python/python/tests/docs/test_guide_tables.py:table_strong_consistency" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-datetime" - --8<-- "python/python/tests/docs/test_guide_tables.py:table_async_strong_consistency" - ``` - - For eventual consistency, use a custom `timedelta`: - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-datetime" - --8<-- "python/python/tests/docs/test_guide_tables.py:table_eventual_consistency" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:import-datetime" - --8<-- "python/python/tests/docs/test_guide_tables.py:table_async_eventual_consistency" - ``` - - By default, a `Table` will never check for updates from other writers. To manually check for updates you can use `checkout_latest`: - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:table_checkout_latest" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_guide_tables.py:table_async_checkout_latest" - ``` - -=== "Typescript[^1]" - - To set strong consistency, use `0`: - - ```ts - const db = await lancedb.connect({ uri: "./.lancedb", readConsistencyInterval: 0 }); - const tbl = await db.openTable("my_table"); - ``` - - For eventual consistency, specify the update interval as seconds: - - ```ts - const db = await lancedb.connect({ uri: "./.lancedb", readConsistencyInterval: 5 }); - const tbl = await db.openTable("my_table"); - ``` - - - -## What's next? - -Learn the best practices on creating an ANN index and getting the most out of it. - -[^1]: The `vectordb` package is a legacy package that is deprecated in favor of `@lancedb/lancedb`. 
The `vectordb` package will continue to receive bug fixes and security updates until September 2024. We recommend all new projects use `@lancedb/lancedb`. See the [migration guide](../migration.md) for more information. diff --git a/docs/src/guides/tables/merge_insert.md b/docs/src/guides/tables/merge_insert.md deleted file mode 100644 index f6af2fcb..00000000 --- a/docs/src/guides/tables/merge_insert.md +++ /dev/null @@ -1,135 +0,0 @@ -The merge insert command is a flexible API that can be used to perform: - -1. Upsert -2. Insert-if-not-exists -3. Replace range - -It works by joining the input data with the target table on a key you provide. -Often this key is a unique row id key. You can then specify what to do when -there is a match and when there is not a match. For example, for upsert you want -to update if the row has a match and insert if the row doesn't have a match. -Whereas for insert-if-not-exists you only want to insert if the row doesn't have -a match. - -You can also read more in the API reference: - -* Python - * Sync: [lancedb.table.Table.merge_insert][] - * Async: [lancedb.table.AsyncTable.merge_insert][] -* Typescript: [lancedb.Table.mergeInsert](../../js/classes/Table.md/#mergeinsert) - -!!! tip "Use scalar indices to speed up merge insert" - - The merge insert command needs to perform a join between the input data and the - target table on the `on` key you provide. This requires scanning that entire - column, which can be expensive for large tables. To speed up this operation, - you can create a scalar index on the `on` column, which will allow LanceDB to - find matches without having to scan the whole table. - - Read more about scalar indices in the [Building a Scalar Index](../scalar_index.md) - guide. - -!!! info "Embedding Functions" - - Like the create table and add APIs, the merge insert API will automatically - compute embeddings if the table has an embedding definition in its schema. 
- If the input data doesn't contain the source column, or the vector column - is already filled, then the embeddings won't be computed. See the - [Embedding Functions](../../embeddings/embedding_functions.md) guide for more - information. - -## Upsert - -Upsert updates rows if they exist and inserts them if they don't. To do this -with merge insert, enable both `when_matched_update_all()` and -`when_not_matched_insert_all()`. - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_merge_insert.py:upsert_basic" - ``` - - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_merge_insert.py:upsert_basic_async" - ``` - -=== "Typescript" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/merge_insert.test.ts:upsert_basic" - ``` - -!!! note "Providing subsets of columns" - - If a column is nullable, it can be omitted from input data and it will be - considered `null`. Columns can also be provided in any order. - -## Insert-if-not-exists - -To avoid inserting duplicate rows, you can use the insert-if-not-exists command. -This will only insert rows that do not have a match in the target table. To do -this with merge insert, enable just `when_not_matched_insert_all()`. - - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_merge_insert.py:insert_if_not_exists" - ``` - - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_merge_insert.py:insert_if_not_exists_async" - ``` - -=== "Typescript" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/merge_insert.test.ts:insert_if_not_exists" - ``` - - -## Replace range - -You can also replace a range of rows in the target table with the input data. -For example, if you have a table of document chunks, where each chunk has -both a `doc_id` and a `chunk_id`, you can replace all chunks for a given -`doc_id` with updated chunks. 
This can be tricky otherwise because if you -try to use upsert when the new data has fewer chunks you will end up with -extra chunks. To avoid this, add another clause to delete any chunks for -the document that are not in the new data, with -`when_not_matched_by_source_delete`. - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_merge_insert.py:replace_range" - ``` - - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_merge_insert.py:replace_range_async" - ``` - -=== "Typescript" - - === "@lancedb/lancedb" - - ```typescript - --8<-- "nodejs/examples/merge_insert.test.ts:replace_range" - ``` diff --git a/docs/src/guides/tuning_retrievers/1_query_types.md b/docs/src/guides/tuning_retrievers/1_query_types.md deleted file mode 100644 index a43295ca..00000000 --- a/docs/src/guides/tuning_retrievers/1_query_types.md +++ /dev/null @@ -1,131 +0,0 @@ -## Improving retriever performance - -Try it yourself: Open In Colab
- -VectorDBs are used as retrievers in recommender or chatbot-based systems for retrieving relevant data based on user queries. For example, retrievers are a critical component of Retrieval Augmented Generation (RAG) architectures. In this section, we will discuss how to improve the performance of retrievers. - -There are several ways to improve the performance of retrievers. Some of the common techniques are: - -* Using different query types -* Using hybrid search -* Fine-tuning the embedding models -* Using different embedding models - -Using different embedding models is something that's very specific to the use case and the data. So we will not discuss it here. In this section, we will discuss the first three techniques. - - -!!! note "Note" - We'll be using a simple metric called "hit-rate" for evaluating the performance of the retriever across this guide. Hit-rate is the percentage of queries for which the retriever returned the correct answer in the top-k results. For example, if the retriever returned the correct answer in the top-3 results for 70% of the queries, then the hit-rate@3 is 0.7. - - -## The dataset -We'll be using a QA dataset generated using a LLama2 review paper. The dataset contains 221 query, context and answer triplets. The queries and answers are generated using GPT-4 based on a given query. The full script used to generate the dataset can be found on this [repo](https://github.com/lancedb/ragged). It can be downloaded from [here](https://github.com/AyushExel/assets/blob/main/data_qa.csv). - -### Using different query types -Let's set up the embeddings and the dataset first. We'll use LanceDB's `huggingface` embeddings integration for this guide. 
- -```python -import lancedb -import pandas as pd -from lancedb.embeddings import get_registry -from lancedb.pydantic import Vector, LanceModel - -db = lancedb.connect("~/lancedb/query_types") -df = pd.read_csv("data_qa.csv") - -embed_fcn = get_registry().get("huggingface").create(name="BAAI/bge-small-en-v1.5") - -class Schema(LanceModel): - context: str = embed_fcn.SourceField() - vector: Vector(embed_fcn.ndims()) = embed_fcn.VectorField() - -table = db.create_table("qa", schema=Schema) -table.add(df[["context"]].to_dict(orient="records")) - -queries = df["query"].tolist() -``` - -Now that we have the dataset and embeddings table set up, here's how you can run different query types on the dataset: - -* Vector Search: - - ```python - table.search(queries[0], query_type="vector").limit(5).to_pandas() - ``` - By default, LanceDB uses the vector search query type for searching and it automatically converts the input query to a vector before searching when using the embedding API. So, the following statement is equivalent to the above statement: - - ```python - table.search(queries[0]).limit(5).to_pandas() - ``` - - Vector or semantic search is useful when you want to find documents that are similar to the query in terms of meaning. - ---- - -* Full-text Search: - - FTS requires creating an index on the column you want to search on. `replace=True` will replace the existing index if it exists. - Once the index is created, you can search using the `fts` query type. - ```python - table.create_fts_index("context", replace=True) - table.search(queries[0], query_type="fts").limit(5).to_pandas() - ``` - - Full-text search is useful when you want to find documents that contain the query terms. - ---- - -* Hybrid Search: - - Hybrid search is a combination of vector and full-text search. 
Here's how you can run a hybrid search query on the dataset: - ```python - table.search(queries[0], query_type="hybrid").limit(5).to_pandas() - ``` - Hybrid search requires a reranker to combine and rank the results from vector and full-text search. We'll cover reranking as a concept in the next section. - - Hybrid search is useful when you want to combine the benefits of both vector and full-text search. - - !!! note "Note" - By default, it uses `LinearCombinationReranker`, which combines the scores from vector and full-text search using a weighted linear combination. It is the simplest reranker implementation available in LanceDB. You can also use other rerankers like `CrossEncoderReranker` or `CohereReranker` for reranking the results. - Learn more about rerankers [here](https://lancedb.github.io/lancedb/reranking/). - - - -### Hit rate evaluation results - -Now that we have seen how to run different query types on the dataset, let's evaluate the hit-rate of each query type on the dataset. -For brevity, the entire evaluation script is not shown here. You can find the complete evaluation and benchmarking utility scripts [here](https://github.com/lancedb/ragged). - -Here are the hit-rate results for the dataset: - -| Query Type | Hit-rate@5 | -| --- | --- | -| Vector Search | 0.640 | -| Full-text Search | 0.595 | -| Hybrid Search (w/ LinearCombinationReranker) | 0.645 | - -**Choosing query type** is very specific to the use case and the data. This synthetic dataset has been generated to be semantically challenging, i.e., the queries don't have a lot of keywords in common with the context. So, vector search performs better than full-text search. However, in real-world scenarios, full-text search might perform better than vector search. Hybrid search is a good choice when you want to combine the benefits of both vector and full-text search. - -### Evaluation results on other datasets - -The hit-rate results can vary based on the dataset and the query type. 
Here are the hit-rate results for the other datasets using the same embedding function. - -* SQuAD Dataset: - - | Query Type | Hit-rate@5 | - | --- | --- | - | Vector Search | 0.822 | - | Full-text Search | 0.835 | - | Hybrid Search (w/ LinearCombinationReranker) | 0.8874 | - -* Uber10K sec filing Dataset: - - | Query Type | Hit-rate@5 | - | --- | --- | - | Vector Search | 0.608 | - | Full-text Search | 0.82 | - | Hybrid Search (w/ LinearCombinationReranker) | 0.80 | - -In these standard datasets, FTS seems to perform much better than vector search because the queries have a lot of keywords in common with the context. So, in general choosing the query type is very specific to the use case and the data. - - diff --git a/docs/src/guides/tuning_retrievers/2_reranking.md b/docs/src/guides/tuning_retrievers/2_reranking.md deleted file mode 100644 index c5b970c3..00000000 --- a/docs/src/guides/tuning_retrievers/2_reranking.md +++ /dev/null @@ -1,77 +0,0 @@ -Continuing from the previous section, we can now rerank the results using more complex rerankers. - -Try it yourself: Open In Colab
- -## Reranking search results -You can rerank any search results using a reranker. The syntax for reranking is as follows: - -```python -from lancedb.rerankers import LinearCombinationReranker - -reranker = LinearCombinationReranker() -table.search(quries[0], query_type="hybrid").rerank(reranker=reranker).limit(5).to_pandas() -``` -Based on the `query_type`, the `rerank()` function can accept other arguments as well. For example, hybrid search accepts a `normalize` param to determine the score normalization method. - -!!! note "Note" - LanceDB provides a `Reranker` base class that can be extended to implement custom rerankers. Each reranker must implement the `rerank_hybrid` method. `rerank_vector` and `rerank_fts` methods are optional. For example, the `LinearCombinationReranker` only implements the `rerank_hybrid` method and so it can only be used for reranking hybrid search results. - -## Choosing a Reranker -There are many rerankers available in LanceDB like `CrossEncoderReranker`, `CohereReranker`, and `ColBERT`. The choice of reranker depends on the dataset and the application. You can even implement you own custom reranker by extending the `Reranker` class. For more details about each available reranker and performance comparison, refer to the [rerankers](https://lancedb.github.io/lancedb/reranking/) documentation. - -In this example, we'll use the `CohereReranker` to rerank the search results. It requires `cohere` to be installed and `COHERE_API_KEY` to be set in the environment. To get your API key, sign up on [Cohere](https://cohere.ai/). 
- -```python -from lancedb.rerankers import CohereReranker - -# use Cohere reranker v3 -reranker = CohereReranker(model_name="rerank-english-v3.0") # default model is "rerank-english-v2.0" -``` - -### Reranking search results -Now we can rerank all query type results using the `CohereReranker`: - -```python - -# rerank hybrid search results -table.search(quries[0], query_type="hybrid").rerank(reranker=reranker).limit(5).to_pandas() - -# rerank vector search results -table.search(quries[0], query_type="vector").rerank(reranker=reranker).limit(5).to_pandas() - -# rerank fts search results -table.search(quries[0], query_type="fts").rerank(reranker=reranker).limit(5).to_pandas() -``` - -Each reranker can accept additional arguments. For example, `CohereReranker` accepts `top_k` and `batch_size` params to control the number of documents to rerank and the batch size for reranking respectively. Similarly, a custom reranker can accept any number of arguments based on the implementation. For example, a reranker can accept a `filter` that implements some custom logic to filter out documents before reranking. - -## Results - -Let us take a look at the same datasets from the previous sections, using the same embedding table but with Cohere reranker applied to all query types. - -!!! note "Note" - When reranking fts or vector search results, the search results are over-fetched by a factor of 2 and then reranked. From the reranked set, `top_k` (5 in this case) results are taken. This is done because reranking will have no effect on the hit-rate if we only fetch the `top_k` results. 
- -### Synthetic LLama2 paper dataset - -| Query Type | Hit-rate@5 | -| --- | --- | -| Vector | 0.640 | -| FTS | 0.595 | -| Reranked vector | 0.677 | -| Reranked fts | 0.672 | -| Hybrid | 0.759 | - -### Uber10K sec filing Dataset - -| Query Type | Hit-rate@5 | -| --- | --- | -| Vector | 0.608 | -| FTS | 0.824 | -| Reranked vector | 0.671 | -| Reranked fts | 0.843 | -| Hybrid | 0.849 | - - - - diff --git a/docs/src/guides/tuning_retrievers/3_embed_tuning.md b/docs/src/guides/tuning_retrievers/3_embed_tuning.md deleted file mode 100644 index 29a5dd12..00000000 --- a/docs/src/guides/tuning_retrievers/3_embed_tuning.md +++ /dev/null @@ -1,82 +0,0 @@ -## Finetuning the Embedding Model -Try it yourself: Open In Colab
- -Another way to improve retriever performance is to fine-tune the embedding model itself. Fine-tuning the embedding model can help in learning better representations for the documents and queries in the dataset. This can be particularly useful when the dataset is very different from the pre-trained data used to train the embedding model. - -We'll use the same dataset as in the previous sections. Start off by splitting the dataset into training and validation sets: -```python -from sklearn.model_selection import train_test_split - -train_df, validation_df = train_test_split("data_qa.csv", test_size=0.2, random_state=42) - -train_df.to_csv("data_train.csv", index=False) -validation_df.to_csv("data_val.csv", index=False) -``` - -You can use any tuning API to fine-tune embedding models. In this example, we'll utilise Llama-index as it also comes with utilities for synthetic data generation and training the model. - - -We parse the dataset as llama-index text nodes and generate synthetic QA pairs from each node: -```python -from llama_index.core.node_parser import SentenceSplitter -from llama_index.readers.file import PagedCSVReader -from llama_index.finetuning import generate_qa_embedding_pairs -from llama_index.core.evaluation import EmbeddingQAFinetuneDataset - -def load_corpus(file): - loader = PagedCSVReader(encoding="utf-8") - docs = loader.load_data(file=Path(file)) - - parser = SentenceSplitter() - nodes = parser.get_nodes_from_documents(docs) - - return nodes - -from llama_index.llms.openai import OpenAI - - -train_dataset = generate_qa_embedding_pairs( - llm=OpenAI(model="gpt-3.5-turbo"), nodes=train_nodes, verbose=False -) -val_dataset = generate_qa_embedding_pairs( - llm=OpenAI(model="gpt-3.5-turbo"), nodes=val_nodes, verbose=False -) -``` - -Now we'll use `SentenceTransformersFinetuneEngine` engine to fine-tune the model. 
You can also use `sentence-transformers` or `transformers` library to fine-tune the model: - -```python -from llama_index.finetuning import SentenceTransformersFinetuneEngine - -finetune_engine = SentenceTransformersFinetuneEngine( - train_dataset, - model_id="BAAI/bge-small-en-v1.5", - model_output_path="tuned_model", - val_dataset=val_dataset, -) -finetune_engine.finetune() -embed_model = finetune_engine.get_finetuned_model() -``` -This saves the fine tuned embedding model in `tuned_model` folder. - -# Evaluation results -In order to eval the retriever, you can either use this model to ingest the data into LanceDB directly or llama-index's LanceDB integration to create a `VectorStoreIndex` and use it as a retriever. -On performing the same hit-rate evaluation as before, we see a significant improvement in the hit-rate across all query types. - -### Baseline -| Query Type | Hit-rate@5 | -| --- | --- | -| Vector Search | 0.640 | -| Full-text Search | 0.595 | -| Reranked Vector Search | 0.677 | -| Reranked Full-text Search | 0.672 | -| Hybrid Search (w/ CohereReranker) | 0.759| - -### Fine-tuned model ( 2 iterations ) -| Query Type | Hit-rate@5 | -| --- | --- | -| Vector Search | 0.672 | -| Full-text Search | 0.595 | -| Reranked Vector Search | 0.754 | -| Reranked Full-text Search | 0.672| -| Hybrid Search (w/ CohereReranker) | 0.768 | diff --git a/docs/src/hybrid_search/eval.md b/docs/src/hybrid_search/eval.md deleted file mode 100644 index 0dd22fb1..00000000 --- a/docs/src/hybrid_search/eval.md +++ /dev/null @@ -1,51 +0,0 @@ -# Hybrid Search - -Hybrid Search is a broad (often misused) term. It can mean anything from combining multiple methods for searching, to applying ranking methods to better sort the results. In this blog, we use the definition of "hybrid search" to mean using a combination of keyword-based and vector search. 
- -## The challenge of (re)ranking search results -Once you have a group of the most relevant search results from multiple search sources, you'd likely standardize the score and rank them accordingly. This process can also be seen as another independent step: reranking. -There are two approaches for reranking search results from multiple sources. - -* Score-based: Calculate final relevance scores based on a weighted linear combination of individual search algorithm scores. Example: Weighted linear combination of semantic search & keyword-based search results. - -* Relevance-based: Discards the existing scores and calculates the relevance of each search result-query pair. Example: Cross Encoder models - -Even though there are many strategies for reranking search results, none works for all cases. Moreover, evaluating them itself is a challenge. Also, reranking can be dataset or application specific so it's hard to generalize. - -### Example evaluation of hybrid search with Reranking - -Here's some evaluation numbers from an experiment comparing these rerankers on about 800 queries. It is modified version of an evaluation script from [llama-index](https://github.com/run-llama/finetune-embedding/blob/main/evaluate.ipynb) that measures hit-rate at top-k. - - With OpenAI ada2 embedding - -Vector Search baseline: `0.64` - -| Reranker | Top-3 | Top-5 | Top-10 | -| --- | --- | --- | --- | -| Linear Combination | `0.73` | `0.74` | `0.85` | -| Cross Encoder | `0.71` | `0.70` | `0.77` | -| Cohere | `0.81` | `0.81` | `0.85` | -| ColBERT | `0.68` | `0.68` | `0.73` | - -

- -

- - With OpenAI embedding-v3-small - -Vector Search baseline: `0.59` - -| Reranker | Top-3 | Top-5 | Top-10 | -| --- | --- | --- | --- | -| Linear Combination | `0.68` | `0.70` | `0.84` | -| Cross Encoder | `0.72` | `0.72` | `0.79` | -| Cohere | `0.79` | `0.79` | `0.84` | -| ColBERT | `0.70` | `0.70` | `0.76` | - -

- -

- -### Conclusion - -The results show that the reranking methods are able to improve the search results. However, the improvement is not consistent across all rerankers. The choice of reranker depends on the dataset and the application. It is also important to note that the reranking methods are not a replacement for the search methods. They are complementary and should be used together to get the best results. The speed to recall tradeoff is also an important factor to consider when choosing the reranker. diff --git a/docs/src/hybrid_search/hybrid_search.md b/docs/src/hybrid_search/hybrid_search.md deleted file mode 100644 index 89f74fc4..00000000 --- a/docs/src/hybrid_search/hybrid_search.md +++ /dev/null @@ -1,63 +0,0 @@ -# Hybrid Search - -LanceDB supports both semantic and keyword-based search (also termed full-text search, or FTS). In real world applications, it is often useful to combine these two approaches to get the best best results. For example, you may want to search for a document that is semantically similar to a query document, but also contains a specific keyword. This is an example of *hybrid search*, a search algorithm that combines multiple search techniques. - -## Hybrid search in LanceDB -You can perform hybrid search in LanceDB by combining the results of semantic and full-text search via a reranking algorithm of your choice. LanceDB provides multiple rerankers out of the box. However, you can always write a custom reranker if your use case need more sophisticated logic . 
- -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:import-os" - --8<-- "python/python/tests/docs/test_search.py:import-openai" - --8<-- "python/python/tests/docs/test_search.py:import-lancedb" - --8<-- "python/python/tests/docs/test_search.py:import-embeddings" - --8<-- "python/python/tests/docs/test_search.py:import-pydantic" - --8<-- "python/python/tests/docs/test_search.py:import-lancedb-fts" - --8<-- "python/python/tests/docs/test_search.py:import-openai-embeddings" - --8<-- "python/python/tests/docs/test_search.py:class-Documents" - --8<-- "python/python/tests/docs/test_search.py:basic_hybrid_search" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:import-os" - --8<-- "python/python/tests/docs/test_search.py:import-openai" - --8<-- "python/python/tests/docs/test_search.py:import-lancedb" - --8<-- "python/python/tests/docs/test_search.py:import-embeddings" - --8<-- "python/python/tests/docs/test_search.py:import-pydantic" - --8<-- "python/python/tests/docs/test_search.py:import-lancedb-fts" - --8<-- "python/python/tests/docs/test_search.py:import-openai-embeddings" - --8<-- "python/python/tests/docs/test_search.py:class-Documents" - --8<-- "python/python/tests/docs/test_search.py:basic_hybrid_search_async" - ``` - -!!! Note - You can also pass the vector and text query manually. This is useful if you're not using the embedding API or if you're using a separate embedder service. -### Explicitly passing the vector and text query -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:hybrid_search_pass_vector_text" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:hybrid_search_pass_vector_text_async" - ``` - -By default, LanceDB uses `RRFReranker()`, which uses reciprocal rank fusion score, to combine and rerank the results of semantic and full-text search. 
You can customize the hyperparameters as needed or write your own custom reranker. Here's how you can use any of the available rerankers: - - -### `rerank()` arguments -* `normalize`: `str`, default `"score"`: - The method to normalize the scores. Can be "rank" or "score". If "rank", the scores are converted to ranks and then normalized. If "score", the scores are normalized directly. -* `reranker`: `Reranker`, default `RRF()`. - The reranker to use. If not specified, the default reranker is used. - - -## Available Rerankers -LanceDB provides a number of rerankers out of the box. You can use any of these rerankers by passing them to the `rerank()` method. -Go to [Rerankers](../reranking/index.md) to learn more about using the available rerankers and implementing custom rerankers. - - diff --git a/docs/src/index.md b/docs/src/index.md index 4e45297c..d8da266a 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,61 +1,7 @@ -# LanceDB +# API Reference -LanceDB is an open-source vector database for AI that's designed to store, manage, query and retrieve embeddings on large-scale multi-modal data. The core of LanceDB is written in Rust 🦀 and is built on top of [Lance](https://github.com/lancedb/lance), an open-source columnar data format designed for performant ML workloads and fast random access. +This page contains the API reference for the SDKs supported by the LanceDB team. -Both the database and the underlying data format are designed from the ground up to be **easy-to-use**, **scalable** and **cost-effective**. - -!!! tip "Hosted LanceDB" - If you want S3 cost-efficiency and local performance via a simple serverless API, checkout **LanceDB Cloud**. For private deployments, high performance at extreme scale, or if you have strict security requirements, talk to us about **LanceDB Enterprise**. 
[Learn more](https://docs.lancedb.com/) - -![](assets/lancedb_and_lance.png) - -## Truly multi-modal - -Most existing vector databases that store and query just the embeddings and their metadata. The actual data is stored elsewhere, requiring you to manage their storage and versioning separately. - -LanceDB supports storage of the *actual data itself*, alongside the embeddings and metadata. You can persist your images, videos, text documents, audio files and more in the Lance format, which provides automatic data versioning and blazing fast retrievals and filtering via LanceDB. - -## Open-source and cloud solutions - -LanceDB is available in two flavors: **OSS** and **Cloud**. - -LanceDB **OSS** is an **open-source**, batteries-included embedded vector database that you can run on your own infrastructure. "Embedded" means that it runs *in-process*, making it incredibly simple to self-host your own AI retrieval workflows for RAG and more. No servers, no hassle. - -LanceDB **Cloud** is a SaaS (software-as-a-service) solution that runs serverless in the cloud, making the storage clearly separated from compute. It's designed to be cost-effective and highly scalable without breaking the bank. LanceDB Cloud is currently in private beta with general availability coming soon, but you can apply for early access with the private beta release by signing up below. - -[Try out LanceDB Cloud (Public Beta) Now](https://cloud.lancedb.com){ .md-button .md-button--primary } - -## Why use LanceDB? 
- -* Embedded (OSS) and serverless (Cloud) - no need to manage servers - -* Fast production-scale vector similarity, full-text & hybrid search and a SQL query interface (via [DataFusion](https://github.com/apache/arrow-datafusion)) - -* Python, Javascript/Typescript, and Rust support - -* Store, query & manage multi-modal data (text, images, videos, point clouds, etc.), not just the embeddings and metadata - -* Tight integration with the [Arrow](https://arrow.apache.org/docs/format/Columnar.html) ecosystem, allowing true zero-copy access in shared memory with SIMD and GPU acceleration - -* Automatic data versioning to manage versions of your data without needing extra infrastructure - -* Disk-based index & storage, allowing for massive scalability without breaking the bank - -* Ingest your favorite data formats directly, like pandas DataFrames, Pydantic objects, Polars (coming soon), and more - -## Documentation guide - -The following pages go deeper into the internal of LanceDB and how to use it. 
- -* [Quick start](basic.md): Get started with LanceDB and vector DB concepts -* [Vector search concepts](concepts/vector_search.md): Understand the basics of vector search -* [Working with tables](guides/tables.md): Learn how to work with tables and their associated functions -* [Indexing](ann_indexes.md): Understand how to create indexes -* [Vector search](search.md): Learn how to perform vector similarity search -* [Full-text search (native)](fts.md): Learn how to perform full-text search -* [Full-text search (tantivy-based)](fts_tantivy.md): Learn how to perform full-text search using Tantivy -* [Managing embeddings](embeddings/index.md): Managing embeddings and the embedding functions API in LanceDB -* [Ecosystem Integrations](integrations/index.md): Integrate LanceDB with other tools in the data ecosystem -* [Python API Reference](python/python.md): Python OSS and Cloud API references -* [JavaScript API Reference](javascript/modules.md): JavaScript OSS and Cloud API references -* [Rust API Reference](https://docs.rs/lancedb/latest/lancedb/index.html): Rust API reference +- [Python](python/python.md) +- [JavaScript/TypeScript](js/globals.md) +- [Rust](https://docs.rs/lancedb/latest/lancedb/index.html) \ No newline at end of file diff --git a/docs/src/integrations/dlt.md b/docs/src/integrations/dlt.md deleted file mode 100644 index 009aa9d9..00000000 --- a/docs/src/integrations/dlt.md +++ /dev/null @@ -1,142 +0,0 @@ -# dlt - -[dlt](https://dlthub.com/docs/intro) is an open-source library that you can add to your Python scripts to load data from various and often messy data sources into well-structured, live datasets. dlt's [integration with LanceDB](https://dlthub.com/docs/dlt-ecosystem/destinations/lancedb) lets you ingest data from any source (databases, APIs, CSVs, dataframes, JSONs, and more) into LanceDB with a few lines of simple python code. 
The integration enables automatic normalization of nested data, schema inference, incremental loading and embedding the data. dlt also has integrations with several other tools like dbt, airflow, dagster etc. that can be inserted into your LanceDB workflow. - -## How to ingest data into LanceDB - -In this example, we will be fetching movie information from the [Open Movie Database (OMDb) API](https://www.omdbapi.com/) and loading it into a local LanceDB instance. To implement it, you will need an API key for the OMDb API (which can be created freely [here](https://www.omdbapi.com/apikey.aspx)). - -1. **Install `dlt` with LanceDB extras:** - ```sh - pip install dlt[lancedb] - ``` - -2. **Inside an empty directory, initialize a `dlt` project with:** - ```sh - dlt init rest_api lancedb - ``` - This will add all the files necessary to create a `dlt` pipeline that can ingest data from any REST API (ex: OMDb API) and load into LanceDB. - ```text - ├── .dlt - │ ├── config.toml - │ └── secrets.toml - ├── rest_api - ├── rest_api_pipeline.py - └── requirements.txt - ``` - - dlt has a list of pre-built [sources](https://dlthub.com/docs/dlt-ecosystem/verified-sources/) like [SQL databases](https://dlthub.com/docs/dlt-ecosystem/verified-sources/sql_database), [REST APIs](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api), [Google Sheets](https://dlthub.com/docs/dlt-ecosystem/verified-sources/google_sheets), [Notion](https://dlthub.com/docs/dlt-ecosystem/verified-sources/notion) etc., that can be used out-of-the-box by running `dlt init lancedb`. Since dlt is a python library, it is also very easy to modify these pre-built sources or to write your own custom source from scratch. - - -3. **Specify necessary credentials and/or embedding model details:** - - In order to fetch data from the OMDb API, you will need to pass a valid API key into your pipeline. 
Depending on whether you're using LanceDB OSS or LanceDB cloud, you also may need to provide the necessary credentials to connect to the LanceDB instance. These can be pasted inside `.dlt/secrets.toml`. - - dlt's LanceDB integration also allows you to automatically embed the data during ingestion. Depending on the embedding model chosen, you may need to paste the necessary credentials inside `.dlt/secrets.toml`: - ```toml - [sources.rest_api] - api_key = "api_key" # Enter the API key for the OMDb API - - [destination.lancedb] - embedding_model_provider = "sentence-transformers" - embedding_model = "all-MiniLM-L6-v2" - [destination.lancedb.credentials] - uri = ".lancedb" - api_key = "api_key" # API key to connect to LanceDB Cloud. Leave out if you are using LanceDB OSS. - embedding_model_provider_api_key = "embedding_model_provider_api_key" # Not needed for providers that don't need authentication (ollama, sentence-transformers). - ``` - See [here](https://dlthub.com/docs/dlt-ecosystem/destinations/lancedb#configure-the-destination) for more information and for a list of available models and model providers. - - -4. **Write the pipeline code inside `rest_api_pipeline.py`:** - - The following code shows how you can configure dlt's REST API source to connect to the [OMDb API](https://www.omdbapi.com/), fetch all movies with the word "godzilla" in the title, and load it into a LanceDB table. The REST API source allows you to pull data from any API with minimal code. To learn more, read the [dlt docs](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api).
- - ```python - - # Import necessary modules - import dlt - from rest_api import rest_api_source - - # Configure the REST API source - movies_source = rest_api_source( - { - "client": { - "base_url": "https://www.omdbapi.com/", - "auth": { # authentication strategy for the OMDb API - "type": "api_key", - "name": "apikey", - "api_key": dlt.secrets["sources.rest_api.api_token"], # read API credentials directly from secrets.toml - "location": "query" - }, - "paginator": { # pagination strategy for the OMDb API - "type": "page_number", - "base_page": 1, - "total_path": "totalResults", - "maximum_page": 5 - } - }, - "resources": [ # list of API endpoints to request - { - "name": "movie_search", - "endpoint": { - "path": "/", - "params": { - "s": "godzilla", - "type": "movie" - } - } - } - ] - }) - - - if __name__ == "__main__": - # Create a pipeline object - pipeline = dlt.pipeline( - pipeline_name='movies_pipeline', - destination='lancedb', # this tells dlt to load the data into LanceDB - dataset_name='movies_data_pipeline', - ) - - # Run the pipeline - load_info = pipeline.run(movies_source) - - # pretty print the information on data that was loaded - print(load_info) - ``` - - The script above will ingest the data into LanceDB as it is, i.e. without creating any embeddings. If we want to embed one of the fields (for example, `"Title"` that contains the movie titles), then we will use dlt's `lancedb_adapter` and modify the script as follows: - - - Add the following import statement: - ```python - from dlt.destinations.adapters import lancedb_adapter - ``` - - Modify the pipeline run like this: - ```python - load_info = pipeline.run( - lancedb_adapter( - movies_source, - embed="Title", - ) - ) - ``` - This will use the embedding model specified inside `.dlt/secrets.toml` to embed the field `"Title"`. - -5. 
**Install necessary dependencies:** - ```sh - pip install -r requirements.txt - ``` - - Note: You may need to install the dependencies for your embedding models separately. - ```sh - pip install sentence-transformers - ``` - -6. **Run the pipeline:** - Finally, running the following command will ingest the data into your LanceDB instance. - ```sh - python rest_api_pipeline.py - ``` - -For more information and advanced usage of dlt's LanceDB integration, read [the dlt documentation](https://dlthub.com/docs/dlt-ecosystem/destinations/lancedb). diff --git a/docs/src/integrations/genkit.md b/docs/src/integrations/genkit.md deleted file mode 100644 index cd88d5a9..00000000 --- a/docs/src/integrations/genkit.md +++ /dev/null @@ -1,183 +0,0 @@ -### genkitx-lancedb -This is a lancedb plugin for genkit framework. It allows you to use LanceDB for ingesting and retrieving data using genkit framework. - -![integration-banner-genkit](https://github.com/user-attachments/assets/a6cc28af-98e9-4425-b87c-7ab139bd7893) - -### Installation -```bash -pnpm install genkitx-lancedb -``` - -### Usage - -Adding LanceDB plugin to your genkit instance. 
- -```ts -import { lancedbIndexerRef, lancedb, lancedbRetrieverRef, WriteMode } from 'genkitx-lancedb'; -import { textEmbedding004, vertexAI } from '@genkit-ai/vertexai'; -import { gemini } from '@genkit-ai/vertexai'; -import { z, genkit } from 'genkit'; -import { Document } from 'genkit/retriever'; -import { chunk } from 'llm-chunk'; -import { readFile } from 'fs/promises'; -import path from 'path'; -import pdf from 'pdf-parse/lib/pdf-parse'; - -const ai = genkit({ - plugins: [ - // vertexAI provides the textEmbedding004 embedder - vertexAI(), - - // the local vector store requires an embedder to translate from text to vector - lancedb([ - { - dbUri: '.db', // optional lancedb uri, default to .db - tableName: 'table', // optional table name, default to table - embedder: textEmbedding004, - }, - ]), - ], -}); -``` - -You can run this app with the following command: -```bash -genkit start -- tsx --watch src/index.ts -``` - -This'll add LanceDB as a retriever and indexer to the genkit instance. You can see it in the GUI view -Screenshot 2025-05-11 at 7 21 05 PM - -**Testing retrieval on a sample table** -Let's see the raw retrieval results - -Screenshot 2025-05-11 at 7 21 05 PM -On running this query, you'll 5 results fetched from the lancedb table, where each result looks something like this: -Screenshot 2025-05-11 at 7 21 18 PM - - - -## Creating a custom RAG flow - -Now that we've seen how you can use LanceDB for in a genkit pipeline, let's refine the flow and create a RAG. A RAG flow will consist of an index and a retreiver with its outputs postprocessed an fed into an LLM for final response - -### Creating custom indexer flows -You can also create custom indexer flows, utilizing more options and features provided by LanceDB. 
- -```ts -export const menuPdfIndexer = lancedbIndexerRef({ - // Using all defaults, for dbUri, tableName, and embedder, etc -}); - -const chunkingConfig = { - minLength: 1000, - maxLength: 2000, - splitter: 'sentence', - overlap: 100, - delimiters: '', -} as any; - - -async function extractTextFromPdf(filePath: string) { - const pdfFile = path.resolve(filePath); - const dataBuffer = await readFile(pdfFile); - const data = await pdf(dataBuffer); - return data.text; -} - -export const indexMenu = ai.defineFlow( - { - name: 'indexMenu', - inputSchema: z.string().describe('PDF file path'), - outputSchema: z.void(), - }, - async (filePath: string) => { - filePath = path.resolve(filePath); - - // Read the pdf. - const pdfTxt = await ai.run('extract-text', () => - extractTextFromPdf(filePath) - ); - - // Divide the pdf text into segments. - const chunks = await ai.run('chunk-it', async () => - chunk(pdfTxt, chunkingConfig) - ); - - // Convert chunks of text into documents to store in the index. - const documents = chunks.map((text) => { - return Document.fromText(text, { filePath }); - }); - - // Add documents to the index. - await ai.index({ - indexer: menuPdfIndexer, - documents, - options: { - writeMode: WriteMode.Overwrite, - } as any - }); - } -); -``` - -Screenshot 2025-05-11 at 8 35 56 PM - -In your console, you can see the logs - -Screenshot 2025-05-11 at 7 19 14 PM - -### Creating custom retriever flows -You can also create custom retriever flows, utilizing more options and features provided by LanceDB. -```ts -export const menuRetriever = lancedbRetrieverRef({ - tableName: "table", // Use the same table name as the indexer. - displayName: "Menu", // Use a custom display name. 
- -export const menuQAFlow = ai.defineFlow( - { name: "Menu", inputSchema: z.string(), outputSchema: z.string() }, - async (input: string) => { - // retrieve relevant documents - const docs = await ai.retrieve({ - retriever: menuRetriever, - query: input, - options: { - k: 3, - }, - }); - - const extractedContent = docs.map(doc => { - if (doc.content && Array.isArray(doc.content) && doc.content.length > 0) { - if (doc.content[0].media && doc.content[0].media.url) { - return doc.content[0].media.url; - } - } - return "No content found"; - }); - - console.log("Extracted content:", extractedContent); - - const { text } = await ai.generate({ - model: gemini('gemini-2.0-flash'), - prompt: ` -You are acting as a helpful AI assistant that can answer -questions about the food available on the menu at Genkit Grub Pub. - -Use only the context provided to answer the question. -If you don't know, do not make up an answer. -Do not add or change items on the menu. - -Context: -${extractedContent.join('\n\n')} - -Question: ${input}`, - docs, - }); - - return text; - } -); -``` -Now using our retrieval flow, we can ask question about the ingsted PDF -Screenshot 2025-05-11 at 7 18 45 PM - diff --git a/docs/src/integrations/index.md b/docs/src/integrations/index.md deleted file mode 100644 index 6cab115d..00000000 --- a/docs/src/integrations/index.md +++ /dev/null @@ -1,19 +0,0 @@ -# Integrations - -LanceDB supports ingesting from and exporting to your favorite data formats across the Python and JavaScript ecosystems. - -![Illustration](../assets/ecosystem-illustration.png) - - -## Tools - -LanceDB is integrated with a lot of popular AI tools, with more coming soon. -Get started using these examples and quick links. - -| Integrations | | -|---|---:| -|

LlamaIndex

LlamaIndex is a simple, flexible data framework for connecting custom data sources to large language models. Llama index integrates with LanceDB as the serverless VectorDB.

[Learn More](https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html)

|image| -|

Langchain

Langchain allows building applications with LLMs through composability

[Learn More](https://lancedb.github.io/lancedb/integrations/langchain/) | image| -|

Langchain TS

Javascript bindings for Langchain. It integrates with LanceDB's serverless vectordb allowing you to build powerful AI applications through composability using only serverless functions.

[Learn More]( https://js.langchain.com/docs/modules/data_connection/vectorstores/integrations/lancedb) | image| -|

Voxel51

It is an open source toolkit that enables you to build better computer vision workflows by improving the quality of your datasets and delivering insights about your models.

[Learn More](./voxel51.md) | image| -|

PromptTools

Offers a set of free, open-source tools for testing and experimenting with models, prompts, and configurations. The core idea is to enable developers to evaluate prompts using familiar interfaces like code and notebooks. You can use it to experiment with different configurations of LanceDB, and test how LanceDB integrates with the LLM of your choice.

[Learn More](./prompttools.md) | image| diff --git a/docs/src/integrations/langchain.md b/docs/src/integrations/langchain.md deleted file mode 100644 index b0a6196c..00000000 --- a/docs/src/integrations/langchain.md +++ /dev/null @@ -1,266 +0,0 @@ -**LangChain** is a framework designed for building applications with large language models (LLMs) by chaining together various components. It supports a range of functionalities including memory, agents, and chat models, enabling developers to create context-aware applications. - -![Illustration](https://raw.githubusercontent.com/lancedb/assets/refs/heads/main/docs/assets/integration/langchain_rag.png) - -LangChain streamlines these stages (in figure above) by providing pre-built components and tools for integration, memory management, and deployment, allowing developers to focus on application logic rather than underlying complexities. - -Integration of **Langchain** with **LanceDB** enables applications to retrieve the most relevant data by comparing query vectors against stored vectors, facilitating effective information retrieval. It results in better and context aware replies and actions by the LLMs. - -## Quick Start -You can load your document data using langchain's loaders, for this example we are using `TextLoader` and `OpenAIEmbeddings` as the embedding model. Checkout Complete example here - [LangChain demo](../notebooks/langchain_example.ipynb) -```python -import os -from langchain.document_loaders import TextLoader -from langchain.vectorstores import LanceDB -from langchain_openai import OpenAIEmbeddings -from langchain_text_splitters import CharacterTextSplitter - -os.environ["OPENAI_API_KEY"] = "sk-..." 
- -loader = TextLoader("../../modules/state_of_the_union.txt") # Replace with your data path -documents = loader.load() - -documents = CharacterTextSplitter().split_documents(documents) -embeddings = OpenAIEmbeddings() - -docsearch = LanceDB.from_documents(documents, embeddings) -query = "What did the president say about Ketanji Brown Jackson" -docs = docsearch.similarity_search(query) -print(docs[0].page_content) -``` - -## Documentation -In the above example `LanceDB` vector store class object is created using `from_documents()` method which is a `classmethod` and returns the initialized class object. - -You can also use `LanceDB.from_texts(texts: List[str],embedding: Embeddings)` class method. - -The exhaustive list of parameters for `LanceDB` vector store are : - -|Name|type|Purpose|default| -|:----|:----|:----|:----| -|`connection`| (Optional) `Any` |`lancedb.db.LanceDBConnection` connection object to use. If not provided, a new connection will be created.|`None`| -|`embedding`| (Optional) `Embeddings` | Langchain embedding model.|Provided by user.| -|`uri`| (Optional) `str` |It specifies the directory location of **LanceDB database** and establishes a connection that can be used to interact with the database. |`/tmp/lancedb`| -|`vector_key` |(Optional) `str`| Column name to use for vector's in the table.|`'vector'`| -|`id_key` |(Optional) `str`| Column name to use for id's in the table.|`'id'`| -|`text_key` |(Optional) `str` |Column name to use for text in the table.|`'text'`| -|`table_name` |(Optional) `str`| Name of your table in the database.|`'vectorstore'`| -|`api_key` |(Optional `str`) |API key to use for LanceDB cloud database.|`None`| -|`region` |(Optional) `str`| Region to use for LanceDB cloud database.|Only for LanceDB Cloud : `None`.| -|`mode` |(Optional) `str` |Mode to use for adding data to the table. 
Valid values are "append" and "overwrite".|`'overwrite'`| -|`table`| (Optional) `Any`|You can connect to an existing table of LanceDB, created outside of langchain, and utilize it.|`None`| -|`distance`|(Optional) `str`|The choice of distance metric used to calculate the similarity between vectors.|`'l2'`| -|`reranker` |(Optional) `Any`|The reranker to use for LanceDB.|`None`| -|`relevance_score_fn` |(Optional) `Callable[[float], float]` | Langchain relevance score function to be used.|`None`| -|`limit`|`int`|Set the maximum number of results to return.|`DEFAULT_K` (it is 4)| - -```python -db_url = "db://lang_test" # url of db you created -api_key = "xxxxx" # your API key -region="us-east-1-dev" # your selected region - -vector_store = LanceDB( - uri=db_url, - api_key=api_key, #(dont include for local API) - region=region, #(dont include for local API) - embedding=embeddings, - table_name='langchain_test' # Optional - ) -``` - -### Methods - -##### add_texts() - -This method turn texts into embedding and add it to the database. - -|Name|Purpose|defaults| -|:---|:---|:---| -|`texts`|`Iterable` of strings to add to the vectorstore.|Provided by user| -|`metadatas`|Optional `list[dict()]` of metadatas associated with the texts.|`None`| -|`ids`|Optional `list` of ids to associate with the texts.|`None`| -|`kwargs`| Other keyworded arguments provided by the user. |-| - -It returns list of ids of the added texts. 
- -```python -vector_store.add_texts(texts = ['test_123'], metadatas =[{'source' :'wiki'}]) - -#Additionaly, to explore the table you can load it into a df or save it in a csv file: - -tbl = vector_store.get_table() -print("tbl:", tbl) -pd_df = tbl.to_pandas() -pd_df.to_csv("docsearch.csv", index=False) - -# you can also create a new vector store object using an older connection object: -vector_store = LanceDB(connection=tbl, embedding=embeddings) -``` - ------- - - -##### create_index() - -This method creates a scalar(for non-vector cols) or a vector index on a table. - -|Name|type|Purpose|defaults| -|:---|:---|:---|:---| -|`vector_col`|`Optional[str]`| Provide if you want to create index on a vector column. |`None`| -|`col_name`|`Optional[str]`| Provide if you want to create index on a non-vector column. |`None`| -|`metric`|`Optional[str]` |Provide the metric to use for vector index. choice of metrics: 'l2', 'dot', 'cosine'. |`l2`| -|`num_partitions`|`Optional[int]`|Number of partitions to use for the index.|`256`| -|`num_sub_vectors`|`Optional[int]` |Number of sub-vectors to use for the index.|`96`| -|`index_cache_size`|`Optional[int]` |Size of the index cache.|`None`| -|`name`|`Optional[str]` |Name of the table to create index on.|`None`| - -For index creation make sure your table has enough data in it. An ANN index is ususally not needed for datasets ~100K vectors. For large-scale (>1M) or higher dimension vectors, it is beneficial to create an ANN index. - -```python -# for creating vector index -vector_store.create_index(vector_col='vector', metric = 'cosine') - -# for creating scalar index(for non-vector columns) -vector_store.create_index(col_name='text') - -``` - ------- - -##### similarity_search() - -This method performs similarity search based on **text query**. 
- -| Name | Type | Purpose | Default | -|---------|----------------------|---------|---------| -| `query` | `str` | A `str` representing the text query that you want to search for in the vector store. | N/A | -| `k` | `Optional[int]` | It specifies the number of documents to return. | `None` | -| `filter` | `Optional[Dict[str, str]]`| It is used to filter the search results by specific metadata criteria. | `None` | -| `fts` | `Optional[bool]` | It indicates whether to perform a full-text search (FTS). | `False` | -| `name` | `Optional[str]` | It is used for specifying the name of the table to query. If not provided, it uses the default table set during the initialization of the LanceDB instance. | `None` | -| `kwargs` | `Any` | Other keyworded arguments provided by the user. | N/A | - -Return documents most similar to the query **without relevance scores**. - -```python -docs = docsearch.similarity_search(query) -print(docs[0].page_content) -``` - ------- - -##### similarity_search_by_vector() - -The method returns documents that are most similar to the specified **embedding (query) vector**. - -| Name | Type | Purpose | Default | -|-------------|---------------------------|---------|---------| -| `embedding` | `List[float]` | The embedding vector you want to use to search for similar documents in the vector store. | N/A | -| `k` | `Optional[int]` | It specifies the number of documents to return. | `None` | -| `filter` | `Optional[Dict[str, str]]`| It is used to filter the search results by specific metadata criteria. | `None` | -| `name` | `Optional[str]` | It is used for specifying the name of the table to query. If not provided, it uses the default table set during the initialization of the LanceDB instance. | `None` | -| `kwargs` | `Any` | Other keyworded arguments provided by the user. 
| N/A | - -**It does not provide relevance scores.** - -```python -docs = docsearch.similarity_search_by_vector(query) -print(docs[0].page_content) -``` - ------- - -##### similarity_search_with_score() - -Returns documents most similar to the **query string** along with their relevance scores. - -| Name | Type | Purpose | Default | -|----------|---------------------------|---------|---------| -| `query` | `str` |A `str` representing the text query you want to search for in the vector store. This query will be converted into an embedding using the specified embedding function. | N/A | -| `k` | `Optional[int]` | It specifies the number of documents to return. | `None` | -| `filter` | `Optional[Dict[str, str]]`| It is used to filter the search results by specific metadata criteria. This allows you to narrow down the search results based on certain metadata attributes associated with the documents. | `None` | -| `kwargs` | `Any` | Other keyworded arguments provided by the user. | N/A | - -It gets called by base class's `similarity_search_with_relevance_scores` which selects relevance score based on our `_select_relevance_score_fn`. - -```python -docs = docsearch.similarity_search_with_relevance_scores(query) -print("relevance score - ", docs[0][1]) -print("text- ", docs[0][0].page_content[:1000]) -``` - ------- - -##### similarity_search_by_vector_with_relevance_scores() - -Similarity search using **query vector**. - -| Name | Type | Purpose | Default | -|-------------|---------------------------|---------|---------| -| `embedding` | `List[float]` | The embedding vector you want to use to search for similar documents in the vector store. | N/A | -| `k` | `Optional[int]` | It specifies the number of documents to return. | `None` | -| `filter` | `Optional[Dict[str, str]]`| It is used to filter the search results by specific metadata criteria. | `None` | -| `name` | `Optional[str]` | It is used for specifying the name of the table to query. 
| `None` | -| `kwargs` | `Any` | Other keyworded arguments provided by the user. | N/A | - -The method returns documents most similar to the specified embedding (query) vector, along with their relevance scores. - -```python -docs = docsearch.similarity_search_by_vector_with_relevance_scores(query_embedding) -print("relevance score - ", docs[0][1]) -print("text- ", docs[0][0].page_content[:1000]) -``` - ------- - -##### max_marginal_relevance_search() - -This method returns docs selected using the maximal marginal relevance(MMR). -Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents. - -| Name | Type | Purpose | Default | -|---------------|-----------------|-----------|---------| -| `query` | `str` | Text to look up documents similar to. | N/A | -| `k` | `Optional[int]` | Number of Documents to return.| `4` | -| `fetch_k`| `Optional[int]`| Number of Documents to fetch to pass to MMR algorithm.| `None` | -| `lambda_mult` | `float` | Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. | `0.5` | -| `filter`| `Optional[Dict[str, str]]`| Filter by metadata. | `None` | -|`kwargs`| Other keyworded arguments provided by the user. |-| - -Similarly, `max_marginal_relevance_search_by_vector()` function returns docs most similar to the embedding passed to the function using MMR. instead of a string query you need to pass the embedding to be searched for. 
- -```python -result = docsearch.max_marginal_relevance_search( - query="text" - ) -result_texts = [doc.page_content for doc in result] -print(result_texts) - -## search by vector : -result = docsearch.max_marginal_relevance_search_by_vector( - embeddings.embed_query("text") - ) -result_texts = [doc.page_content for doc in result] -print(result_texts) -``` - ------- - -##### add_images() - -This method ddds images by automatically creating their embeddings and adds them to the vectorstore. - -| Name | Type | Purpose | Default | -|------------|-------------------------------|--------------------------------|---------| -| `uris` | `List[str]` | File path to the image | N/A | -| `metadatas`| `Optional[List[dict]]` | Optional list of metadatas | `None` | -| `ids` | `Optional[List[str]]` | Optional list of IDs | `None` | - -It returns list of IDs of the added images. - -```python -vec_store.add_images(uris=image_uris) -# here image_uris are local fs paths to the images. -``` - - diff --git a/docs/src/integrations/llamaIndex.md b/docs/src/integrations/llamaIndex.md deleted file mode 100644 index e608b83f..00000000 --- a/docs/src/integrations/llamaIndex.md +++ /dev/null @@ -1,142 +0,0 @@ -# Llama-Index -![Illustration](../assets/llama-index.jpg) - -## Quick start -You would need to install the integration via `pip install llama-index-vector-stores-lancedb` in order to use it. -You can run the below script to try it out : -```python -import logging -import sys - -# Uncomment to see debug logs -# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) -# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) - -from llama_index.core import SimpleDirectoryReader, Document, StorageContext -from llama_index.core import VectorStoreIndex -from llama_index.vector_stores.lancedb import LanceDBVectorStore -import textwrap -import openai - -openai.api_key = "sk-..." 
- -documents = SimpleDirectoryReader("./data/your-data-dir/").load_data() -print("Document ID:", documents[0].doc_id, "Document Hash:", documents[0].hash) - -## For LanceDB cloud : -# vector_store = LanceDBVectorStore( -# uri="db://db_name", # your remote DB URI -# api_key="sk_..", # lancedb cloud api key -# region="your-region" # the region you configured -# ... -# ) - -vector_store = LanceDBVectorStore( - uri="./lancedb", mode="overwrite", query_type="vector" -) -storage_context = StorageContext.from_defaults(vector_store=vector_store) - -index = VectorStoreIndex.from_documents( - documents, storage_context=storage_context -) -lance_filter = "metadata.file_name = 'paul_graham_essay.txt' " -retriever = index.as_retriever(vector_store_kwargs={"where": lance_filter}) -response = retriever.retrieve("What did the author do growing up?") -``` - -Checkout Complete example here - [LlamaIndex demo](../notebooks/LlamaIndex_example.ipynb) - -### Filtering -For metadata filtering, you can use a Lance SQL-like string filter as demonstrated in the example above. Additionally, you can also filter using the `MetadataFilters` class from LlamaIndex: -```python -from llama_index.core.vector_stores import ( - MetadataFilters, - FilterOperator, - FilterCondition, - MetadataFilter, -) - -query_filters = MetadataFilters( - filters=[ - MetadataFilter( - key="creation_date", operator=FilterOperator.EQ, value="2024-05-23" - ), - MetadataFilter( - key="file_size", value=75040, operator=FilterOperator.GT - ), - ], - condition=FilterCondition.AND, -) -``` - -### Hybrid Search -For complete documentation, refer [here](https://lancedb.github.io/lancedb/hybrid_search/hybrid_search/). This example uses the `colbert` reranker. Make sure to install necessary dependencies for the reranker you choose. 
-```python -from lancedb.rerankers import ColbertReranker - -reranker = ColbertReranker() -vector_store._add_reranker(reranker) - -query_engine = index.as_query_engine( - filters=query_filters, - vector_store_kwargs={ - "query_type": "hybrid", - } -) - -response = query_engine.query("How much did Viaweb charge per month?") -``` - -In the above snippet, you can change/specify query_type again when creating the engine/retriever. - -## API reference -The exhaustive list of parameters for `LanceDBVectorStore` vector store are : -- `connection`: Optional, `lancedb.db.LanceDBConnection` connection object to use. If not provided, a new connection will be created. -- `uri`: Optional[str], the uri of your database. Defaults to `"/tmp/lancedb"`. -- `table_name` : Optional[str], Name of your table in the database. Defaults to `"vectors"`. -- `table`: Optional[Any], `lancedb.db.LanceTable` object to be passed. Defaults to `None`. -- `vector_column_name`: Optional[Any], Column name to use for vector's in the table. Defaults to `'vector'`. -- `doc_id_key`: Optional[str], Column name to use for document id's in the table. Defaults to `'doc_id'`. -- `text_key`: Optional[str], Column name to use for text in the table. Defaults to `'text'`. -- `api_key`: Optional[str], API key to use for LanceDB cloud database. Defaults to `None`. -- `region`: Optional[str], Region to use for LanceDB cloud database. Only for LanceDB Cloud, defaults to `None`. -- `nprobes` : Optional[int], Set the number of probes to use. Only applicable if ANN index is created on the table else its ignored. Defaults to `20`. -- `refine_factor` : Optional[int], Refine the results by reading extra elements and re-ranking them in memory. Defaults to `None`. -- `reranker`: Optional[Any], The reranker to use for LanceDB. - Defaults to `None`. -- `overfetch_factor`: Optional[int], The factor by which to fetch more results. - Defaults to `1`. -- `mode`: Optional[str], The mode to use for LanceDB. 
- Defaults to `"overwrite"`. -- `query_type`:Optional[str], The type of query to use for LanceDB. - Defaults to `"vector"`. - - -### Methods - -- __from_table(cls, table: lancedb.db.LanceTable) -> `LanceDBVectorStore`__ : (class method) Creates instance from lancedb table. - -- **_add_reranker(self, reranker: lancedb.rerankers.Reranker) -> `None`** : Add a reranker to an existing vector store. - - Usage : - ```python - from lancedb.rerankers import ColbertReranker - reranker = ColbertReranker() - vector_store._add_reranker(reranker) - ``` -- **_table_exists(self, tbl_name: `Optional[str]` = `None`) -> `bool`** : Returns `True` if `tbl_name` exists in database. -- __create_index( - self, scalar: `Optional[bool]` = False, col_name: `Optional[str]` = None, num_partitions: `Optional[int]` = 256, num_sub_vectors: `Optional[int]` = 96, index_cache_size: `Optional[int]` = None, metric: `Optional[str]` = "l2", -) -> `None`__ : Creates a scalar(for non-vector cols) or a vector index on a table. - Make sure your vector column has enough data before creating an index on it. - -- __add(self, nodes: `List[BaseNode]`, **add_kwargs: `Any`, ) -> `List[str]`__ : -adds Nodes to the table - -- **delete(self, ref_doc_id: `str`) -> `None`**: Delete nodes using with node_ids. -- **delete_nodes(self, node_ids: `List[str]`) -> `None`** : Delete nodes using with node_ids. -- __query( - self, - query: `VectorStoreQuery`, - **kwargs: `Any`, - ) -> `VectorStoreQueryResult`__: - Query index(`VectorStoreIndex`) for top k most similar nodes. Accepts llamaIndex `VectorStoreQuery` object. \ No newline at end of file diff --git a/docs/src/integrations/phidata.md b/docs/src/integrations/phidata.md deleted file mode 100644 index fbabcf70..00000000 --- a/docs/src/integrations/phidata.md +++ /dev/null @@ -1,383 +0,0 @@ -**phidata** is a framework for building **AI Assistants** with long-term memory, contextual knowledge, and the ability to take actions using function calling. 
It helps turn general-purpose LLMs into specialized assistants tailored to your use case by extending its capabilities using **memory**, **knowledge**, and **tools**. - -- **Memory**: Stores chat history in a **database** and enables LLMs to have long-term conversations. -- **Knowledge**: Stores information in a **vector database** and provides LLMs with business context. (Here we will use LanceDB) -- **Tools**: Enable LLMs to take actions like pulling data from an **API**, **sending emails** or **querying a database**, etc. - -![example](https://raw.githubusercontent.com/lancedb/assets/refs/heads/main/docs/assets/integration/phidata_assistant.png) - -Memory & knowledge make LLMs smarter while tools make them autonomous. - -LanceDB is a vector database and its integration into phidata makes it easy for us to provide a **knowledge base** to LLMs. It enables us to store information as [embeddings](../embeddings/understanding_embeddings.md) and search for the **results** similar to ours using **query**. - -??? Question "What is Knowledge Base?" - Knowledge Base is a database of information that the Assistant can search to improve its responses. This information is stored in a vector database and provides LLMs with business context, which makes them respond in a context-aware manner. - - While any type of storage can act as a knowledge base, vector databases offer the best solution for retrieving relevant results from dense information quickly. - -Let's see how using LanceDB inside phidata helps in making LLM more useful: - -## Prerequisites: install and import necessary dependencies - -**Create a virtual environment** - -1. install virtualenv package - ```python - pip install virtualenv - ``` -2. Create a directory for your project and go to the directory and create a virtual environment inside it. - ```python - mkdir phi - ``` - ```python - cd phi - ``` - ```python - python -m venv phidata_ - ``` - -**Activating virtual environment** - -1. 
from inside the project directory, run the following command to activate the virtual environment. - ```python - phidata_/Scripts/activate - ``` - -**Install the following packages in the virtual environment** -```python -pip install lancedb phidata youtube_transcript_api openai ollama numpy pandas -``` - -**Create python files and import necessary libraries** - -You need to create two files - `transcript.py` and `ollama_assistant.py` or `openai_assistant.py` - -=== "openai_assistant.py" - - ```python - import os, openai - from rich.prompt import Prompt - from phi.assistant import Assistant - from phi.knowledge.text import TextKnowledgeBase - from phi.vectordb.lancedb import LanceDb - from phi.llm.openai import OpenAIChat - from phi.embedder.openai import OpenAIEmbedder - from transcript import extract_transcript - - if "OPENAI_API_KEY" not in os.environ: - # OR set the key here as a variable - openai.api_key = "sk-..." - - # The code below creates a file "transcript.txt" in the directory, the txt file will be used below - youtube_url = "https://www.youtube.com/watch?v=Xs33-Gzl8Mo" - segment_duration = 20 - transcript_text,dict_transcript = extract_transcript(youtube_url,segment_duration) - ``` - -=== "ollama_assistant.py" - - ```python - from rich.prompt import Prompt - from phi.assistant import Assistant - from phi.knowledge.text import TextKnowledgeBase - from phi.vectordb.lancedb import LanceDb - from phi.llm.ollama import Ollama - from phi.embedder.ollama import OllamaEmbedder - from transcript import extract_transcript - - # The code below creates a file "transcript.txt" in the directory, the txt file will be used below - youtube_url = "https://www.youtube.com/watch?v=Xs33-Gzl8Mo" - segment_duration = 20 - transcript_text,dict_transcript = extract_transcript(youtube_url,segment_duration) - ``` - -=== "transcript.py" - - ``` python - from youtube_transcript_api import YouTubeTranscriptApi - import re - - def smodify(seconds): - hours, remainder = 
divmod(seconds, 3600) - minutes, seconds = divmod(remainder, 60) - return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}" - - def extract_transcript(youtube_url,segment_duration): - # Extract video ID from the URL - video_id = re.search(r'(?<=v=)[\w-]+', youtube_url) - if not video_id: - video_id = re.search(r'(?<=be/)[\w-]+', youtube_url) - if not video_id: - return None - - video_id = video_id.group(0) - - # Attempt to fetch the transcript - try: - # Try to get the official transcript - transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en']) - except Exception: - # If no official transcript is found, try to get auto-generated transcript - try: - transcript_list = YouTubeTranscriptApi.list_transcripts(video_id) - for transcript in transcript_list: - transcript = transcript.translate('en').fetch() - except Exception: - return None - - # Format the transcript into 120s chunks - transcript_text,dict_transcript = format_transcript(transcript,segment_duration) - # Open the file in write mode, which creates it if it doesn't exist - with open("transcript.txt", "w",encoding="utf-8") as file: - file.write(transcript_text) - return transcript_text,dict_transcript - - def format_transcript(transcript,segment_duration): - chunked_transcript = [] - chunk_dict = [] - current_chunk = [] - current_time = 0 - # 2 minutes in seconds - start_time_chunk = 0 # To track the start time of the current chunk - - for segment in transcript: - start_time = segment['start'] - end_time_x = start_time + segment['duration'] - text = segment['text'] - - # Add text to the current chunk - current_chunk.append(text) - - # Update the current time with the duration of the current segment - # The duration of the current segment is given by segment['start'] - start_time_chunk - if current_chunk: - current_time = start_time - start_time_chunk - - # If current chunk duration reaches or exceeds 2 minutes, save the chunk - if current_time >= segment_duration: - # Use the start 
time of the first segment in the current chunk as the timestamp - chunked_transcript.append(f"[{smodify(start_time_chunk)} to {smodify(end_time_x)}] " + " ".join(current_chunk)) - current_chunk = re.sub(r'[\xa0\n]', lambda x: '' if x.group() == '\xa0' else ' ', "\n".join(current_chunk)) - chunk_dict.append({"timestamp":f"[{smodify(start_time_chunk)} to {smodify(end_time_x)}]", "text": "".join(current_chunk)}) - current_chunk = [] # Reset the chunk - start_time_chunk = start_time + segment['duration'] # Update the start time for the next chunk - current_time = 0 # Reset current time - - # Add any remaining text in the last chunk - if current_chunk: - chunked_transcript.append(f"[{smodify(start_time_chunk)} to {smodify(end_time_x)}] " + " ".join(current_chunk)) - current_chunk = re.sub(r'[\xa0\n]', lambda x: '' if x.group() == '\xa0' else ' ', "\n".join(current_chunk)) - chunk_dict.append({"timestamp":f"[{smodify(start_time_chunk)} to {smodify(end_time_x)}]", "text": "".join(current_chunk)}) - - return "\n\n".join(chunked_transcript), chunk_dict - ``` - -!!! warning - If creating Ollama assistant, download and install Ollama [from here](https://ollama.com/) and then run the Ollama instance in the background. Also, download the required models using `ollama pull `. 
Check out the models [here](https://ollama.com/library) - - -**Run the following command to deactivate the virtual environment if needed** -```python -deactivate -``` - -## **Step 1** - Create a Knowledge Base for AI Assistant using LanceDB - -=== "openai_assistant.py" - - ```python - # Create knowledge Base with OpenAIEmbedder in LanceDB - knowledge_base = TextKnowledgeBase( - path="transcript.txt", - vector_db=LanceDb( - embedder=OpenAIEmbedder(api_key = openai.api_key), - table_name="transcript_documents", - uri="./t3mp/.lancedb", - ), - num_documents = 10 - ) - ``` - -=== "ollama_assistant.py" - - ```python - # Create knowledge Base with OllamaEmbedder in LanceDB - knowledge_base = TextKnowledgeBase( - path="transcript.txt", - vector_db=LanceDb( - embedder=OllamaEmbedder(model="nomic-embed-text",dimensions=768), - table_name="transcript_documents", - uri="./t2mp/.lancedb", - ), - num_documents = 10 - ) - ``` -Check out the list of **embedders** supported by **phidata** and their usage [here](https://docs.phidata.com/embedder/introduction). - -Here we have used `TextKnowledgeBase`, which loads text/docx files to the knowledge base. - -Let's see all the parameters that `TextKnowledgeBase` takes - - -| Name| Type | Purpose | Default | -|:----|:-----|:--------|:--------| -|`path`|`Union[str, Path]`| Path to text file(s). It can point to a single text file or a directory of text files.| provided by user | -|`formats`|`List[str]`| File formats accepted by this knowledge base. |`[".txt"]`| -|`vector_db`|`VectorDb`| Vector Database for the Knowledge Base. phidata provides a wrapper around many vector DBs, you can import it like this - `from phi.vectordb.lancedb import LanceDb` | provided by user | -|`num_documents`|`int`| Number of results (documents/vectors) that vector search should return. 
|`5`| -|`reader`|`TextReader`| phidata provides many types of reader objects which read data, clean it and create chunks of data, encapsulate each chunk inside an object of the `Document` class, and return **`List[Document]`**. | `TextReader()` | -|`optimize_on`|`int`| It is used to specify the number of documents on which to optimize the vector database. Supposed to create an index. |`1000`| - -??? Tip "Wonder! What is `Document` class?" - We know that, before storing the data in vectorDB, we need to split the data into smaller chunks upon which embeddings will be created and these embeddings along with the chunks will be stored in vectorDB. When the user queries over the vectorDB, some of these embeddings will be returned as the result based on the semantic similarity with the query. - - When the user queries over vectorDB, the queries are converted into embeddings, and a nearest neighbor search is performed over these query embeddings which returns the embeddings that correspond to most semantically similar chunks(parts of our data) present in vectorDB. - - Here, a “Document” is a class in phidata. Since there is an option to let phidata create and manage embeddings, it splits our data into smaller chunks(as expected). It does not directly create embeddings on it. Instead, it takes each chunk and encapsulates it inside the object of the `Document` class along with various other metadata related to the chunk. Then embeddings are created on these `Document` objects and stored in vectorDB. - - ```python - class Document(BaseModel): - """Model for managing a document""" - - content: str # <--- here data of chunk is stored - id: Optional[str] = None - name: Optional[str] = None - meta_data: Dict[str, Any] = {} - embedder: Optional[Embedder] = None - embedding: Optional[List[float]] = None - usage: Optional[Dict[str, Any]] = None - ``` - -However, using phidata you can load many other types of data in the knowledge base(other than text). 
Check out [phidata Knowledge Base](https://docs.phidata.com/knowledge/introduction) for more information. - -Let's dig deeper into the `vector_db` parameter and see what parameters `LanceDb` takes - - -| Name| Type | Purpose | Default | -|:----|:-----|:--------|:--------| -|`embedder`|`Embedder`| phidata provides many Embedders that abstract the interaction with embedding APIs and utilize it to generate embeddings. Check out other embedders [here](https://docs.phidata.com/embedder/introduction) | `OpenAIEmbedder` | -|`distance`|`List[str]`| The choice of distance metric used to calculate the similarity between vectors, which directly impacts search results and performance in vector databases. |`Distance.cosine`| -|`connection`|`lancedb.db.LanceTable`| LanceTable can be accessed through `.connection`. You can connect to an existing table of LanceDB, created outside of phidata, and utilize it. If not provided, it creates a new table using `table_name` parameter and adds it to `connection`. |`None`| -|`uri`|`str`| It specifies the directory location of **LanceDB database** and establishes a connection that can be used to interact with the database. | `"/tmp/lancedb"` | -|`table_name`|`str`| If `connection` is not provided, it initializes and connects to a new **LanceDB table** with a specified(or default) name in the database present at `uri`. |`"phi"`| -|`nprobes`|`int`| It refers to the number of partitions that the search algorithm examines to find the nearest neighbors of a given query vector. Higher values will yield better recall (more likely to find vectors if they exist) at the expense of latency. |`20`| - - -!!! note - Since we just initialized the KnowledgeBase. The VectorDB table that corresponds to this Knowledge Base is not yet populated with our data. It will be populated in **Step 3**, once we perform the `load` operation. 
- - You can check the state of the LanceDB table using - `knowledge_base.vector_db.connection.to_pandas()` - -Now that the Knowledge Base is initialized, we can go to **step 2**. - -## **Step 2** - Create an assistant with our choice of LLM and reference to the knowledge base. - - -=== "openai_assistant.py" - - ```python - # define an assistant with gpt-4o-mini llm and reference to the knowledge base created above - assistant = Assistant( - llm=OpenAIChat(model="gpt-4o-mini", max_tokens=1000, temperature=0.3,api_key = openai.api_key), - description="""You are an Expert in explaining youtube video transcripts. You are a bot that takes transcript of a video and answer the question based on it. - - This is transcript for the above timestamp: {relevant_document} - The user input is: {user_input} - generate highlights only when asked. - When asked to generate highlights from the video, understand the context for each timestamp and create key highlight points, answer in following way - - [timestamp] - highlight 1 - [timestamp] - highlight 2 - ... so on - - Your task is to understand the user question, and provide an answer using the provided contexts. Your answers are correct, high-quality, and written by an domain expert. If the provided context does not contain the answer, simply state,'The provided context does not have the answer.'""", - knowledge_base=knowledge_base, - add_references_to_prompt=True, - ) - ``` - -=== "ollama_assistant.py" - - ```python - # define an assistant with llama3.1 llm and reference to the knowledge base created above - assistant = Assistant( - llm=Ollama(model="llama3.1"), - description="""You are an Expert in explaining youtube video transcripts. You are a bot that takes transcript of a video and answer the question based on it. - - This is transcript for the above timestamp: {relevant_document} - The user input is: {user_input} - generate highlights only when asked. 
- When asked to generate highlights from the video, understand the context for each timestamp and create key highlight points, answer in following way - - [timestamp] - highlight 1 - [timestamp] - highlight 2 - ... so on - - Your task is to understand the user question, and provide an answer using the provided contexts. Your answers are correct, high-quality, and written by an domain expert. If the provided context does not contain the answer, simply state,'The provided context does not have the answer.'""", - knowledge_base=knowledge_base, - add_references_to_prompt=True, - ) - ``` - -Assistants add **memory**, **knowledge**, and **tools** to LLMs. Here we will add only **knowledge** in this example. - -Whenever we give a query to the LLM, the assistant will retrieve relevant information from our **Knowledge Base**(table in LanceDB) and pass it to the LLM along with the user query in a structured way. - -- The `add_references_to_prompt=True` always adds information from the knowledge base to the prompt, regardless of whether it is relevant to the question. - -To know more about creating an assistant in phidata, check out the [phidata docs](https://docs.phidata.com/assistants/introduction). - -## **Step 3** - Load data to Knowledge Base. - -```python -# load our data into the knowledge_base (populating the LanceTable) -assistant.knowledge_base.load(recreate=False) -``` -The above code loads the data to the Knowledge Base(LanceDB Table) and now it is ready to be used by the assistant. - -| Name| Type | Purpose | Default | -|:----|:-----|:--------|:--------| -|`recreate`|`bool`| If True, it drops the existing table and recreates the table in the vectorDB. |`False`| -|`upsert`|`bool`| If True and the vectorDB supports upsert, it will upsert documents to the vector db. | `False` | -|`skip_existing`|`bool`| If True, skips documents that already exist in the vectorDB when inserting. |`True`| - -??? tip "What is upsert?" 
- Upsert is a database operation that combines "update" and "insert". It updates existing records if a document with the same identifier already exists, or inserts new records if no matching record exists. This is useful for maintaining the most current information without manually checking for existence. - -During the Load operation, phidata directly interacts with the LanceDB library and performs the loading of the table with our data in the following steps - - -1. **Creates** and **initializes** the table if it does not exist. - -2. Then it **splits** our data into smaller **chunks**. - - ??? question "How do they create chunks?" - **phidata** provides many types of **Knowledge Bases** based on the type of data. Most of them :material-information-outline:{ title="except LlamaIndexKnowledgeBase and LangChainKnowledgeBase"} have a property method called `document_lists` of type `Iterator[List[Document]]`. During the load operation, this property method is invoked. It traverses the data provided by us (in this case, a text file(s)) using `reader`. Then it **reads**, **creates chunks**, and **encapsulates** each chunk inside a `Document` object and yields **lists of `Document` objects** that contain our data. - -3. Then **embeddings** are created on these chunks and **inserted** into the LanceDB Table - - ??? question "How do they insert your data as different rows in LanceDB Table?" - The chunks of your data are in the form - **lists of `Document` objects**. They were yielded in the step above. - - For each `Document` in `List[Document]`, it does the following operations: - - - Creates embedding on `Document`. - - Cleans the **content attribute**(chunks of our data is here) of `Document`. - - Prepares data by creating `id` and loading `payload` with the metadata related to this chunk. (1) - { .annotate } - - 1. 
Three columns will be added to the table - `"id"`, `"vector"`, and `"payload"` (payload contains various metadata including **`content`**) - - - Then add this data to LanceTable. - -4. Now the internal state of `knowledge_base` is changed (embeddings are created and loaded in the table) and it is **ready to be used by the assistant**. - -## **Step 4** - Start a cli chatbot with access to the Knowledge base - -```python -# start cli chatbot with knowledge base -assistant.print_response("Ask me about something from the knowledge base") -while True: - message = Prompt.ask(f"[bold] :sunglasses: User [/bold]") - if message in ("exit", "bye"): - break - assistant.print_response(message, markdown=True) -``` - - -For more information and amazing cookbooks of phidata, read the [phidata documentation](https://docs.phidata.com/introduction) and also visit [LanceDB x phidata documentation](https://docs.phidata.com/vectordb/lancedb). \ No newline at end of file diff --git a/docs/src/integrations/prompttools.md b/docs/src/integrations/prompttools.md deleted file mode 100644 index 772d5072..00000000 --- a/docs/src/integrations/prompttools.md +++ /dev/null @@ -1,9 +0,0 @@ - -[PromptTools](https://github.com/hegelai/prompttools) offers a set of free, open-source tools for testing and experimenting with models, prompts, and configurations. The core idea is to enable developers to evaluate prompts using familiar interfaces like code and notebooks. You can use it to experiment with different configurations of LanceDB, and test how LanceDB integrates with the LLM of your choice. 
- - -Open In Colab - -![Alt text](https://prompttools.readthedocs.io/en/latest/_images/demo.gif "a title") - diff --git a/docs/src/integrations/voxel51.md b/docs/src/integrations/voxel51.md deleted file mode 100644 index 49f12863..00000000 --- a/docs/src/integrations/voxel51.md +++ /dev/null @@ -1,232 +0,0 @@ -# FiftyOne - -FiftyOne is an open source toolkit that enables users to curate better data and build better models. It includes tools for data exploration, visualization, and management, as well as features for collaboration and sharing. - -Any developers, data scientists, and researchers who work with computer vision and machine learning can use FiftyOne to improve the quality of their datasets and deliver insights about their models. - - -![example](../assets/voxel.gif) - -**FiftyOne** provides an API to create LanceDB tables and run similarity queries, both **programmatically in Python** and via **point-and-click in the App**. - -Let's get started and see how to use **LanceDB** to create a **similarity index** on your FiftyOne datasets. - -## Overview - -**[Embeddings](../embeddings/understanding_embeddings.md)** are foundational to all of the **vector search** features. In FiftyOne, embeddings are managed by the [**FiftyOne Brain**](https://docs.voxel51.com/user_guide/brain.html) that provides powerful machine learning techniques designed to transform how you curate your data from an art into a measurable science. - -!!!question "Have you ever wanted to find the images most similar to an image in your dataset?" - The **FiftyOne Brain** makes computing **visual similarity** really easy. You can compute the similarity of samples in your dataset using an embedding model and store the results in the **brain key**. - - You can then sort your samples by similarity or use this information to find potential duplicate images. - -Here we will be doing the following : - -1. 
**Create Index** - In order to run similarity queries against our media, we need to **index** the data. We can do this via the `compute_similarity()` function. - - - In the function, specify the **model** you want to use to generate the embedding vectors, and what **vector search engine** you want to use on the **backend** (here LanceDB). - - !!!tip - You can also give the similarity index a name(`brain_key`), which is useful if you want to run vector searches against multiple indexes. - -2. **Query** - Once you have generated your similarity index, you can query your dataset with `sort_by_similarity()`. The query can be any of the following: - - - An ID (sample or patch) - - A query vector of the same dimension as the index - - A list of IDs (samples or patches) - - A text prompt (search semantically) - -## Prerequisites: install necessary dependencies - -1. **Create and activate a virtual environment** - - Install virtualenv package and run the following command in your project directory. - ```python - python -m venv fiftyone_ - ``` - From inside the project directory run the following to activate the virtual environment. - === "Windows" - - ```python - fiftyone_/Scripts/activate - ``` - - === "macOS/Linux" - - ```python - source fiftyone_/bin/activate - ``` - -2. **Install the following packages in the virtual environment** - - To install FiftyOne, ensure you have activated any virtual environment that you are using, then run - ```python - pip install fiftyone - ``` - - -## Understand basic workflow - -The basic workflow shown below uses LanceDB to create a similarity index on your FiftyOne datasets: - -1. Load a dataset into FiftyOne. - -2. Compute embedding vectors for samples or patches in your dataset, or select a model to use to generate embeddings. - -3. 
Use the `compute_similarity()` method to generate a LanceDB table for the samples or object patches embeddings in a dataset by setting the parameter `backend="lancedb"` and specifying a `brain_key` of your choice. - -4. Use this LanceDB table to query your data with `sort_by_similarity()`. - -5. If desired, delete the table. - -## Quick Example - -Let's jump into a quick example that demonstrates this workflow. - - -```python - -import fiftyone as fo -import fiftyone.brain as fob -import fiftyone.zoo as foz - -# Step 1: Load your data into FiftyOne -dataset = foz.load_zoo_dataset("quickstart") -``` -Make sure you install torch ([guide here](https://pytorch.org/get-started/locally/)) before proceeding. - -```python -# Steps 2 and 3: Compute embeddings and create a similarity index -lancedb_index = fob.compute_similarity( - dataset, - model="clip-vit-base32-torch", - brain_key="lancedb_index", - backend="lancedb", -) -``` - -!!! note - Running the code above will download the clip model (2.6 GB) - -Once the similarity index has been generated, we can query our data in FiftyOne by specifying the `brain_key`: - -```python -# Step 4: Query your data -query = dataset.first().id # query by sample ID -view = dataset.sort_by_similarity( - query, - brain_key="lancedb_index", - k=10, # limit to 10 most similar samples -) -``` -The returned results are of type `DatasetView`. - -!!! note - `DatasetView` does not hold its contents in-memory. Views simply store the rule(s) that are applied to extract the content of interest from the underlying Dataset when the view is iterated/aggregated on. - - This means, for example, that the contents of a `DatasetView` may change as the underlying Dataset is modified. - -??? question "Can you query a view instead of a dataset?" - Yes, you can also query a view. 
- - Performing a similarity search on a `DatasetView` will only return results from the view; if the view contains samples that were not included in the index, they will never be included in the result. - - This means that you can index an entire Dataset once and then perform searches on subsets of the dataset by constructing views that contain the images of interest. - -```python -# Step 5 (optional): Cleanup - -# Delete the LanceDB table -lancedb_index.cleanup() - -# Delete run record from FiftyOne -dataset.delete_brain_run("lancedb_index") -``` - - -## Using LanceDB backend -By default, calling `compute_similarity()` or `sort_by_similarity()` will use an sklearn backend. - -To use the LanceDB backend, simply set the optional `backend` parameter of `compute_similarity()` to `"lancedb"`: - -```python -import fiftyone.brain as fob -#... rest of the code -fob.compute_similarity(..., backend="lancedb", ...) -``` - -Alternatively, you can configure FiftyOne to use the LanceDB backend by setting the following environment variable. - -In your terminal, set the environment variable using: -=== "Windows" - - ```python - $Env:FIFTYONE_BRAIN_DEFAULT_SIMILARITY_BACKEND="lancedb" //powershell - - set FIFTYONE_BRAIN_DEFAULT_SIMILARITY_BACKEND=lancedb //cmd - ``` - -=== "macOS/Linux" - - ```python - export FIFTYONE_BRAIN_DEFAULT_SIMILARITY_BACKEND=lancedb - ``` - -!!! note - This will only run during the terminal session. Once terminal is closed, environment variable is deleted. - -Alternatively, you can **permanently** configure FiftyOne to use the LanceDB backend creating a `brain_config.json` at `~/.fiftyone/brain_config.json`. The JSON file may contain any desired subset of config fields that you wish to customize. - -```json -{ - "default_similarity_backend": "lancedb" -} -``` -This will override the default `brain_config` and will set it according to your customization. 
You can check the configuration by running the following code : - -```python -import fiftyone.brain as fob -# Print your current brain config -print(fob.brain_config) -``` - -## LanceDB config parameters - -The LanceDB backend supports query parameters that can be used to customize your similarity queries. These parameters include: - -| Name| Purpose | Default | -|:----|:--------|:--------| -|**table_name**|The name of the LanceDB table to use. If none is provided, a new table will be created|`None`| -|**metric**|The embedding distance metric to use when creating a new table. The supported values are ("cosine", "euclidean")|`"cosine"`| -|**uri**| The database URI to use. In this Database URI, tables will be created. |`"/tmp/lancedb"`| - -There are two ways to specify/customize the parameters: - -1. **Using `brain_config.json` file** - - ```json - { - "similarity_backends": { - "lancedb": { - "table_name": "your-table", - "metric": "euclidean", - "uri": "/tmp/lancedb" - } - } - } - ``` - -2. **Directly passing to `compute_similarity()` to configure a specific new index** : - - ```python - lancedb_index = fob.compute_similarity( - ... - backend="lancedb", - brain_key="lancedb_index", - table_name="your-table", - metric="euclidean", - uri="/tmp/lancedb", - ) - ``` - -For a much more in depth walkthrough of the integration, visit the LanceDB x Voxel51 [docs page](https://docs.voxel51.com/integrations/lancedb.html). diff --git a/docs/src/javascript/.nojekyll b/docs/src/javascript/.nojekyll deleted file mode 100644 index e2ac6616..00000000 --- a/docs/src/javascript/.nojekyll +++ /dev/null @@ -1 +0,0 @@ -TypeDoc added this file to prevent GitHub Pages from using Jekyll. You can turn off this behavior by setting the `githubPages` option to false. 
\ No newline at end of file diff --git a/docs/src/javascript/README.md b/docs/src/javascript/README.md deleted file mode 100644 index 5c455510..00000000 --- a/docs/src/javascript/README.md +++ /dev/null @@ -1,65 +0,0 @@ -vectordb / [Exports](modules.md) - -# LanceDB - -A JavaScript / Node.js library for [LanceDB](https://github.com/lancedb/lancedb). - -## Installation - -```bash -npm install vectordb -``` - -This will download the appropriate native library for your platform. We currently -support: - -* Linux (x86_64 and aarch64) -* MacOS (Intel and ARM/M1/M2) -* Windows (x86_64 only) - -We do not yet support musl-based Linux (such as Alpine Linux) or aarch64 Windows. - -## Usage - -### Basic Example - -```javascript -const lancedb = require('vectordb'); -const db = await lancedb.connect('data/sample-lancedb'); -const table = await db.createTable("my_table", - [{ id: 1, vector: [0.1, 1.0], item: "foo", price: 10.0 }, - { id: 2, vector: [3.9, 0.5], item: "bar", price: 20.0 }]) -const results = await table.search([0.1, 0.3]).limit(20).execute(); -console.log(results); -``` - -The [examples](./examples) folder contains complete examples. - -## Development - -To build everything fresh: - -```bash -npm install -npm run build -``` - -Then you should be able to run the tests with: - -```bash -npm test -``` - -### Fix lints - -To run the linter and have it automatically fix all errors - -```bash -npm run lint -- --fix -``` - -To build documentation - -```bash -npx typedoc --plugin typedoc-plugin-markdown --out ../docs/src/javascript src/index.ts -``` diff --git a/docs/src/javascript/classes/DefaultWriteOptions.md b/docs/src/javascript/classes/DefaultWriteOptions.md deleted file mode 100644 index 2103dc98..00000000 --- a/docs/src/javascript/classes/DefaultWriteOptions.md +++ /dev/null @@ -1,41 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / DefaultWriteOptions - -# Class: DefaultWriteOptions - -Write options when creating a Table. 
- -## Implements - -- [`WriteOptions`](../interfaces/WriteOptions.md) - -## Table of contents - -### Constructors - -- [constructor](DefaultWriteOptions.md#constructor) - -### Properties - -- [writeMode](DefaultWriteOptions.md#writemode) - -## Constructors - -### constructor - -• **new DefaultWriteOptions**() - -## Properties - -### writeMode - -• **writeMode**: [`WriteMode`](../enums/WriteMode.md) = `WriteMode.Create` - -A [WriteMode](../enums/WriteMode.md) to use on this operation - -#### Implementation of - -[WriteOptions](../interfaces/WriteOptions.md).[writeMode](../interfaces/WriteOptions.md#writemode) - -#### Defined in - -[index.ts:1359](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1359) diff --git a/docs/src/javascript/classes/LocalConnection.md b/docs/src/javascript/classes/LocalConnection.md deleted file mode 100644 index cfb0a4b9..00000000 --- a/docs/src/javascript/classes/LocalConnection.md +++ /dev/null @@ -1,322 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / LocalConnection - -# Class: LocalConnection - -A connection to a LanceDB database. 
- -## Implements - -- [`Connection`](../interfaces/Connection.md) - -## Table of contents - -### Constructors - -- [constructor](LocalConnection.md#constructor) - -### Properties - -- [\_db](LocalConnection.md#_db) -- [\_options](LocalConnection.md#_options) - -### Accessors - -- [uri](LocalConnection.md#uri) - -### Methods - -- [createTable](LocalConnection.md#createtable) -- [createTableImpl](LocalConnection.md#createtableimpl) -- [dropTable](LocalConnection.md#droptable) -- [openTable](LocalConnection.md#opentable) -- [tableNames](LocalConnection.md#tablenames) -- [withMiddleware](LocalConnection.md#withmiddleware) - -## Constructors - -### constructor - -• **new LocalConnection**(`db`, `options`) - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `db` | `any` | -| `options` | [`ConnectionOptions`](../interfaces/ConnectionOptions.md) | - -#### Defined in - -[index.ts:739](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L739) - -## Properties - -### \_db - -• `Private` `Readonly` **\_db**: `any` - -#### Defined in - -[index.ts:737](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L737) - -___ - -### \_options - -• `Private` `Readonly` **\_options**: () => [`ConnectionOptions`](../interfaces/ConnectionOptions.md) - -#### Type declaration - -▸ (): [`ConnectionOptions`](../interfaces/ConnectionOptions.md) - -##### Returns - -[`ConnectionOptions`](../interfaces/ConnectionOptions.md) - -#### Defined in - -[index.ts:736](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L736) - -## Accessors - -### uri - -• `get` **uri**(): `string` - -#### Returns - -`string` - -#### Implementation of - -[Connection](../interfaces/Connection.md).[uri](../interfaces/Connection.md#uri) - -#### Defined in - -[index.ts:744](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L744) - -## Methods - -### createTable - -▸ **createTable**\<`T`\>(`name`, `data?`, `optsOrEmbedding?`, `opt?`): 
`Promise`\<[`Table`](../interfaces/Table.md)\<`T`\>\> - -Creates a new Table, optionally initializing it with new data. - -#### Type parameters - -| Name | -| :------ | -| `T` | - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `name` | `string` \| [`CreateTableOptions`](../interfaces/CreateTableOptions.md)\<`T`\> | -| `data?` | `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[] | -| `optsOrEmbedding?` | [`WriteOptions`](../interfaces/WriteOptions.md) \| [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)\<`T`\> | -| `opt?` | [`WriteOptions`](../interfaces/WriteOptions.md) | - -#### Returns - -`Promise`\<[`Table`](../interfaces/Table.md)\<`T`\>\> - -#### Implementation of - -[Connection](../interfaces/Connection.md).[createTable](../interfaces/Connection.md#createtable) - -#### Defined in - -[index.ts:788](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L788) - -___ - -### createTableImpl - -▸ `Private` **createTableImpl**\<`T`\>(`«destructured»`): `Promise`\<[`Table`](../interfaces/Table.md)\<`T`\>\> - -#### Type parameters - -| Name | -| :------ | -| `T` | - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `«destructured»` | `Object` | -| › `data?` | `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[] | -| › `embeddingFunction?` | [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)\<`T`\> | -| › `name` | `string` | -| › `schema?` | `Schema`\<`any`\> | -| › `writeOptions?` | [`WriteOptions`](../interfaces/WriteOptions.md) | - -#### Returns - -`Promise`\<[`Table`](../interfaces/Table.md)\<`T`\>\> - -#### Defined in - -[index.ts:822](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L822) - -___ - -### dropTable - -▸ **dropTable**(`name`): `Promise`\<`void`\> - -Drop an existing table. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `name` | `string` | The name of the table to drop. 
| - -#### Returns - -`Promise`\<`void`\> - -#### Implementation of - -[Connection](../interfaces/Connection.md).[dropTable](../interfaces/Connection.md#droptable) - -#### Defined in - -[index.ts:876](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L876) - -___ - -### openTable - -▸ **openTable**(`name`): `Promise`\<[`Table`](../interfaces/Table.md)\<`number`[]\>\> - -Open a table in the database. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `name` | `string` | The name of the table. | - -#### Returns - -`Promise`\<[`Table`](../interfaces/Table.md)\<`number`[]\>\> - -#### Implementation of - -[Connection](../interfaces/Connection.md).[openTable](../interfaces/Connection.md#opentable) - -#### Defined in - -[index.ts:760](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L760) - -▸ **openTable**\<`T`\>(`name`, `embeddings`): `Promise`\<[`Table`](../interfaces/Table.md)\<`T`\>\> - -Open a table in the database. - -#### Type parameters - -| Name | -| :------ | -| `T` | - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `name` | `string` | The name of the table. 
| -| `embeddings` | [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)\<`T`\> | An embedding function to use on this Table | - -#### Returns - -`Promise`\<[`Table`](../interfaces/Table.md)\<`T`\>\> - -#### Implementation of - -Connection.openTable - -#### Defined in - -[index.ts:768](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L768) - -▸ **openTable**\<`T`\>(`name`, `embeddings?`): `Promise`\<[`Table`](../interfaces/Table.md)\<`T`\>\> - -#### Type parameters - -| Name | -| :------ | -| `T` | - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `name` | `string` | -| `embeddings?` | [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)\<`T`\> | - -#### Returns - -`Promise`\<[`Table`](../interfaces/Table.md)\<`T`\>\> - -#### Implementation of - -Connection.openTable - -#### Defined in - -[index.ts:772](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L772) - -___ - -### tableNames - -▸ **tableNames**(): `Promise`\<`string`[]\> - -Get the names of all tables in the database. - -#### Returns - -`Promise`\<`string`[]\> - -#### Implementation of - -[Connection](../interfaces/Connection.md).[tableNames](../interfaces/Connection.md#tablenames) - -#### Defined in - -[index.ts:751](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L751) - -___ - -### withMiddleware - -▸ **withMiddleware**(`middleware`): [`Connection`](../interfaces/Connection.md) - -Instrument the behavior of this Connection with middleware. - -The middleware will be called in the order they are added. - -Currently this functionality is only supported for remote Connections. 
- -#### Parameters - -| Name | Type | -| :------ | :------ | -| `middleware` | `HttpMiddleware` | - -#### Returns - -[`Connection`](../interfaces/Connection.md) - -- this Connection instrumented by the passed middleware - -#### Implementation of - -[Connection](../interfaces/Connection.md).[withMiddleware](../interfaces/Connection.md#withmiddleware) - -#### Defined in - -[index.ts:880](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L880) diff --git a/docs/src/javascript/classes/LocalTable.md b/docs/src/javascript/classes/LocalTable.md deleted file mode 100644 index 69b4a01c..00000000 --- a/docs/src/javascript/classes/LocalTable.md +++ /dev/null @@ -1,763 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / LocalTable - -# Class: LocalTable\ - -A LanceDB Table is the collection of Records. Each Record has one or more vector fields. - -## Type parameters - -| Name | Type | -| :------ | :------ | -| `T` | `number`[] | - -## Implements - -- [`Table`](../interfaces/Table.md)\<`T`\> - -## Table of contents - -### Constructors - -- [constructor](LocalTable.md#constructor) - -### Properties - -- [\_embeddings](LocalTable.md#_embeddings) -- [\_isElectron](LocalTable.md#_iselectron) -- [\_name](LocalTable.md#_name) -- [\_options](LocalTable.md#_options) -- [\_tbl](LocalTable.md#_tbl) -- [where](LocalTable.md#where) - -### Accessors - -- [name](LocalTable.md#name) -- [schema](LocalTable.md#schema) - -### Methods - -- [add](LocalTable.md#add) -- [addColumns](LocalTable.md#addcolumns) -- [alterColumns](LocalTable.md#altercolumns) -- [checkElectron](LocalTable.md#checkelectron) -- [cleanupOldVersions](LocalTable.md#cleanupoldversions) -- [compactFiles](LocalTable.md#compactfiles) -- [countRows](LocalTable.md#countrows) -- [createIndex](LocalTable.md#createindex) -- [createScalarIndex](LocalTable.md#createscalarindex) -- [delete](LocalTable.md#delete) -- [dropColumns](LocalTable.md#dropcolumns) -- [filter](LocalTable.md#filter) -- 
[getSchema](LocalTable.md#getschema) -- [indexStats](LocalTable.md#indexstats) -- [listIndices](LocalTable.md#listindices) -- [mergeInsert](LocalTable.md#mergeinsert) -- [overwrite](LocalTable.md#overwrite) -- [search](LocalTable.md#search) -- [update](LocalTable.md#update) -- [withMiddleware](LocalTable.md#withmiddleware) - -## Constructors - -### constructor - -• **new LocalTable**\<`T`\>(`tbl`, `name`, `options`) - -#### Type parameters - -| Name | Type | -| :------ | :------ | -| `T` | `number`[] | - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `tbl` | `any` | -| `name` | `string` | -| `options` | [`ConnectionOptions`](../interfaces/ConnectionOptions.md) | - -#### Defined in - -[index.ts:892](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L892) - -• **new LocalTable**\<`T`\>(`tbl`, `name`, `options`, `embeddings`) - -#### Type parameters - -| Name | Type | -| :------ | :------ | -| `T` | `number`[] | - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `tbl` | `any` | | -| `name` | `string` | | -| `options` | [`ConnectionOptions`](../interfaces/ConnectionOptions.md) | | -| `embeddings` | [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)\<`T`\> | An embedding function to use when interacting with this table | - -#### Defined in - -[index.ts:899](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L899) - -## Properties - -### \_embeddings - -• `Private` `Optional` `Readonly` **\_embeddings**: [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)\<`T`\> - -#### Defined in - -[index.ts:889](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L889) - -___ - -### \_isElectron - -• `Private` `Readonly` **\_isElectron**: `boolean` - -#### Defined in - -[index.ts:888](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L888) - -___ - -### \_name - -• `Private` `Readonly` **\_name**: `string` - -#### Defined in - 
-[index.ts:887](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L887) - -___ - -### \_options - -• `Private` `Readonly` **\_options**: () => [`ConnectionOptions`](../interfaces/ConnectionOptions.md) - -#### Type declaration - -▸ (): [`ConnectionOptions`](../interfaces/ConnectionOptions.md) - -##### Returns - -[`ConnectionOptions`](../interfaces/ConnectionOptions.md) - -#### Defined in - -[index.ts:890](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L890) - -___ - -### \_tbl - -• `Private` **\_tbl**: `any` - -#### Defined in - -[index.ts:886](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L886) - -___ - -### where - -• **where**: (`value`: `string`) => [`Query`](Query.md)\<`T`\> - -#### Type declaration - -▸ (`value`): [`Query`](Query.md)\<`T`\> - -Creates a filter query to find all rows matching the specified criteria - -##### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `value` | `string` | The filter criteria (like SQL where clause syntax) | - -##### Returns - -[`Query`](Query.md)\<`T`\> - -#### Defined in - -[index.ts:938](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L938) - -## Accessors - -### name - -• `get` **name**(): `string` - -#### Returns - -`string` - -#### Implementation of - -[Table](../interfaces/Table.md).[name](../interfaces/Table.md#name) - -#### Defined in - -[index.ts:918](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L918) - -___ - -### schema - -• `get` **schema**(): `Promise`\<`Schema`\<`any`\>\> - -#### Returns - -`Promise`\<`Schema`\<`any`\>\> - -#### Implementation of - -[Table](../interfaces/Table.md).[schema](../interfaces/Table.md#schema) - -#### Defined in - -[index.ts:1171](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1171) - -## Methods - -### add - -▸ **add**(`data`): `Promise`\<`number`\> - -Insert records into this Table. 
- -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `data` | `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[] | Records to be inserted into the Table | - -#### Returns - -`Promise`\<`number`\> - -The number of rows added to the table - -#### Implementation of - -[Table](../interfaces/Table.md).[add](../interfaces/Table.md#add) - -#### Defined in - -[index.ts:946](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L946) - -___ - -### addColumns - -▸ **addColumns**(`newColumnTransforms`): `Promise`\<`void`\> - -Add new columns with defined values. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `newColumnTransforms` | \{ `name`: `string` ; `valueSql`: `string` }[] | pairs of column names and the SQL expression to use to calculate the value of the new column. These expressions will be evaluated for each row in the table, and can reference existing columns in the table. | - -#### Returns - -`Promise`\<`void`\> - -#### Implementation of - -[Table](../interfaces/Table.md).[addColumns](../interfaces/Table.md#addcolumns) - -#### Defined in - -[index.ts:1195](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1195) - -___ - -### alterColumns - -▸ **alterColumns**(`columnAlterations`): `Promise`\<`void`\> - -Alter the name or nullability of columns. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `columnAlterations` | [`ColumnAlteration`](../interfaces/ColumnAlteration.md)[] | One or more alterations to apply to columns. 
| - -#### Returns - -`Promise`\<`void`\> - -#### Implementation of - -[Table](../interfaces/Table.md).[alterColumns](../interfaces/Table.md#altercolumns) - -#### Defined in - -[index.ts:1201](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1201) - -___ - -### checkElectron - -▸ `Private` **checkElectron**(): `boolean` - -#### Returns - -`boolean` - -#### Defined in - -[index.ts:1183](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1183) - -___ - -### cleanupOldVersions - -▸ **cleanupOldVersions**(`olderThan?`, `deleteUnverified?`): `Promise`\<[`CleanupStats`](../interfaces/CleanupStats.md)\> - -Clean up old versions of the table, freeing disk space. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `olderThan?` | `number` | The minimum age in minutes of the versions to delete. If not provided, defaults to two weeks. | -| `deleteUnverified?` | `boolean` | Because they may be part of an in-progress transaction, uncommitted files newer than 7 days old are not deleted by default. This means that failed transactions can leave around data that takes up disk space for up to 7 days. You can override this safety mechanism by setting this option to `true`, only if you promise there are no in progress writes while you run this operation. Failure to uphold this promise can lead to corrupted tables. | - -#### Returns - -`Promise`\<[`CleanupStats`](../interfaces/CleanupStats.md)\> - -#### Defined in - -[index.ts:1130](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1130) - -___ - -### compactFiles - -▸ **compactFiles**(`options?`): `Promise`\<[`CompactionMetrics`](../interfaces/CompactionMetrics.md)\> - -Run the compaction process on the table. - -This can be run after making several small appends to optimize the table -for faster reads. 
- -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `options?` | [`CompactionOptions`](../interfaces/CompactionOptions.md) | Advanced options configuring compaction. In most cases, you can omit this argument, as the default options are sensible for most tables. | - -#### Returns - -`Promise`\<[`CompactionMetrics`](../interfaces/CompactionMetrics.md)\> - -Metrics about the compaction operation. - -#### Defined in - -[index.ts:1153](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1153) - -___ - -### countRows - -▸ **countRows**(`filter?`): `Promise`\<`number`\> - -Returns the number of rows in this table. - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `filter?` | `string` | - -#### Returns - -`Promise`\<`number`\> - -#### Implementation of - -[Table](../interfaces/Table.md).[countRows](../interfaces/Table.md#countrows) - -#### Defined in - -[index.ts:1021](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1021) - -___ - -### createIndex - -▸ **createIndex**(`indexParams`): `Promise`\<`any`\> - -Create an ANN index on this Table vector index. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `indexParams` | [`IvfPQIndexConfig`](../interfaces/IvfPQIndexConfig.md) | The parameters of this Index, | - -#### Returns - -`Promise`\<`any`\> - -**`See`** - -VectorIndexParams.
- -#### Implementation of - -[Table](../interfaces/Table.md).[createIndex](../interfaces/Table.md#createindex) - -#### Defined in - -[index.ts:1003](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1003) - -___ - -### createScalarIndex - -▸ **createScalarIndex**(`column`, `replace?`): `Promise`\<`void`\> - -Create a scalar index on this Table for the given column - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `column` | `string` | The column to index | -| `replace?` | `boolean` | If false, fail if an index already exists on the column. It is always set to true for remote connections. Scalar indices, like vector indices, can be used to speed up scans. A scalar index can speed up scans that contain filter expressions on the indexed column. For example, the following scan will be faster if the column `my_col` has a scalar index: ```ts const con = await lancedb.connect('./.lancedb'); const table = await con.openTable('images'); const results = await table.where('my_col = 7').execute(); ``` Scalar indices can also speed up scans containing a vector search and a prefilter: ```ts const con = await lancedb.connect('././lancedb'); const table = await con.openTable('images'); const results = await table.search([1.0, 2.0]).where('my_col != 7').prefilter(true); ``` Scalar indices can only speed up scans for basic filters using equality, comparison, range (e.g. `my_col BETWEEN 0 AND 100`), and set membership (e.g. `my_col IN (0, 1, 2)`) Scalar indices can be used if the filter contains multiple indexed columns and the filter criteria are AND'd or OR'd together (e.g. `my_col < 0 AND other_col > 100`) Scalar indices may be used if the filter contains non-indexed columns but, depending on the structure of the filter, they may not be usable. For example, if the column `not_indexed` does not have a scalar index then the filter `my_col = 0 OR not_indexed = 1` will not be able to use any scalar index on `my_col`.
| - -#### Returns - -`Promise`\<`void`\> - -**`Examples`** - -```ts -const con = await lancedb.connect('././lancedb') -const table = await con.openTable('images') -await table.createScalarIndex('my_col') -``` - -#### Implementation of - -[Table](../interfaces/Table.md).[createScalarIndex](../interfaces/Table.md#createscalarindex) - -#### Defined in - -[index.ts:1011](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1011) - -___ - -### delete - -▸ **delete**(`filter`): `Promise`\<`void`\> - -Delete rows from this table. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `filter` | `string` | A filter in the same format used by a sql WHERE clause. | - -#### Returns - -`Promise`\<`void`\> - -#### Implementation of - -[Table](../interfaces/Table.md).[delete](../interfaces/Table.md#delete) - -#### Defined in - -[index.ts:1030](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1030) - -___ - -### dropColumns - -▸ **dropColumns**(`columnNames`): `Promise`\<`void`\> - -Drop one or more columns from the dataset - -This is a metadata-only operation and does not remove the data from the -underlying storage. In order to remove the data, you must subsequently -call ``compact_files`` to rewrite the data without the removed columns and -then call ``cleanup_files`` to remove the old files. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `columnNames` | `string`[] | The names of the columns to drop. These can be nested column references (e.g. "a.b.c") or top-level column names (e.g. "a"). 
| - -#### Returns - -`Promise`\<`void`\> - -#### Implementation of - -[Table](../interfaces/Table.md).[dropColumns](../interfaces/Table.md#dropcolumns) - -#### Defined in - -[index.ts:1205](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1205) - -___ - -### filter - -▸ **filter**(`value`): [`Query`](Query.md)\<`T`\> - -Creates a filter query to find all rows matching the specified criteria - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `value` | `string` | The filter criteria (like SQL where clause syntax) | - -#### Returns - -[`Query`](Query.md)\<`T`\> - -#### Implementation of - -[Table](../interfaces/Table.md).[filter](../interfaces/Table.md#filter) - -#### Defined in - -[index.ts:934](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L934) - -___ - -### getSchema - -▸ `Private` **getSchema**(): `Promise`\<`Schema`\<`any`\>\> - -#### Returns - -`Promise`\<`Schema`\<`any`\>\> - -#### Defined in - -[index.ts:1176](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1176) - -___ - -### indexStats - -▸ **indexStats**(`indexName`): `Promise`\<[`IndexStats`](../interfaces/IndexStats.md)\> - -Get statistics about an index. - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `indexName` | `string` | - -#### Returns - -`Promise`\<[`IndexStats`](../interfaces/IndexStats.md)\> - -#### Implementation of - -[Table](../interfaces/Table.md).[indexStats](../interfaces/Table.md#indexstats) - -#### Defined in - -[index.ts:1167](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1167) - -___ - -### listIndices - -▸ **listIndices**(): `Promise`\<[`VectorIndex`](../interfaces/VectorIndex.md)[]\> - -List the indices on this table.
- -#### Returns - -`Promise`\<[`VectorIndex`](../interfaces/VectorIndex.md)[]\> - -#### Implementation of - -[Table](../interfaces/Table.md).[listIndices](../interfaces/Table.md#listindices) - -#### Defined in - -[index.ts:1163](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1163) - -___ - -### mergeInsert - -▸ **mergeInsert**(`on`, `data`, `args`): `Promise`\<`void`\> - -Runs a "merge insert" operation on the table - -This operation can add rows, update rows, and remove rows all in a single -transaction. It is a very generic tool that can be used to create -behaviors like "insert if not exists", "update or insert (i.e. upsert)", -or even replace a portion of existing data with new data (e.g. replace -all data where month="january") - -The merge insert operation works by combining new data from a -**source table** with existing data in a **target table** by using a -join. There are three categories of records. - -"Matched" records are records that exist in both the source table and -the target table. "Not matched" records exist only in the source table -(e.g. these are new data) "Not matched by source" records exist only -in the target table (this is old data) - -The MergeInsertArgs can be used to customize what should happen for -each category of data. - -Please note that the data may appear to be reordered as part of this -operation. This is because updated rows will be deleted from the -dataset and then reinserted at the end with the new values. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `on` | `string` | a column to join on. This is how records from the source table and target table are matched. 
| -| `data` | `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[] | the new data to insert | -| `args` | [`MergeInsertArgs`](../interfaces/MergeInsertArgs.md) | parameters controlling how the operation should behave | - -#### Returns - -`Promise`\<`void`\> - -#### Implementation of - -[Table](../interfaces/Table.md).[mergeInsert](../interfaces/Table.md#mergeinsert) - -#### Defined in - -[index.ts:1065](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1065) - -___ - -### overwrite - -▸ **overwrite**(`data`): `Promise`\<`number`\> - -Insert records into this Table, replacing its contents. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `data` | `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[] | Records to be inserted into the Table | - -#### Returns - -`Promise`\<`number`\> - -The number of rows added to the table - -#### Implementation of - -[Table](../interfaces/Table.md).[overwrite](../interfaces/Table.md#overwrite) - -#### Defined in - -[index.ts:977](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L977) - -___ - -### search - -▸ **search**(`query`): [`Query`](Query.md)\<`T`\> - -Creates a search query to find the nearest neighbors of the given search term - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `query` | `T` | The query search term | - -#### Returns - -[`Query`](Query.md)\<`T`\> - -#### Implementation of - -[Table](../interfaces/Table.md).[search](../interfaces/Table.md#search) - -#### Defined in - -[index.ts:926](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L926) - -___ - -### update - -▸ **update**(`args`): `Promise`\<`void`\> - -Update rows in this table. 
- -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `args` | [`UpdateArgs`](../interfaces/UpdateArgs.md) \| [`UpdateSqlArgs`](../interfaces/UpdateSqlArgs.md) | see [UpdateArgs](../interfaces/UpdateArgs.md) and [UpdateSqlArgs](../interfaces/UpdateSqlArgs.md) for more details | - -#### Returns - -`Promise`\<`void`\> - -#### Implementation of - -[Table](../interfaces/Table.md).[update](../interfaces/Table.md#update) - -#### Defined in - -[index.ts:1043](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1043) - -___ - -### withMiddleware - -▸ **withMiddleware**(`middleware`): [`Table`](../interfaces/Table.md)\<`T`\> - -Instrument the behavior of this Table with middleware. - -The middleware will be called in the order they are added. - -Currently this functionality is only supported for remote tables. - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `middleware` | `HttpMiddleware` | - -#### Returns - -[`Table`](../interfaces/Table.md)\<`T`\> - -- this Table instrumented by the passed middleware - -#### Implementation of - -[Table](../interfaces/Table.md).[withMiddleware](../interfaces/Table.md#withmiddleware) - -#### Defined in - -[index.ts:1209](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1209) diff --git a/docs/src/javascript/classes/MakeArrowTableOptions.md b/docs/src/javascript/classes/MakeArrowTableOptions.md deleted file mode 100644 index 93863765..00000000 --- a/docs/src/javascript/classes/MakeArrowTableOptions.md +++ /dev/null @@ -1,82 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / MakeArrowTableOptions - -# Class: MakeArrowTableOptions - -Options to control the makeArrowTable call. 
- -## Table of contents - -### Constructors - -- [constructor](MakeArrowTableOptions.md#constructor) - -### Properties - -- [dictionaryEncodeStrings](MakeArrowTableOptions.md#dictionaryencodestrings) -- [embeddings](MakeArrowTableOptions.md#embeddings) -- [schema](MakeArrowTableOptions.md#schema) -- [vectorColumns](MakeArrowTableOptions.md#vectorcolumns) - -## Constructors - -### constructor - -• **new MakeArrowTableOptions**(`values?`) - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `values?` | `Partial`\<[`MakeArrowTableOptions`](MakeArrowTableOptions.md)\> | - -#### Defined in - -[arrow.ts:98](https://github.com/lancedb/lancedb/blob/92179835/node/src/arrow.ts#L98) - -## Properties - -### dictionaryEncodeStrings - -• **dictionaryEncodeStrings**: `boolean` = `false` - -If true then string columns will be encoded with dictionary encoding - -Set this to true if your string columns tend to repeat the same values -often. For more precise control use the `schema` property to specify the -data type for individual columns. - -If `schema` is provided then this property is ignored. 
- -#### Defined in - -[arrow.ts:96](https://github.com/lancedb/lancedb/blob/92179835/node/src/arrow.ts#L96) - -___ - -### embeddings - -• `Optional` **embeddings**: [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)\<`any`\> - -#### Defined in - -[arrow.ts:85](https://github.com/lancedb/lancedb/blob/92179835/node/src/arrow.ts#L85) - -___ - -### schema - -• `Optional` **schema**: `Schema`\<`any`\> - -#### Defined in - -[arrow.ts:63](https://github.com/lancedb/lancedb/blob/92179835/node/src/arrow.ts#L63) - -___ - -### vectorColumns - -• **vectorColumns**: `Record`\<`string`, `VectorColumnOptions`\> - -#### Defined in - -[arrow.ts:81](https://github.com/lancedb/lancedb/blob/92179835/node/src/arrow.ts#L81) diff --git a/docs/src/javascript/classes/OpenAIEmbeddingFunction.md b/docs/src/javascript/classes/OpenAIEmbeddingFunction.md deleted file mode 100644 index f152b0de..00000000 --- a/docs/src/javascript/classes/OpenAIEmbeddingFunction.md +++ /dev/null @@ -1,105 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / OpenAIEmbeddingFunction - -# Class: OpenAIEmbeddingFunction - -An embedding function that automatically creates vector representation for a given column. 
- -## Implements - -- [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)\<`string`\> - -## Table of contents - -### Constructors - -- [constructor](OpenAIEmbeddingFunction.md#constructor) - -### Properties - -- [\_modelName](OpenAIEmbeddingFunction.md#_modelname) -- [\_openai](OpenAIEmbeddingFunction.md#_openai) -- [sourceColumn](OpenAIEmbeddingFunction.md#sourcecolumn) - -### Methods - -- [embed](OpenAIEmbeddingFunction.md#embed) - -## Constructors - -### constructor - -• **new OpenAIEmbeddingFunction**(`sourceColumn`, `openAIKey`, `modelName?`) - -#### Parameters - -| Name | Type | Default value | -| :------ | :------ | :------ | -| `sourceColumn` | `string` | `undefined` | -| `openAIKey` | `string` | `undefined` | -| `modelName` | `string` | `'text-embedding-ada-002'` | - -#### Defined in - -[embedding/openai.ts:22](https://github.com/lancedb/lancedb/blob/92179835/node/src/embedding/openai.ts#L22) - -## Properties - -### \_modelName - -• `Private` `Readonly` **\_modelName**: `string` - -#### Defined in - -[embedding/openai.ts:20](https://github.com/lancedb/lancedb/blob/92179835/node/src/embedding/openai.ts#L20) - -___ - -### \_openai - -• `Private` `Readonly` **\_openai**: `OpenAI` - -#### Defined in - -[embedding/openai.ts:19](https://github.com/lancedb/lancedb/blob/92179835/node/src/embedding/openai.ts#L19) - -___ - -### sourceColumn - -• **sourceColumn**: `string` - -The name of the column that will be used as input for the Embedding Function. - -#### Implementation of - -[EmbeddingFunction](../interfaces/EmbeddingFunction.md).[sourceColumn](../interfaces/EmbeddingFunction.md#sourcecolumn) - -#### Defined in - -[embedding/openai.ts:56](https://github.com/lancedb/lancedb/blob/92179835/node/src/embedding/openai.ts#L56) - -## Methods - -### embed - -▸ **embed**(`data`): `Promise`\<`number`[][]\> - -Creates a vector representation for the given values. 
- -#### Parameters - -| Name | Type | -| :------ | :------ | -| `data` | `string`[] | - -#### Returns - -`Promise`\<`number`[][]\> - -#### Implementation of - -[EmbeddingFunction](../interfaces/EmbeddingFunction.md).[embed](../interfaces/EmbeddingFunction.md#embed) - -#### Defined in - -[embedding/openai.ts:43](https://github.com/lancedb/lancedb/blob/92179835/node/src/embedding/openai.ts#L43) diff --git a/docs/src/javascript/classes/Query.md b/docs/src/javascript/classes/Query.md deleted file mode 100644 index 08944e96..00000000 --- a/docs/src/javascript/classes/Query.md +++ /dev/null @@ -1,432 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / Query - -# Class: Query\<T\> - -A builder for nearest neighbor queries for LanceDB. - -## Type parameters - -| Name | Type | -| :------ | :------ | -| `T` | `number`[] | - -## Table of contents - -### Constructors - -- [constructor](Query.md#constructor) - -### Properties - -- [\_embeddings](Query.md#_embeddings) -- [\_fastSearch](Query.md#_fastsearch) -- [\_filter](Query.md#_filter) -- [\_limit](Query.md#_limit) -- [\_metricType](Query.md#_metrictype) -- [\_nprobes](Query.md#_nprobes) -- [\_prefilter](Query.md#_prefilter) -- [\_query](Query.md#_query) -- [\_queryVector](Query.md#_queryvector) -- [\_refineFactor](Query.md#_refinefactor) -- [\_select](Query.md#_select) -- [\_tbl](Query.md#_tbl) -- [where](Query.md#where) - -### Methods - -- [execute](Query.md#execute) -- [fastSearch](Query.md#fastsearch) -- [filter](Query.md#filter) -- [isElectron](Query.md#iselectron) -- [limit](Query.md#limit) -- [metricType](Query.md#metrictype) -- [nprobes](Query.md#nprobes) -- [prefilter](Query.md#prefilter) -- [refineFactor](Query.md#refinefactor) -- [select](Query.md#select) - -## Constructors - -### constructor - -• **new Query**\<`T`\>(`query?`, `tbl?`, `embeddings?`) - -#### Type parameters - -| Name | Type | -| :------ | :------ | -| `T` | `number`[] | - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `query?`
| `T` | -| `tbl?` | `any` | -| `embeddings?` | [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)\<`T`\> | - -#### Defined in - -[query.ts:39](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L39) - -## Properties - -### \_embeddings - -• `Protected` `Optional` `Readonly` **\_embeddings**: [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)\<`T`\> - -#### Defined in - -[query.ts:37](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L37) - -___ - -### \_fastSearch - -• `Private` **\_fastSearch**: `boolean` - -#### Defined in - -[query.ts:36](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L36) - -___ - -### \_filter - -• `Private` `Optional` **\_filter**: `string` - -#### Defined in - -[query.ts:33](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L33) - -___ - -### \_limit - -• `Private` `Optional` **\_limit**: `number` - -#### Defined in - -[query.ts:29](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L29) - -___ - -### \_metricType - -• `Private` `Optional` **\_metricType**: [`MetricType`](../enums/MetricType.md) - -#### Defined in - -[query.ts:34](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L34) - -___ - -### \_nprobes - -• `Private` **\_nprobes**: `number` - -#### Defined in - -[query.ts:31](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L31) - -___ - -### \_prefilter - -• `Private` **\_prefilter**: `boolean` - -#### Defined in - -[query.ts:35](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L35) - -___ - -### \_query - -• `Private` `Optional` `Readonly` **\_query**: `T` - -#### Defined in - -[query.ts:26](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L26) - -___ - -### \_queryVector - -• `Private` `Optional` **\_queryVector**: `number`[] - -#### Defined in - -[query.ts:28](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L28) - -___ - -### \_refineFactor - -• 
`Private` `Optional` **\_refineFactor**: `number` - -#### Defined in - -[query.ts:30](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L30) - -___ - -### \_select - -• `Private` `Optional` **\_select**: `string`[] - -#### Defined in - -[query.ts:32](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L32) - -___ - -### \_tbl - -• `Private` `Optional` `Readonly` **\_tbl**: `any` - -#### Defined in - -[query.ts:27](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L27) - -___ - -### where - -• **where**: (`value`: `string`) => [`Query`](Query.md)\<`T`\> - -#### Type declaration - -▸ (`value`): [`Query`](Query.md)\<`T`\> - -A filter statement to be applied to this query. - -##### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `value` | `string` | A filter in the same format used by a sql WHERE clause. | - -##### Returns - -[`Query`](Query.md)\<`T`\> - -#### Defined in - -[query.ts:90](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L90) - -## Methods - -### execute - -▸ **execute**\<`T`\>(): `Promise`\<`T`[]\> - -Execute the query and return the results as an Array of Objects - -#### Type parameters - -| Name | Type | -| :------ | :------ | -| `T` | `Record`\<`string`, `unknown`\> | - -#### Returns - -`Promise`\<`T`[]\> - -#### Defined in - -[query.ts:127](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L127) - -___ - -### fastSearch - -▸ **fastSearch**(`value`): [`Query`](Query.md)\<`T`\> - -Skip searching un-indexed data. This can make search faster, but will miss -any data that is not yet indexed. 
- -#### Parameters - -| Name | Type | -| :------ | :------ | -| `value` | `boolean` | - -#### Returns - -[`Query`](Query.md)\<`T`\> - -#### Defined in - -[query.ts:119](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L119) - -___ - -### filter - -▸ **filter**(`value`): [`Query`](Query.md)\<`T`\> - -A filter statement to be applied to this query. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `value` | `string` | A filter in the same format used by a sql WHERE clause. | - -#### Returns - -[`Query`](Query.md)\<`T`\> - -#### Defined in - -[query.ts:85](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L85) - -___ - -### isElectron - -▸ `Private` **isElectron**(): `boolean` - -#### Returns - -`boolean` - -#### Defined in - -[query.ts:155](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L155) - -___ - -### limit - -▸ **limit**(`value`): [`Query`](Query.md)\<`T`\> - -Sets the number of results that will be returned -default value is 10 - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `value` | `number` | number of results | - -#### Returns - -[`Query`](Query.md)\<`T`\> - -#### Defined in - -[query.ts:58](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L58) - -___ - -### metricType - -▸ **metricType**(`value`): [`Query`](Query.md)\<`T`\> - -The MetricType used for this Query. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `value` | [`MetricType`](../enums/MetricType.md) | The metric to use. | - -#### Returns - -[`Query`](Query.md)\<`T`\> - -**`See`** - -MetricType for the different options - -#### Defined in - -[query.ts:105](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L105) - -___ - -### nprobes - -▸ **nprobes**(`value`): [`Query`](Query.md)\<`T`\> - -The number of probes used. A higher number makes search more accurate but also slower.
- -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `value` | `number` | The number of probes used. | - -#### Returns - -[`Query`](Query.md)\<`T`\> - -#### Defined in - -[query.ts:76](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L76) - -___ - -### prefilter - -▸ **prefilter**(`value`): [`Query`](Query.md)\<`T`\> - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `value` | `boolean` | - -#### Returns - -[`Query`](Query.md)\<`T`\> - -#### Defined in - -[query.ts:110](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L110) - -___ - -### refineFactor - -▸ **refineFactor**(`value`): [`Query`](Query.md)\<`T`\> - -Refine the results by reading extra elements and re-ranking them in memory. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `value` | `number` | refine factor to use in this query. | - -#### Returns - -[`Query`](Query.md)\<`T`\> - -#### Defined in - -[query.ts:67](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L67) - -___ - -### select - -▸ **select**(`value`): [`Query`](Query.md)\<`T`\> - -Return only the specified columns. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `value` | `string`[] | Only select the specified columns. If not specified, all columns will be returned. 
| - -#### Returns - -[`Query`](Query.md)\<`T`\> - -#### Defined in - -[query.ts:96](https://github.com/lancedb/lancedb/blob/92179835/node/src/query.ts#L96) diff --git a/docs/src/javascript/enums/IndexStatus.md b/docs/src/javascript/enums/IndexStatus.md deleted file mode 100644 index 4cbd3f8e..00000000 --- a/docs/src/javascript/enums/IndexStatus.md +++ /dev/null @@ -1,52 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / IndexStatus - -# Enumeration: IndexStatus - -## Table of contents - -### Enumeration Members - -- [Done](IndexStatus.md#done) -- [Failed](IndexStatus.md#failed) -- [Indexing](IndexStatus.md#indexing) -- [Pending](IndexStatus.md#pending) - -## Enumeration Members - -### Done - -• **Done** = ``"done"`` - -#### Defined in - -[index.ts:713](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L713) - -___ - -### Failed - -• **Failed** = ``"failed"`` - -#### Defined in - -[index.ts:714](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L714) - -___ - -### Indexing - -• **Indexing** = ``"indexing"`` - -#### Defined in - -[index.ts:712](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L712) - -___ - -### Pending - -• **Pending** = ``"pending"`` - -#### Defined in - -[index.ts:711](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L711) diff --git a/docs/src/javascript/enums/MetricType.md b/docs/src/javascript/enums/MetricType.md deleted file mode 100644 index 8c1300b2..00000000 --- a/docs/src/javascript/enums/MetricType.md +++ /dev/null @@ -1,49 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / MetricType - -# Enumeration: MetricType - -Distance metrics type. 
- -## Table of contents - -### Enumeration Members - -- [Cosine](MetricType.md#cosine) -- [Dot](MetricType.md#dot) -- [L2](MetricType.md#l2) - -## Enumeration Members - -### Cosine - -• **Cosine** = ``"cosine"`` - -Cosine distance - -#### Defined in - -[index.ts:1381](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1381) - -___ - -### Dot - -• **Dot** = ``"dot"`` - -Dot product - -#### Defined in - -[index.ts:1386](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1386) - -___ - -### L2 - -• **L2** = ``"l2"`` - -Euclidean distance - -#### Defined in - -[index.ts:1376](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1376) diff --git a/docs/src/javascript/enums/WriteMode.md b/docs/src/javascript/enums/WriteMode.md deleted file mode 100644 index 42b0c8bf..00000000 --- a/docs/src/javascript/enums/WriteMode.md +++ /dev/null @@ -1,49 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / WriteMode - -# Enumeration: WriteMode - -Write mode for writing a table. - -## Table of contents - -### Enumeration Members - -- [Append](WriteMode.md#append) -- [Create](WriteMode.md#create) -- [Overwrite](WriteMode.md#overwrite) - -## Enumeration Members - -### Append - -• **Append** = ``"append"`` - -Append new data to the table. - -#### Defined in - -[index.ts:1347](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1347) - -___ - -### Create - -• **Create** = ``"create"`` - -Create a new [Table](../interfaces/Table.md). - -#### Defined in - -[index.ts:1343](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1343) - -___ - -### Overwrite - -• **Overwrite** = ``"overwrite"`` - -Overwrite the existing [Table](../interfaces/Table.md) if present.
- -#### Defined in - -[index.ts:1345](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1345) diff --git a/docs/src/javascript/interfaces/AwsCredentials.md b/docs/src/javascript/interfaces/AwsCredentials.md deleted file mode 100644 index 68e31a83..00000000 --- a/docs/src/javascript/interfaces/AwsCredentials.md +++ /dev/null @@ -1,41 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / AwsCredentials - -# Interface: AwsCredentials - -## Table of contents - -### Properties - -- [accessKeyId](AwsCredentials.md#accesskeyid) -- [secretKey](AwsCredentials.md#secretkey) -- [sessionToken](AwsCredentials.md#sessiontoken) - -## Properties - -### accessKeyId - -• **accessKeyId**: `string` - -#### Defined in - -[index.ts:68](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L68) - -___ - -### secretKey - -• **secretKey**: `string` - -#### Defined in - -[index.ts:70](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L70) - -___ - -### sessionToken - -• `Optional` **sessionToken**: `string` - -#### Defined in - -[index.ts:72](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L72) diff --git a/docs/src/javascript/interfaces/CleanupStats.md b/docs/src/javascript/interfaces/CleanupStats.md deleted file mode 100644 index f76e0889..00000000 --- a/docs/src/javascript/interfaces/CleanupStats.md +++ /dev/null @@ -1,34 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / CleanupStats - -# Interface: CleanupStats - -## Table of contents - -### Properties - -- [bytesRemoved](CleanupStats.md#bytesremoved) -- [oldVersions](CleanupStats.md#oldversions) - -## Properties - -### bytesRemoved - -• **bytesRemoved**: `number` - -The number of bytes removed from disk. - -#### Defined in - -[index.ts:1218](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1218) - -___ - -### oldVersions - -• **oldVersions**: `number` - -The number of old table versions removed. 
- -#### Defined in - -[index.ts:1222](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1222) diff --git a/docs/src/javascript/interfaces/ColumnAlteration.md b/docs/src/javascript/interfaces/ColumnAlteration.md deleted file mode 100644 index 88eaa827..00000000 --- a/docs/src/javascript/interfaces/ColumnAlteration.md +++ /dev/null @@ -1,53 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / ColumnAlteration - -# Interface: ColumnAlteration - -A definition of a column alteration. The alteration changes the column at -`path` to have the new name `name`, to be nullable if `nullable` is true, -and to have the data type `data_type`. At least one of `rename` or `nullable` -must be provided. - -## Table of contents - -### Properties - -- [nullable](ColumnAlteration.md#nullable) -- [path](ColumnAlteration.md#path) -- [rename](ColumnAlteration.md#rename) - -## Properties - -### nullable - -• `Optional` **nullable**: `boolean` - -Set the new nullability. Note that a nullable column cannot be made non-nullable. - -#### Defined in - -[index.ts:638](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L638) - -___ - -### path - -• **path**: `string` - -The path to the column to alter. This is a dot-separated path to the column. -If it is a top-level column then it is just the name of the column. If it is -a nested column then it is the path to the column, e.g. "a.b.c" for a column -`c` nested inside a column `b` nested inside a column `a`. 
- -#### Defined in - -[index.ts:633](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L633) - -___ - -### rename - -• `Optional` **rename**: `string` - -#### Defined in - -[index.ts:634](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L634) diff --git a/docs/src/javascript/interfaces/CompactionMetrics.md b/docs/src/javascript/interfaces/CompactionMetrics.md deleted file mode 100644 index 0f98591b..00000000 --- a/docs/src/javascript/interfaces/CompactionMetrics.md +++ /dev/null @@ -1,62 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / CompactionMetrics - -# Interface: CompactionMetrics - -## Table of contents - -### Properties - -- [filesAdded](CompactionMetrics.md#filesadded) -- [filesRemoved](CompactionMetrics.md#filesremoved) -- [fragmentsAdded](CompactionMetrics.md#fragmentsadded) -- [fragmentsRemoved](CompactionMetrics.md#fragmentsremoved) - -## Properties - -### filesAdded - -• **filesAdded**: `number` - -The number of files added. This is typically equal to the number of -fragments added. - -#### Defined in - -[index.ts:1273](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1273) - -___ - -### filesRemoved - -• **filesRemoved**: `number` - -The number of files that were removed. Each fragment may have more than one -file. - -#### Defined in - -[index.ts:1268](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1268) - -___ - -### fragmentsAdded - -• **fragmentsAdded**: `number` - -The number of new fragments that were created. - -#### Defined in - -[index.ts:1263](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1263) - -___ - -### fragmentsRemoved - -• **fragmentsRemoved**: `number` - -The number of fragments that were removed. 
- -#### Defined in - -[index.ts:1259](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1259) diff --git a/docs/src/javascript/interfaces/CompactionOptions.md b/docs/src/javascript/interfaces/CompactionOptions.md deleted file mode 100644 index 7da8e492..00000000 --- a/docs/src/javascript/interfaces/CompactionOptions.md +++ /dev/null @@ -1,80 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / CompactionOptions - -# Interface: CompactionOptions - -## Table of contents - -### Properties - -- [materializeDeletions](CompactionOptions.md#materializedeletions) -- [materializeDeletionsThreshold](CompactionOptions.md#materializedeletionsthreshold) -- [maxRowsPerGroup](CompactionOptions.md#maxrowspergroup) -- [numThreads](CompactionOptions.md#numthreads) -- [targetRowsPerFragment](CompactionOptions.md#targetrowsperfragment) - -## Properties - -### materializeDeletions - -• `Optional` **materializeDeletions**: `boolean` - -If true, fragments that have rows that are deleted may be compacted to -remove the deleted rows. This can improve the performance of queries. -Default is true. - -#### Defined in - -[index.ts:1241](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1241) - -___ - -### materializeDeletionsThreshold - -• `Optional` **materializeDeletionsThreshold**: `number` - -A number between 0 and 1, representing the proportion of rows that must be -marked deleted before a fragment is a candidate for compaction to remove -the deleted rows. Default is 10%. - -#### Defined in - -[index.ts:1247](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1247) - -___ - -### maxRowsPerGroup - -• `Optional` **maxRowsPerGroup**: `number` - -The maximum number of T per group. Defaults to 1024. - -#### Defined in - -[index.ts:1235](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1235) - -___ - -### numThreads - -• `Optional` **numThreads**: `number` - -The number of threads to use for compaction. 
If not provided, defaults to -the number of cores on the machine. - -#### Defined in - -[index.ts:1252](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1252) - -___ - -### targetRowsPerFragment - -• `Optional` **targetRowsPerFragment**: `number` - -The number of rows per fragment to target. Fragments that have fewer rows -will be compacted into adjacent fragments to produce larger fragments. -Defaults to 1024 * 1024. - -#### Defined in - -[index.ts:1231](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1231) diff --git a/docs/src/javascript/interfaces/Connection.md b/docs/src/javascript/interfaces/Connection.md deleted file mode 100644 index c7a7b24a..00000000 --- a/docs/src/javascript/interfaces/Connection.md +++ /dev/null @@ -1,248 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / Connection - -# Interface: Connection - -A LanceDB Connection that allows you to open tables and create new ones. - -Connection could be local against filesystem or remote against a server. - -## Implemented by - -- [`LocalConnection`](../classes/LocalConnection.md) - -## Table of contents - -### Properties - -- [uri](Connection.md#uri) - -### Methods - -- [createTable](Connection.md#createtable) -- [dropTable](Connection.md#droptable) -- [openTable](Connection.md#opentable) -- [tableNames](Connection.md#tablenames) -- [withMiddleware](Connection.md#withmiddleware) - -## Properties - -### uri - -• **uri**: `string` - -#### Defined in - -[index.ts:261](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L261) - -## Methods - -### createTable - -▸ **createTable**\<`T`\>(`«destructured»`): `Promise`\<[`Table`](Table.md)\<`T`\>\> - -Creates a new Table, optionally initializing it with new data. 
- -#### Type parameters - -| Name | -| :------ | -| `T` | - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `«destructured»` | [`CreateTableOptions`](CreateTableOptions.md)\<`T`\> | - -#### Returns - -`Promise`\<[`Table`](Table.md)\<`T`\>\> - -#### Defined in - -[index.ts:285](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L285) - -▸ **createTable**(`name`, `data`): `Promise`\<[`Table`](Table.md)\<`number`[]\>\> - -Creates a new Table and initialize it with new data. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `name` | `string` | The name of the table. | -| `data` | `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[] | Non-empty Array of Records to be inserted into the table | - -#### Returns - -`Promise`\<[`Table`](Table.md)\<`number`[]\>\> - -#### Defined in - -[index.ts:299](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L299) - -▸ **createTable**(`name`, `data`, `options`): `Promise`\<[`Table`](Table.md)\<`number`[]\>\> - -Creates a new Table and initialize it with new data. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `name` | `string` | The name of the table. | -| `data` | `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[] | Non-empty Array of Records to be inserted into the table | -| `options` | [`WriteOptions`](WriteOptions.md) | The write options to use when creating the table. | - -#### Returns - -`Promise`\<[`Table`](Table.md)\<`number`[]\>\> - -#### Defined in - -[index.ts:311](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L311) - -▸ **createTable**\<`T`\>(`name`, `data`, `embeddings`): `Promise`\<[`Table`](Table.md)\<`T`\>\> - -Creates a new Table and initialize it with new data. - -#### Type parameters - -| Name | -| :------ | -| `T` | - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `name` | `string` | The name of the table. 
| -| `data` | `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[] | Non-empty Array of Records to be inserted into the table | -| `embeddings` | [`EmbeddingFunction`](EmbeddingFunction.md)\<`T`\> | An embedding function to use on this table | - -#### Returns - -`Promise`\<[`Table`](Table.md)\<`T`\>\> - -#### Defined in - -[index.ts:324](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L324) - -▸ **createTable**\<`T`\>(`name`, `data`, `embeddings`, `options`): `Promise`\<[`Table`](Table.md)\<`T`\>\> - -Creates a new Table and initialize it with new data. - -#### Type parameters - -| Name | -| :------ | -| `T` | - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `name` | `string` | The name of the table. | -| `data` | `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[] | Non-empty Array of Records to be inserted into the table | -| `embeddings` | [`EmbeddingFunction`](EmbeddingFunction.md)\<`T`\> | An embedding function to use on this table | -| `options` | [`WriteOptions`](WriteOptions.md) | The write options to use when creating the table. | - -#### Returns - -`Promise`\<[`Table`](Table.md)\<`T`\>\> - -#### Defined in - -[index.ts:337](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L337) - -___ - -### dropTable - -▸ **dropTable**(`name`): `Promise`\<`void`\> - -Drop an existing table. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `name` | `string` | The name of the table to drop. | - -#### Returns - -`Promise`\<`void`\> - -#### Defined in - -[index.ts:348](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L348) - -___ - -### openTable - -▸ **openTable**\<`T`\>(`name`, `embeddings?`): `Promise`\<[`Table`](Table.md)\<`T`\>\> - -Open a table in the database. 
- -#### Type parameters - -| Name | -| :------ | -| `T` | - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `name` | `string` | The name of the table. | -| `embeddings?` | [`EmbeddingFunction`](EmbeddingFunction.md)\<`T`\> | An embedding function to use on this table | - -#### Returns - -`Promise`\<[`Table`](Table.md)\<`T`\>\> - -#### Defined in - -[index.ts:271](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L271) - -___ - -### tableNames - -▸ **tableNames**(): `Promise`\<`string`[]\> - -#### Returns - -`Promise`\<`string`[]\> - -#### Defined in - -[index.ts:263](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L263) - -___ - -### withMiddleware - -▸ **withMiddleware**(`middleware`): [`Connection`](Connection.md) - -Instrument the behavior of this Connection with middleware. - -The middleware will be called in the order they are added. - -Currently this functionality is only supported for remote Connections. - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `middleware` | `HttpMiddleware` | - -#### Returns - -[`Connection`](Connection.md) - -- this Connection instrumented by the passed middleware - -#### Defined in - -[index.ts:360](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L360) diff --git a/docs/src/javascript/interfaces/ConnectionOptions.md b/docs/src/javascript/interfaces/ConnectionOptions.md deleted file mode 100644 index 92bf594a..00000000 --- a/docs/src/javascript/interfaces/ConnectionOptions.md +++ /dev/null @@ -1,154 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / ConnectionOptions - -# Interface: ConnectionOptions - -## Table of contents - -### Properties - -- [apiKey](ConnectionOptions.md#apikey) -- [awsCredentials](ConnectionOptions.md#awscredentials) -- [awsRegion](ConnectionOptions.md#awsregion) -- [hostOverride](ConnectionOptions.md#hostoverride) -- [readConsistencyInterval](ConnectionOptions.md#readconsistencyinterval) 
-- [region](ConnectionOptions.md#region) -- [storageOptions](ConnectionOptions.md#storageoptions) -- [timeout](ConnectionOptions.md#timeout) -- [uri](ConnectionOptions.md#uri) - -## Properties - -### apiKey - -• `Optional` **apiKey**: `string` - -API key for the remote connections - -Can also be passed by setting environment variable `LANCEDB_API_KEY` - -#### Defined in - -[index.ts:112](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L112) - -___ - -### awsCredentials - -• `Optional` **awsCredentials**: [`AwsCredentials`](AwsCredentials.md) - -User provided AWS crednetials. - -If not provided, LanceDB will use the default credentials provider chain. - -**`Deprecated`** - -Pass `aws_access_key_id`, `aws_secret_access_key`, and `aws_session_token` -through `storageOptions` instead. - -#### Defined in - -[index.ts:92](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L92) - -___ - -### awsRegion - -• `Optional` **awsRegion**: `string` - -AWS region to connect to. Default is defaultAwsRegion - -**`Deprecated`** - -Pass `region` through `storageOptions` instead. - -#### Defined in - -[index.ts:98](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L98) - -___ - -### hostOverride - -• `Optional` **hostOverride**: `string` - -Override the host URL for the remote connection. - -This is useful for local testing. - -#### Defined in - -[index.ts:122](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L122) - -___ - -### readConsistencyInterval - -• `Optional` **readConsistencyInterval**: `number` - -(For LanceDB OSS only): The interval, in seconds, at which to check for -updates to the table from other processes. If None, then consistency is not -checked. For performance reasons, this is the default. For strong -consistency, set this to zero seconds. Then every read will check for -updates from other processes. As a compromise, you can set this to a -non-zero value for eventual consistency. 
If more than that interval -has passed since the last check, then the table will be checked for updates. -Note: this consistency only applies to read operations. Write operations are -always consistent. - -#### Defined in - -[index.ts:140](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L140) - -___ - -### region - -• `Optional` **region**: `string` - -Region to connect. Default is 'us-east-1' - -#### Defined in - -[index.ts:115](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L115) - -___ - -### storageOptions - -• `Optional` **storageOptions**: `Record`\<`string`, `string`\> - -User provided options for object storage. For example, S3 credentials or request timeouts. - -The various options are described at https://lancedb.github.io/lancedb/guides/storage/ - -#### Defined in - -[index.ts:105](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L105) - -___ - -### timeout - -• `Optional` **timeout**: `number` - -Duration in milliseconds for request timeout. Default = 10,000 (10 seconds) - -#### Defined in - -[index.ts:127](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L127) - -___ - -### uri - -• **uri**: `string` - -LanceDB database URI. 
- -- `/path/to/database` - local database -- `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud storage -- `db://host:port` - remote database (LanceDB cloud) - -#### Defined in - -[index.ts:83](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L83) diff --git a/docs/src/javascript/interfaces/CreateTableOptions.md b/docs/src/javascript/interfaces/CreateTableOptions.md deleted file mode 100644 index 2c933fdc..00000000 --- a/docs/src/javascript/interfaces/CreateTableOptions.md +++ /dev/null @@ -1,69 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / CreateTableOptions - -# Interface: CreateTableOptions\ - -## Type parameters - -| Name | -| :------ | -| `T` | - -## Table of contents - -### Properties - -- [data](CreateTableOptions.md#data) -- [embeddingFunction](CreateTableOptions.md#embeddingfunction) -- [name](CreateTableOptions.md#name) -- [schema](CreateTableOptions.md#schema) -- [writeOptions](CreateTableOptions.md#writeoptions) - -## Properties - -### data - -• `Optional` **data**: `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[] - -#### Defined in - -[index.ts:163](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L163) - -___ - -### embeddingFunction - -• `Optional` **embeddingFunction**: [`EmbeddingFunction`](EmbeddingFunction.md)\<`T`\> - -#### Defined in - -[index.ts:169](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L169) - -___ - -### name - -• **name**: `string` - -#### Defined in - -[index.ts:160](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L160) - -___ - -### schema - -• `Optional` **schema**: `Schema`\<`any`\> - -#### Defined in - -[index.ts:166](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L166) - -___ - -### writeOptions - -• `Optional` **writeOptions**: [`WriteOptions`](WriteOptions.md) - -#### Defined in - -[index.ts:172](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L172) diff 
--git a/docs/src/javascript/interfaces/EmbeddingFunction.md b/docs/src/javascript/interfaces/EmbeddingFunction.md deleted file mode 100644 index 8249a4ab..00000000 --- a/docs/src/javascript/interfaces/EmbeddingFunction.md +++ /dev/null @@ -1,125 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / EmbeddingFunction - -# Interface: EmbeddingFunction\ - -An embedding function that automatically creates vector representation for a given column. - -## Type parameters - -| Name | -| :------ | -| `T` | - -## Implemented by - -- [`OpenAIEmbeddingFunction`](../classes/OpenAIEmbeddingFunction.md) - -## Table of contents - -### Properties - -- [destColumn](EmbeddingFunction.md#destcolumn) -- [embed](EmbeddingFunction.md#embed) -- [embeddingDataType](EmbeddingFunction.md#embeddingdatatype) -- [embeddingDimension](EmbeddingFunction.md#embeddingdimension) -- [excludeSource](EmbeddingFunction.md#excludesource) -- [sourceColumn](EmbeddingFunction.md#sourcecolumn) - -## Properties - -### destColumn - -• `Optional` **destColumn**: `string` - -The name of the column that will contain the embedding - -By default this is "vector" - -#### Defined in - -[embedding/embedding_function.ts:49](https://github.com/lancedb/lancedb/blob/92179835/node/src/embedding/embedding_function.ts#L49) - -___ - -### embed - -• **embed**: (`data`: `T`[]) => `Promise`\<`number`[][]\> - -#### Type declaration - -▸ (`data`): `Promise`\<`number`[][]\> - -Creates a vector representation for the given values. - -##### Parameters - -| Name | Type | -| :------ | :------ | -| `data` | `T`[] | - -##### Returns - -`Promise`\<`number`[][]\> - -#### Defined in - -[embedding/embedding_function.ts:62](https://github.com/lancedb/lancedb/blob/92179835/node/src/embedding/embedding_function.ts#L62) - -___ - -### embeddingDataType - -• `Optional` **embeddingDataType**: `Float`\<`Floats`\> - -The data type of the embedding - -The embedding function should return `number`. 
This will be converted into -an Arrow float array. By default this will be Float32 but this property can -be used to control the conversion. - -#### Defined in - -[embedding/embedding_function.ts:33](https://github.com/lancedb/lancedb/blob/92179835/node/src/embedding/embedding_function.ts#L33) - -___ - -### embeddingDimension - -• `Optional` **embeddingDimension**: `number` - -The dimension of the embedding - -This is optional, normally this can be determined by looking at the results of -`embed`. If this is not specified, and there is an attempt to apply the embedding -to an empty table, then that process will fail. - -#### Defined in - -[embedding/embedding_function.ts:42](https://github.com/lancedb/lancedb/blob/92179835/node/src/embedding/embedding_function.ts#L42) - -___ - -### excludeSource - -• `Optional` **excludeSource**: `boolean` - -Should the source column be excluded from the resulting table - -By default the source column is included. Set this to true and -only the embedding will be stored. - -#### Defined in - -[embedding/embedding_function.ts:57](https://github.com/lancedb/lancedb/blob/92179835/node/src/embedding/embedding_function.ts#L57) - -___ - -### sourceColumn - -• **sourceColumn**: `string` - -The name of the column that will be used as input for the Embedding Function. 
- -#### Defined in - -[embedding/embedding_function.ts:24](https://github.com/lancedb/lancedb/blob/92179835/node/src/embedding/embedding_function.ts#L24) diff --git a/docs/src/javascript/interfaces/IndexStats.md b/docs/src/javascript/interfaces/IndexStats.md deleted file mode 100644 index 8044b3df..00000000 --- a/docs/src/javascript/interfaces/IndexStats.md +++ /dev/null @@ -1,63 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / IndexStats - -# Interface: IndexStats - -## Table of contents - -### Properties - -- [distanceType](IndexStats.md#distancetype) -- [indexType](IndexStats.md#indextype) -- [numIndexedRows](IndexStats.md#numindexedrows) -- [numIndices](IndexStats.md#numindices) -- [numUnindexedRows](IndexStats.md#numunindexedrows) - -## Properties - -### distanceType - -• `Optional` **distanceType**: `string` - -#### Defined in - -[index.ts:728](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L728) - -___ - -### indexType - -• **indexType**: `string` - -#### Defined in - -[index.ts:727](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L727) - -___ - -### numIndexedRows - -• **numIndexedRows**: ``null`` \| `number` - -#### Defined in - -[index.ts:725](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L725) - -___ - -### numIndices - -• `Optional` **numIndices**: `number` - -#### Defined in - -[index.ts:729](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L729) - -___ - -### numUnindexedRows - -• **numUnindexedRows**: ``null`` \| `number` - -#### Defined in - -[index.ts:726](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L726) diff --git a/docs/src/javascript/interfaces/IvfPQIndexConfig.md b/docs/src/javascript/interfaces/IvfPQIndexConfig.md deleted file mode 100644 index 526787cc..00000000 --- a/docs/src/javascript/interfaces/IvfPQIndexConfig.md +++ /dev/null @@ -1,162 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / IvfPQIndexConfig - -# 
Interface: IvfPQIndexConfig - -## Table of contents - -### Properties - -- [column](IvfPQIndexConfig.md#column) -- [index\_cache\_size](IvfPQIndexConfig.md#index_cache_size) -- [index\_name](IvfPQIndexConfig.md#index_name) -- [max\_iters](IvfPQIndexConfig.md#max_iters) -- [max\_opq\_iters](IvfPQIndexConfig.md#max_opq_iters) -- [metric\_type](IvfPQIndexConfig.md#metric_type) -- [num\_bits](IvfPQIndexConfig.md#num_bits) -- [num\_partitions](IvfPQIndexConfig.md#num_partitions) -- [num\_sub\_vectors](IvfPQIndexConfig.md#num_sub_vectors) -- [replace](IvfPQIndexConfig.md#replace) -- [type](IvfPQIndexConfig.md#type) -- [use\_opq](IvfPQIndexConfig.md#use_opq) - -## Properties - -### column - -• `Optional` **column**: `string` - -The column to be indexed - -#### Defined in - -[index.ts:1282](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1282) - -___ - -### index\_cache\_size - -• `Optional` **index\_cache\_size**: `number` - -Cache size of the index - -#### Defined in - -[index.ts:1331](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1331) - -___ - -### index\_name - -• `Optional` **index\_name**: `string` - -A unique name for the index - -#### Defined in - -[index.ts:1287](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1287) - -___ - -### max\_iters - -• `Optional` **max\_iters**: `number` - -The max number of iterations for kmeans training. - -#### Defined in - -[index.ts:1302](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1302) - -___ - -### max\_opq\_iters - -• `Optional` **max\_opq\_iters**: `number` - -Max number of iterations to train OPQ, if `use_opq` is true. 
- -#### Defined in - -[index.ts:1321](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1321) - -___ - -### metric\_type - -• `Optional` **metric\_type**: [`MetricType`](../enums/MetricType.md) - -Metric type, l2 or Cosine - -#### Defined in - -[index.ts:1292](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1292) - -___ - -### num\_bits - -• `Optional` **num\_bits**: `number` - -The number of bits to present one PQ centroid. - -#### Defined in - -[index.ts:1316](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1316) - -___ - -### num\_partitions - -• `Optional` **num\_partitions**: `number` - -The number of partitions this index - -#### Defined in - -[index.ts:1297](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1297) - -___ - -### num\_sub\_vectors - -• `Optional` **num\_sub\_vectors**: `number` - -Number of subvectors to build PQ code - -#### Defined in - -[index.ts:1312](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1312) - -___ - -### replace - -• `Optional` **replace**: `boolean` - -Replace an existing index with the same name if it exists. - -#### Defined in - -[index.ts:1326](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1326) - -___ - -### type - -• **type**: ``"ivf_pq"`` - -#### Defined in - -[index.ts:1333](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1333) - -___ - -### use\_opq - -• `Optional` **use\_opq**: `boolean` - -Train as optimized product quantization. 
- -#### Defined in - -[index.ts:1307](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1307) diff --git a/docs/src/javascript/interfaces/MergeInsertArgs.md b/docs/src/javascript/interfaces/MergeInsertArgs.md deleted file mode 100644 index 0f88d65a..00000000 --- a/docs/src/javascript/interfaces/MergeInsertArgs.md +++ /dev/null @@ -1,73 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / MergeInsertArgs - -# Interface: MergeInsertArgs - -## Table of contents - -### Properties - -- [whenMatchedUpdateAll](MergeInsertArgs.md#whenmatchedupdateall) -- [whenNotMatchedBySourceDelete](MergeInsertArgs.md#whennotmatchedbysourcedelete) -- [whenNotMatchedInsertAll](MergeInsertArgs.md#whennotmatchedinsertall) - -## Properties - -### whenMatchedUpdateAll - -• `Optional` **whenMatchedUpdateAll**: `string` \| `boolean` - -If true then rows that exist in both the source table (new data) and -the target table (old data) will be updated, replacing the old row -with the corresponding matching row. - -If there are multiple matches then the behavior is undefined. -Currently this causes multiple copies of the row to be created -but that behavior is subject to change. - -Optionally, a filter can be specified. This should be an SQL -filter where fields with the prefix "target." refer to fields -in the target table (old data) and fields with the prefix -"source." refer to fields in the source table (new data). For -example, the filter "target.lastUpdated < source.lastUpdated" will -only update matched rows when the incoming `lastUpdated` value is -newer. - -Rows that do not match the filter will not be updated. Rows that -do not match the filter do become "not matched" rows. 
- -#### Defined in - -[index.ts:690](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L690) - -___ - -### whenNotMatchedBySourceDelete - -• `Optional` **whenNotMatchedBySourceDelete**: `string` \| `boolean` - -If true then rows that exist only in the target table (old data) -will be deleted. - -If this is a string then it will be treated as an SQL filter and -only rows that both do not match any row in the source table and -match the given filter will be deleted. - -This can be used to replace a selection of existing data with -new data. - -#### Defined in - -[index.ts:707](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L707) - -___ - -### whenNotMatchedInsertAll - -• `Optional` **whenNotMatchedInsertAll**: `boolean` - -If true then rows that exist only in the source table (new data) -will be inserted into the target table. - -#### Defined in - -[index.ts:695](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L695) diff --git a/docs/src/javascript/interfaces/Table.md b/docs/src/javascript/interfaces/Table.md deleted file mode 100644 index 6a88c2e3..00000000 --- a/docs/src/javascript/interfaces/Table.md +++ /dev/null @@ -1,552 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / Table - -# Interface: Table\ - -A LanceDB Table is the collection of Records. Each Record has one or more vector fields. 
- -## Type parameters - -| Name | Type | -| :------ | :------ | -| `T` | `number`[] | - -## Implemented by - -- [`LocalTable`](../classes/LocalTable.md) - -## Table of contents - -### Properties - -- [add](Table.md#add) -- [countRows](Table.md#countrows) -- [createIndex](Table.md#createindex) -- [createScalarIndex](Table.md#createscalarindex) -- [delete](Table.md#delete) -- [indexStats](Table.md#indexstats) -- [listIndices](Table.md#listindices) -- [mergeInsert](Table.md#mergeinsert) -- [name](Table.md#name) -- [overwrite](Table.md#overwrite) -- [schema](Table.md#schema) -- [search](Table.md#search) -- [update](Table.md#update) - -### Methods - -- [addColumns](Table.md#addcolumns) -- [alterColumns](Table.md#altercolumns) -- [dropColumns](Table.md#dropcolumns) -- [filter](Table.md#filter) -- [withMiddleware](Table.md#withmiddleware) - -## Properties - -### add - -• **add**: (`data`: `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[]) => `Promise`\<`number`\> - -#### Type declaration - -▸ (`data`): `Promise`\<`number`\> - -Insert records into this Table. - -##### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `data` | `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[] | Records to be inserted into the Table | - -##### Returns - -`Promise`\<`number`\> - -The number of rows added to the table - -#### Defined in - -[index.ts:381](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L381) - -___ - -### countRows - -• **countRows**: (`filter?`: `string`) => `Promise`\<`number`\> - -#### Type declaration - -▸ (`filter?`): `Promise`\<`number`\> - -Returns the number of rows in this table. 
- -##### Parameters - -| Name | Type | -| :------ | :------ | -| `filter?` | `string` | - -##### Returns - -`Promise`\<`number`\> - -#### Defined in - -[index.ts:454](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L454) - -___ - -### createIndex - -• **createIndex**: (`indexParams`: [`IvfPQIndexConfig`](IvfPQIndexConfig.md)) => `Promise`\<`any`\> - -#### Type declaration - -▸ (`indexParams`): `Promise`\<`any`\> - -Create an ANN index on this Table vector index. - -##### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `indexParams` | [`IvfPQIndexConfig`](IvfPQIndexConfig.md) | The parameters of this Index, | - -##### Returns - -`Promise`\<`any`\> - -**`See`** - -VectorIndexParams. - -#### Defined in - -[index.ts:398](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L398) - -___ - -### createScalarIndex - -• **createScalarIndex**: (`column`: `string`, `replace?`: `boolean`) => `Promise`\<`void`\> - -#### Type declaration - -▸ (`column`, `replace?`): `Promise`\<`void`\> - -Create a scalar index on this Table for the given column - -##### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `column` | `string` | The column to index | -| `replace?` | `boolean` | If false, fail if an index already exists on the column it is always set to true for remote connections Scalar indices, like vector indices, can be used to speed up scans. A scalar index can speed up scans that contain filter expressions on the indexed column. 
For example, the following scan will be faster if the column `my_col` has a scalar index: ```ts const con = await lancedb.connect('./.lancedb'); const table = await con.openTable('images'); const results = await table.where('my_col = 7').execute(); ``` Scalar indices can also speed up scans containing a vector search and a prefilter: ```ts const con = await lancedb.connect('././lancedb'); const table = await con.openTable('images'); const results = await table.search([1.0, 2.0]).where('my_col != 7').prefilter(true); ``` Scalar indices can only speed up scans for basic filters using equality, comparison, range (e.g. `my_col BETWEEN 0 AND 100`), and set membership (e.g. `my_col IN (0, 1, 2)`) Scalar indices can be used if the filter contains multiple indexed columns and the filter criteria are AND'd or OR'd together (e.g. `my_col < 0 AND other_col> 100`) Scalar indices may be used if the filter contains non-indexed columns but, depending on the structure of the filter, they may not be usable. For example, if the column `not_indexed` does not have a scalar index then the filter `my_col = 0 OR not_indexed = 1` will not be able to use any scalar index on `my_col`. | - -##### Returns - -`Promise`\<`void`\> - -**`Examples`** - -```ts -const con = await lancedb.connect('././lancedb') -const table = await con.openTable('images') -await table.createScalarIndex('my_col') -``` - -#### Defined in - -[index.ts:449](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L449) - -___ - -### delete - -• **delete**: (`filter`: `string`) => `Promise`\<`void`\> - -#### Type declaration - -▸ (`filter`): `Promise`\<`void`\> - -Delete rows from this table. - -This can be used to delete a single row, many rows, all rows, or -sometimes no rows (if your predicate matches nothing). - -##### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `filter` | `string` | A filter in the same format used by a sql WHERE clause. The filter must not be empty. 
| - -##### Returns - -`Promise`\<`void`\> - -**`Examples`** - -```ts -const con = await lancedb.connect("./.lancedb") -const data = [ - {id: 1, vector: [1, 2]}, - {id: 2, vector: [3, 4]}, - {id: 3, vector: [5, 6]}, -]; -const tbl = await con.createTable("my_table", data) -await tbl.delete("id = 2") -await tbl.countRows() // Returns 2 -``` - -If you have a list of values to delete, you can combine them into a -stringified list and use the `IN` operator: - -```ts -const to_remove = [1, 5]; -await tbl.delete(`id IN (${to_remove.join(",")})`) -await tbl.countRows() // Returns 1 -``` - -#### Defined in - -[index.ts:488](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L488) - -___ - -### indexStats - -• **indexStats**: (`indexName`: `string`) => `Promise`\<[`IndexStats`](IndexStats.md)\> - -#### Type declaration - -▸ (`indexName`): `Promise`\<[`IndexStats`](IndexStats.md)\> - -Get statistics about an index. - -##### Parameters - -| Name | Type | -| :------ | :------ | -| `indexName` | `string` | - -##### Returns - -`Promise`\<[`IndexStats`](IndexStats.md)\> - -#### Defined in - -[index.ts:567](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L567) - -___ - -### listIndices - -• **listIndices**: () => `Promise`\<[`VectorIndex`](VectorIndex.md)[]\> - -#### Type declaration - -▸ (): `Promise`\<[`VectorIndex`](VectorIndex.md)[]\> - -List the indicies on this table. 
- -##### Returns - -`Promise`\<[`VectorIndex`](VectorIndex.md)[]\> - -#### Defined in - -[index.ts:562](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L562) - -___ - -### mergeInsert - -• **mergeInsert**: (`on`: `string`, `data`: `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[], `args`: [`MergeInsertArgs`](MergeInsertArgs.md)) => `Promise`\<`void`\> - -#### Type declaration - -▸ (`on`, `data`, `args`): `Promise`\<`void`\> - -Runs a "merge insert" operation on the table - -This operation can add rows, update rows, and remove rows all in a single -transaction. It is a very generic tool that can be used to create -behaviors like "insert if not exists", "update or insert (i.e. upsert)", -or even replace a portion of existing data with new data (e.g. replace -all data where month="january") - -The merge insert operation works by combining new data from a -**source table** with existing data in a **target table** by using a -join. There are three categories of records. - -"Matched" records are records that exist in both the source table and -the target table. "Not matched" records exist only in the source table -(e.g. these are new data) "Not matched by source" records exist only -in the target table (this is old data) - -The MergeInsertArgs can be used to customize what should happen for -each category of data. - -Please note that the data may appear to be reordered as part of this -operation. This is because updated rows will be deleted from the -dataset and then reinserted at the end with the new values. - -##### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `on` | `string` | a column to join on. This is how records from the source table and target table are matched. 
| -| `data` | `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[] | the new data to insert | -| `args` | [`MergeInsertArgs`](MergeInsertArgs.md) | parameters controlling how the operation should behave | - -##### Returns - -`Promise`\<`void`\> - -#### Defined in - -[index.ts:553](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L553) - -___ - -### name - -• **name**: `string` - -#### Defined in - -[index.ts:367](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L367) - -___ - -### overwrite - -• **overwrite**: (`data`: `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[]) => `Promise`\<`number`\> - -#### Type declaration - -▸ (`data`): `Promise`\<`number`\> - -Insert records into this Table, replacing its contents. - -##### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `data` | `Table`\<`any`\> \| `Record`\<`string`, `unknown`\>[] | Records to be inserted into the Table | - -##### Returns - -`Promise`\<`number`\> - -The number of rows added to the table - -#### Defined in - -[index.ts:389](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L389) - -___ - -### schema - -• **schema**: `Promise`\<`Schema`\<`any`\>\> - -#### Defined in - -[index.ts:571](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L571) - -___ - -### search - -• **search**: (`query`: `T`) => [`Query`](../classes/Query.md)\<`T`\> - -#### Type declaration - -▸ (`query`): [`Query`](../classes/Query.md)\<`T`\> - -Creates a search query to find the nearest neighbors of the given search term - -##### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `query` | `T` | The query search term | - -##### Returns - -[`Query`](../classes/Query.md)\<`T`\> - -#### Defined in - -[index.ts:373](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L373) - -___ - -### update - -• **update**: (`args`: [`UpdateArgs`](UpdateArgs.md) \| [`UpdateSqlArgs`](UpdateSqlArgs.md)) => 
`Promise`\<`void`\> - -#### Type declaration - -▸ (`args`): `Promise`\<`void`\> - -Update rows in this table. - -This can be used to update a single row, many rows, all rows, or -sometimes no rows (if your predicate matches nothing). - -##### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `args` | [`UpdateArgs`](UpdateArgs.md) \| [`UpdateSqlArgs`](UpdateSqlArgs.md) | see [UpdateArgs](UpdateArgs.md) and [UpdateSqlArgs](UpdateSqlArgs.md) for more details | - -##### Returns - -`Promise`\<`void`\> - -**`Examples`** - -```ts -const con = await lancedb.connect("./.lancedb") -const data = [ - {id: 1, vector: [3, 3], name: 'Ye'}, - {id: 2, vector: [4, 4], name: 'Mike'}, -]; -const tbl = await con.createTable("my_table", data) - -await tbl.update({ - where: "id = 2", - values: { vector: [2, 2], name: "Michael" }, -}) - -let results = await tbl.search([1, 1]).execute(); -// Returns [ -// {id: 2, vector: [2, 2], name: 'Michael'} -// {id: 1, vector: [3, 3], name: 'Ye'} -// ] -``` - -#### Defined in - -[index.ts:521](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L521) - -## Methods - -### addColumns - -▸ **addColumns**(`newColumnTransforms`): `Promise`\<`void`\> - -Add new columns with defined values. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `newColumnTransforms` | \{ `name`: `string` ; `valueSql`: `string` }[] | pairs of column names and the SQL expression to use to calculate the value of the new column. These expressions will be evaluated for each row in the table, and can reference existing columns in the table. | - -#### Returns - -`Promise`\<`void`\> - -#### Defined in - -[index.ts:582](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L582) - -___ - -### alterColumns - -▸ **alterColumns**(`columnAlterations`): `Promise`\<`void`\> - -Alter the name or nullability of columns. 
- -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `columnAlterations` | [`ColumnAlteration`](ColumnAlteration.md)[] | One or more alterations to apply to columns. | - -#### Returns - -`Promise`\<`void`\> - -#### Defined in - -[index.ts:591](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L591) - -___ - -### dropColumns - -▸ **dropColumns**(`columnNames`): `Promise`\<`void`\> - -Drop one or more columns from the dataset - -This is a metadata-only operation and does not remove the data from the -underlying storage. In order to remove the data, you must subsequently -call ``compact_files`` to rewrite the data without the removed columns and -then call ``cleanup_files`` to remove the old files. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `columnNames` | `string`[] | The names of the columns to drop. These can be nested column references (e.g. "a.b.c") or top-level column names (e.g. "a"). | - -#### Returns - -`Promise`\<`void`\> - -#### Defined in - -[index.ts:605](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L605) - -___ - -### filter - -▸ **filter**(`value`): [`Query`](../classes/Query.md)\<`T`\> - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `value` | `string` | - -#### Returns - -[`Query`](../classes/Query.md)\<`T`\> - -#### Defined in - -[index.ts:569](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L569) - -___ - -### withMiddleware - -▸ **withMiddleware**(`middleware`): [`Table`](Table.md)\<`T`\> - -Instrument the behavior of this Table with middleware. - -The middleware will be called in the order they are added. - -Currently this functionality is only supported for remote tables. 
- -#### Parameters - -| Name | Type | -| :------ | :------ | -| `middleware` | `HttpMiddleware` | - -#### Returns - -[`Table`](Table.md)\<`T`\> - -- this Table instrumented by the passed middleware - -#### Defined in - -[index.ts:617](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L617) diff --git a/docs/src/javascript/interfaces/UpdateArgs.md b/docs/src/javascript/interfaces/UpdateArgs.md deleted file mode 100644 index 7a30a20c..00000000 --- a/docs/src/javascript/interfaces/UpdateArgs.md +++ /dev/null @@ -1,36 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / UpdateArgs - -# Interface: UpdateArgs - -## Table of contents - -### Properties - -- [values](UpdateArgs.md#values) -- [where](UpdateArgs.md#where) - -## Properties - -### values - -• **values**: `Record`\<`string`, `Literal`\> - -A key-value map of updates. The keys are the column names, and the values are the -new values to set - -#### Defined in - -[index.ts:652](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L652) - -___ - -### where - -• `Optional` **where**: `string` - -A filter in the same format used by a sql WHERE clause. The filter may be empty, -in which case all rows will be updated. - -#### Defined in - -[index.ts:646](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L646) diff --git a/docs/src/javascript/interfaces/UpdateSqlArgs.md b/docs/src/javascript/interfaces/UpdateSqlArgs.md deleted file mode 100644 index d979125b..00000000 --- a/docs/src/javascript/interfaces/UpdateSqlArgs.md +++ /dev/null @@ -1,36 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / UpdateSqlArgs - -# Interface: UpdateSqlArgs - -## Table of contents - -### Properties - -- [valuesSql](UpdateSqlArgs.md#valuessql) -- [where](UpdateSqlArgs.md#where) - -## Properties - -### valuesSql - -• **valuesSql**: `Record`\<`string`, `string`\> - -A key-value map of updates. 
The keys are the column names, and the values are the -new values to set as SQL expressions. - -#### Defined in - -[index.ts:666](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L666) - -___ - -### where - -• `Optional` **where**: `string` - -A filter in the same format used by a sql WHERE clause. The filter may be empty, -in which case all rows will be updated. - -#### Defined in - -[index.ts:660](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L660) diff --git a/docs/src/javascript/interfaces/VectorIndex.md b/docs/src/javascript/interfaces/VectorIndex.md deleted file mode 100644 index e1fbeab0..00000000 --- a/docs/src/javascript/interfaces/VectorIndex.md +++ /dev/null @@ -1,52 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / VectorIndex - -# Interface: VectorIndex - -## Table of contents - -### Properties - -- [columns](VectorIndex.md#columns) -- [name](VectorIndex.md#name) -- [status](VectorIndex.md#status) -- [uuid](VectorIndex.md#uuid) - -## Properties - -### columns - -• **columns**: `string`[] - -#### Defined in - -[index.ts:718](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L718) - -___ - -### name - -• **name**: `string` - -#### Defined in - -[index.ts:719](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L719) - -___ - -### status - -• **status**: [`IndexStatus`](../enums/IndexStatus.md) - -#### Defined in - -[index.ts:721](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L721) - -___ - -### uuid - -• **uuid**: `string` - -#### Defined in - -[index.ts:720](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L720) diff --git a/docs/src/javascript/interfaces/WriteOptions.md b/docs/src/javascript/interfaces/WriteOptions.md deleted file mode 100644 index 20be78ae..00000000 --- a/docs/src/javascript/interfaces/WriteOptions.md +++ /dev/null @@ -1,27 +0,0 @@ -[vectordb](../README.md) / [Exports](../modules.md) / WriteOptions - -# Interface: 
WriteOptions - -Write options when creating a Table. - -## Implemented by - -- [`DefaultWriteOptions`](../classes/DefaultWriteOptions.md) - -## Table of contents - -### Properties - -- [writeMode](WriteOptions.md#writemode) - -## Properties - -### writeMode - -• `Optional` **writeMode**: [`WriteMode`](../enums/WriteMode.md) - -A [WriteMode](../enums/WriteMode.md) to use on this operation - -#### Defined in - -[index.ts:1355](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1355) diff --git a/docs/src/javascript/modules.md b/docs/src/javascript/modules.md deleted file mode 100644 index d1796302..00000000 --- a/docs/src/javascript/modules.md +++ /dev/null @@ -1,271 +0,0 @@ -[vectordb](README.md) / Exports - -# vectordb - -## Table of contents - -### Enumerations - -- [IndexStatus](enums/IndexStatus.md) -- [MetricType](enums/MetricType.md) -- [WriteMode](enums/WriteMode.md) - -### Classes - -- [DefaultWriteOptions](classes/DefaultWriteOptions.md) -- [LocalConnection](classes/LocalConnection.md) -- [LocalTable](classes/LocalTable.md) -- [MakeArrowTableOptions](classes/MakeArrowTableOptions.md) -- [OpenAIEmbeddingFunction](classes/OpenAIEmbeddingFunction.md) -- [Query](classes/Query.md) - -### Interfaces - -- [AwsCredentials](interfaces/AwsCredentials.md) -- [CleanupStats](interfaces/CleanupStats.md) -- [ColumnAlteration](interfaces/ColumnAlteration.md) -- [CompactionMetrics](interfaces/CompactionMetrics.md) -- [CompactionOptions](interfaces/CompactionOptions.md) -- [Connection](interfaces/Connection.md) -- [ConnectionOptions](interfaces/ConnectionOptions.md) -- [CreateTableOptions](interfaces/CreateTableOptions.md) -- [EmbeddingFunction](interfaces/EmbeddingFunction.md) -- [IndexStats](interfaces/IndexStats.md) -- [IvfPQIndexConfig](interfaces/IvfPQIndexConfig.md) -- [MergeInsertArgs](interfaces/MergeInsertArgs.md) -- [Table](interfaces/Table.md) -- [UpdateArgs](interfaces/UpdateArgs.md) -- [UpdateSqlArgs](interfaces/UpdateSqlArgs.md) -- 
[VectorIndex](interfaces/VectorIndex.md) -- [WriteOptions](interfaces/WriteOptions.md) - -### Type Aliases - -- [VectorIndexParams](modules.md#vectorindexparams) - -### Functions - -- [connect](modules.md#connect) -- [convertToTable](modules.md#converttotable) -- [isWriteOptions](modules.md#iswriteoptions) -- [makeArrowTable](modules.md#makearrowtable) - -## Type Aliases - -### VectorIndexParams - -Ƭ **VectorIndexParams**: [`IvfPQIndexConfig`](interfaces/IvfPQIndexConfig.md) - -#### Defined in - -[index.ts:1336](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1336) - -## Functions - -### connect - -▸ **connect**(`uri`): `Promise`\<[`Connection`](interfaces/Connection.md)\> - -Connect to a LanceDB instance at the given URI. - -Accepted formats: - -- `/path/to/database` - local database -- `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud storage -- `db://host:port` - remote database (LanceDB cloud) - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `uri` | `string` | The uri of the database. If the database uri starts with `db://` then it connects to a remote database. | - -#### Returns - -`Promise`\<[`Connection`](interfaces/Connection.md)\> - -**`See`** - -[ConnectionOptions](interfaces/ConnectionOptions.md) for more details on the URI format. - -#### Defined in - -[index.ts:188](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L188) - -▸ **connect**(`opts`): `Promise`\<[`Connection`](interfaces/Connection.md)\> - -Connect to a LanceDB instance with connection options. - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `opts` | `Partial`\<[`ConnectionOptions`](interfaces/ConnectionOptions.md)\> | The [ConnectionOptions](interfaces/ConnectionOptions.md) to use when connecting to the database. 
| - -#### Returns - -`Promise`\<[`Connection`](interfaces/Connection.md)\> - -#### Defined in - -[index.ts:194](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L194) - -___ - -### convertToTable - -▸ **convertToTable**\<`T`\>(`data`, `embeddings?`, `makeTableOptions?`): `Promise`\<`ArrowTable`\> - -#### Type parameters - -| Name | -| :------ | -| `T` | - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `data` | `Record`\<`string`, `unknown`\>[] | -| `embeddings?` | [`EmbeddingFunction`](interfaces/EmbeddingFunction.md)\<`T`\> | -| `makeTableOptions?` | `Partial`\<[`MakeArrowTableOptions`](classes/MakeArrowTableOptions.md)\> | - -#### Returns - -`Promise`\<`ArrowTable`\> - -#### Defined in - -[arrow.ts:465](https://github.com/lancedb/lancedb/blob/92179835/node/src/arrow.ts#L465) - -___ - -### isWriteOptions - -▸ **isWriteOptions**(`value`): value is WriteOptions - -#### Parameters - -| Name | Type | -| :------ | :------ | -| `value` | `any` | - -#### Returns - -value is WriteOptions - -#### Defined in - -[index.ts:1362](https://github.com/lancedb/lancedb/blob/92179835/node/src/index.ts#L1362) - -___ - -### makeArrowTable - -▸ **makeArrowTable**(`data`, `options?`): `ArrowTable` - -An enhanced version of the makeTable function from Apache Arrow -that supports nested fields and embeddings columns. - -This function converts an array of Record (row-major JS objects) -to an Arrow Table (a columnar structure) - -Note that it currently does not support nulls. - -If a schema is provided then it will be used to determine the resulting array -types. Fields will also be reordered to fit the order defined by the schema. - -If a schema is not provided then the types will be inferred and the field order -will be controlled by the order of properties in the first record. - -If the input is empty then a schema must be provided to create an empty table. - -When a schema is not specified then data types will be inferred. 
The inference -rules are as follows: - - - boolean => Bool - - number => Float64 - - String => Utf8 - - Buffer => Binary - - Record => Struct - - Array => List - -#### Parameters - -| Name | Type | Description | -| :------ | :------ | :------ | -| `data` | `Record`\<`string`, `any`\>[] | input data | -| `options?` | `Partial`\<[`MakeArrowTableOptions`](classes/MakeArrowTableOptions.md)\> | options to control the makeArrowTable call. | - -#### Returns - -`ArrowTable` - -**`Example`** - -```ts - -import { fromTableToBuffer, makeArrowTable } from "../arrow"; -import { Field, FixedSizeList, Float16, Float32, Int32, Schema } from "apache-arrow"; - -const schema = new Schema([ - new Field("a", new Int32()), - new Field("b", new Float32()), - new Field("c", new FixedSizeList(3, new Field("item", new Float16()))), - ]); - const table = makeArrowTable([ - { a: 1, b: 2, c: [1, 2, 3] }, - { a: 4, b: 5, c: [4, 5, 6] }, - { a: 7, b: 8, c: [7, 8, 9] }, - ], { schema }); -``` - -By default it assumes that the column named `vector` is a vector column -and it will be converted into a fixed size list array of type float32. -The `vectorColumns` option can be used to support other vector column -names and data types. 
- -```ts - -const schema = new Schema([ - new Field("a", new Float64()), - new Field("b", new Float64()), - new Field( - "vector", - new FixedSizeList(3, new Field("item", new Float32())) - ), - ]); - const table = makeArrowTable([ - { a: 1, b: 2, vector: [1, 2, 3] }, - { a: 4, b: 5, vector: [4, 5, 6] }, - { a: 7, b: 8, vector: [7, 8, 9] }, - ]); - assert.deepEqual(table.schema, schema); -``` - -You can specify the vector column types and names using the options as well - -```typescript - -const schema = new Schema([ - new Field('a', new Float64()), - new Field('b', new Float64()), - new Field('vec1', new FixedSizeList(3, new Field('item', new Float16()))), - new Field('vec2', new FixedSizeList(3, new Field('item', new Float16()))) - ]); -const table = makeArrowTable([ - { a: 1, b: 2, vec1: [1, 2, 3], vec2: [2, 4, 6] }, - { a: 4, b: 5, vec1: [4, 5, 6], vec2: [8, 10, 12] }, - { a: 7, b: 8, vec1: [7, 8, 9], vec2: [14, 16, 18] } - ], { - vectorColumns: { - vec1: { type: new Float16() }, - vec2: { type: new Float16() } - } - } -assert.deepEqual(table.schema, schema) -``` - -#### Defined in - -[arrow.ts:198](https://github.com/lancedb/lancedb/blob/92179835/node/src/arrow.ts#L198) diff --git a/docs/src/js/_media/CONTRIBUTING.md b/docs/src/js/_media/CONTRIBUTING.md index c8a347ea..881799ae 100644 --- a/docs/src/js/_media/CONTRIBUTING.md +++ b/docs/src/js/_media/CONTRIBUTING.md @@ -1,7 +1,7 @@ # Contributing to LanceDB Typescript This document outlines the process for contributing to LanceDB Typescript. -For general contribution guidelines, see [CONTRIBUTING.md](../CONTRIBUTING.md). +For general contribution guidelines, see [CONTRIBUTING.md](../../../../CONTRIBUTING.md). 
## Project layout diff --git a/docs/src/migration.md b/docs/src/migration.md deleted file mode 100644 index e84ad91a..00000000 --- a/docs/src/migration.md +++ /dev/null @@ -1,124 +0,0 @@ -# Rust-backed Client Migration Guide - -In an effort to ensure all clients have the same set of capabilities we have -migrated the Python and Node clients onto a common Rust base library. In Python, -both the synchronous and asynchronous clients are based on this implementation. -In Node, the new client is available as `@lancedb/lancedb`, which replaces -the existing `vectordb` package. - -This guide describes the differences between the two Node APIs and will hopefully assist users -that would like to migrate to the new API. - -## TypeScript/JavaScript - -For JS/TS users, we offer a brand new SDK [@lancedb/lancedb](https://www.npmjs.com/package/@lancedb/lancedb) - -We tried to keep the API as similar as possible to the previous version, but there are a few small changes. Here are the most important ones: - -### Creating Tables - -[CreateTableOptions.writeOptions.writeMode](./javascript/interfaces/WriteOptions.md#writemode) has been replaced with [CreateTableOptions.mode](./js/interfaces/CreateTableOptions.md#mode) - -=== "vectordb (deprecated)" - - ```ts - db.createTable(tableName, data, { writeMode: lancedb.WriteMode.Overwrite }); - ``` - -=== "@lancedb/lancedb" - - ```ts - db.createTable(tableName, data, { mode: "overwrite" }) - ``` - -### Changes to Table APIs - -Previously `Table.schema` was a property. Now it is an async method. - -#### Creating Indices - -The `Table.createIndex` method is now used for creating both vector indices -and scalar indices. It currently requires a column name to be specified (the -column to index). Vector index defaults are now smarter and scale better with -the size of the data. 
- -=== "vectordb (deprecated)" - - ```ts - await tbl.createIndex({ - column: "vector", // default - type: "ivf_pq", - num_partitions: 2, - num_sub_vectors: 2, - }); - ``` - -=== "@lancedb/lancedb" - - ```ts - await table.createIndex("vector", { - config: lancedb.Index.ivfPq({ - numPartitions: 2, - numSubVectors: 2, - }), - }); - ``` - -### Embedding Functions - -The embedding API has been completely reworked, and it now more closely resembles the Python API, including the new [embedding registry](./js/classes/embedding.EmbeddingFunctionRegistry.md): - -=== "vectordb (deprecated)" - - ```ts - - const embeddingFunction = new lancedb.OpenAIEmbeddingFunction('text', API_KEY) - const data = [ - { id: 1, text: 'Black T-Shirt', price: 10 }, - { id: 2, text: 'Leather Jacket', price: 50 } - ] - const table = await db.createTable('vectors', data, embeddingFunction) - ``` - -=== "@lancedb/lancedb" - - ```ts - import * as lancedb from "@lancedb/lancedb"; - import * as arrow from "apache-arrow"; - import { LanceSchema, getRegistry } from "@lancedb/lancedb/embedding"; - - const func = getRegistry().get("openai").create({apiKey: API_KEY}); - - const data = [ - { id: 1, text: 'Black T-Shirt', price: 10 }, - { id: 2, text: 'Leather Jacket', price: 50 } - ] - - const table = await db.createTable('vectors', data, { - embeddingFunction: { - sourceColumn: "text", - function: func, - } - }) - - ``` - -You can also use a schema driven approach, which parallels the Pydantic integration in our Python SDK: - -```ts -const func = getRegistry().get("openai").create({apiKey: API_KEY}); - -const data = [ - { id: 1, text: 'Black T-Shirt', price: 10 }, - { id: 2, text: 'Leather Jacket', price: 50 } -] -const schema = LanceSchema({ - id: new arrow.Int32(), - text: func.sourceField(new arrow.Utf8()), - price: new arrow.Float64(), - vector: func.vectorField() -}) - -const table = await db.createTable('vectors', data, {schema}) - -``` diff --git 
a/docs/src/notebooks/DisappearingEmbeddingFunction.ipynb b/docs/src/notebooks/DisappearingEmbeddingFunction.ipynb deleted file mode 100644 index 0bf864aa..00000000 --- a/docs/src/notebooks/DisappearingEmbeddingFunction.ipynb +++ /dev/null @@ -1,764 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "88c1af18", - "metadata": {}, - "source": [ - "# Example - MultiModal CLIP Embeddings" - ] - }, - { - "cell_type": "markdown", - "id": "c6b5d346-2c2a-4341-a132-00e53543f8d1", - "metadata": { - "id": "c6b5d346-2c2a-4341-a132-00e53543f8d1" - }, - "source": [ - "# The Disappearing Embedding Function\n", - "\n", - "Previously, to use vector databases, you had to do the embedding process yourself and interact with the system using vectors directly.\n", - "With this new release of LanceDB, we make it much more convenient so you don't need to worry about that at all.\n", - "\n", - "1. We present you with sentence-transformer, openai, and openclip embedding functions that can be saved directly as table metadata\n", - "2. You no longer have to generate the vectors directly either during query time or ingestion time\n", - "3. The embedding function interface is extensible so you can create your own\n", - "4. 
The function is persisted as table metadata so you can use it across sessions" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "4c25eb9d-9e05-4133-927e-747516cb9310", - "metadata": { - "id": "4c25eb9d-9e05-4133-927e-747516cb9310" - }, - "outputs": [], - "source": [ - "import lancedb" - ] - }, - { - "cell_type": "markdown", - "id": "db4bd459-9bab-4803-bbe8-20201b445245", - "metadata": { - "id": "db4bd459-9bab-4803-bbe8-20201b445245" - }, - "source": [ - "## Multi-modal search made easy\n", - "\n", - "In this example we'll go over multi-modal image search using:\n", - "- Oxford Pet dataset\n", - "- OpenClip model\n", - "- LanceDB" - ] - }, - { - "cell_type": "markdown", - "id": "4ddd2873-0aa7-4869-bb20-21c85477ba29", - "metadata": { - "id": "4ddd2873-0aa7-4869-bb20-21c85477ba29" - }, - "source": [ - "### Data" - ] - }, - { - "cell_type": "markdown", - "id": "b36f56d3-0794-4018-a397-6a8f3e1b0050", - "metadata": { - "id": "b36f56d3-0794-4018-a397-6a8f3e1b0050" - }, - "source": [ - "First, download the dataset from https://www.robots.ox.ac.uk/~vgg/data/pets/\n", - "Specifically, download the [images.tar.gz](https://thor.robots.ox.ac.uk/~vgg/data/pets/images.tar.gz)\n", - "\n", - "This notebook assumes you've downloaded it into your ~/Downloads directory.\n", - "When you extract the tarball, it will create an `images` directory." - ] - }, - { - "cell_type": "markdown", - "id": "c5dae94b-ad78-41d4-aa45-06cb96a0fff1", - "metadata": { - "id": "c5dae94b-ad78-41d4-aa45-06cb96a0fff1" - }, - "source": [ - "### Define embedding function\n", - "\n", - "We'll use the OpenClipEmbeddingFunction here for multi-modal image search." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "d4bcd5f5-29a2-4b81-9262-852ef456db9f", - "metadata": { - "id": "d4bcd5f5-29a2-4b81-9262-852ef456db9f" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/saksham/Documents/lancedb/env/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "Downloading (…)ip_pytorch_model.bin: 100%|██████████| 605M/605M [00:41<00:00, 14.6MB/s] \n" - ] - } - ], - "source": [ - "from lancedb.embeddings import EmbeddingFunctionRegistry\n", - "\n", - "registry = EmbeddingFunctionRegistry.get_instance()\n", - "clip = registry.get(\"open-clip\").create()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "de72bf3c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting open_clip_torch\n", - " Downloading open_clip_torch-2.20.0-py3-none-any.whl (1.5 MB)\n", - "\u001b[K |████████████████████████████████| 1.5 MB 771 kB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: regex in /home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from open_clip_torch) (2023.10.3)\n", - "Requirement already satisfied: tqdm in /home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from open_clip_torch) (4.66.1)\n", - "Collecting torchvision\n", - " Downloading torchvision-0.16.0-cp38-cp38-manylinux1_x86_64.whl (6.9 MB)\n", - "\u001b[K |████████████████████████████████| 6.9 MB 21.0 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting huggingface-hub\n", - " Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)\n", - "\u001b[K |████████████████████████████████| 295 kB 43.1 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting protobuf<4\n", - " Using cached 
protobuf-3.20.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)\n", - "Collecting timm\n", - " Downloading timm-0.9.7-py3-none-any.whl (2.2 MB)\n", - "\u001b[K |████████████████████████████████| 2.2 MB 28.3 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting sentencepiece\n", - " Downloading sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", - "\u001b[K |████████████████████████████████| 1.3 MB 39.9 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting torch>=1.9.0\n", - " Downloading torch-2.1.0-cp38-cp38-manylinux1_x86_64.whl (670.2 MB)\n", - "\u001b[K |████████████████████████████████| 670.2 MB 47 kB/s s eta 0:00:01\n", - "\u001b[?25hCollecting ftfy\n", - " Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)\n", - "\u001b[K |████████████████████████████████| 53 kB 2.3 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting pillow!=8.3.*,>=5.3.0\n", - " Using cached Pillow-10.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.5 MB)\n", - "Requirement already satisfied: requests in /home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from torchvision->open_clip_torch) (2.31.0)\n", - "Requirement already satisfied: numpy in /home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from torchvision->open_clip_torch) (1.24.4)\n", - "Requirement already satisfied: packaging>=20.9 in /home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from huggingface-hub->open_clip_torch) (23.2)\n", - "Collecting fsspec\n", - " Downloading fsspec-2023.9.2-py3-none-any.whl (173 kB)\n", - "\u001b[K |████████████████████████████████| 173 kB 22.0 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting filelock\n", - " Using cached filelock-3.12.4-py3-none-any.whl (11 kB)\n", - "Requirement already satisfied: pyyaml>=5.1 in /home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from huggingface-hub->open_clip_torch) (6.0.1)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in 
/home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from huggingface-hub->open_clip_torch) (4.8.0)\n", - "Collecting safetensors\n", - " Downloading safetensors-0.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", - "\u001b[K |████████████████████████████████| 1.3 MB 22.8 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting networkx\n", - " Downloading networkx-3.1-py3-none-any.whl (2.1 MB)\n", - "\u001b[K |████████████████████████████████| 2.1 MB 16.6 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting triton==2.1.0; platform_system == \"Linux\" and platform_machine == \"x86_64\"\n", - " Downloading triton-2.1.0-0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (89.2 MB)\n", - "\u001b[K |████████████████████████████████| 89.2 MB 31.6 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting nvidia-curand-cu12==10.3.2.106; platform_system == \"Linux\" and platform_machine == \"x86_64\"\n", - " Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n", - "\u001b[K |████████████████████████████████| 56.5 MB 15.9 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting nvidia-nvtx-cu12==12.1.105; platform_system == \"Linux\" and platform_machine == \"x86_64\"\n", - " Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n", - "\u001b[K |████████████████████████████████| 99 kB 9.4 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting sympy\n", - " Downloading sympy-1.12-py3-none-any.whl (5.7 MB)\n", - "\u001b[K |████████████████████████████████| 5.7 MB 16.4 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting nvidia-cusparse-cu12==12.1.0.106; platform_system == \"Linux\" and platform_machine == \"x86_64\"\n", - " Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n", - "\u001b[K |████████████████████████████████| 196.0 MB 78 kB/s eta 0:00:011\n", - "\u001b[?25hCollecting nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == \"Linux\" and platform_machine == \"x86_64\"\n", - " Downloading 
nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n", - "\u001b[K |████████████████████████████████| 23.7 MB 619 kB/s eta 0:00:011\n", - "\u001b[?25hCollecting nvidia-cufft-cu12==11.0.2.54; platform_system == \"Linux\" and platform_machine == \"x86_64\"\n", - " Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n", - "\u001b[K |████████████████████████████████| 121.6 MB 93 kB/s s eta 0:00:01\n", - "\u001b[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105; platform_system == \"Linux\" and platform_machine == \"x86_64\"\n", - " Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n", - "\u001b[K |████████████████████████████████| 14.1 MB 19.5 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: jinja2 in /home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from torch>=1.9.0->open_clip_torch) (3.1.2)\n", - "Collecting nvidia-nccl-cu12==2.18.1; platform_system == \"Linux\" and platform_machine == \"x86_64\"\n", - " Downloading nvidia_nccl_cu12-2.18.1-py3-none-manylinux1_x86_64.whl (209.8 MB)\n", - "\u001b[K |████████████████████████████████| 209.8 MB 5.2 kB/s eta 0:00:01 |███████████████████████████████▊| 208.2 MB 17.0 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting nvidia-cudnn-cu12==8.9.2.26; platform_system == \"Linux\" and platform_machine == \"x86_64\"\n", - " Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n", - "\u001b[K |████████████████████████████████| 731.7 MB 22 kB/s eta 0:00:011\n", - "\u001b[?25hCollecting nvidia-cublas-cu12==12.1.3.1; platform_system == \"Linux\" and platform_machine == \"x86_64\"\n", - " Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n", - "\u001b[K |████████████████████████████████| 410.6 MB 9.2 kB/s eta 0:00:012\n", - "\u001b[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105; platform_system == \"Linux\" and platform_machine == \"x86_64\"\n", - " Downloading 
nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n", - "\u001b[K |████████████████████████████████| 823 kB 18.5 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting nvidia-cusolver-cu12==11.4.5.107; platform_system == \"Linux\" and platform_machine == \"x86_64\"\n", - " Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n", - "\u001b[K |████████████████████████████████| 124.2 MB 43 kB/s s eta 0:00:01ta 0:00:02\n", - "\u001b[?25hRequirement already satisfied: wcwidth>=0.2.5 in /home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from ftfy->open_clip_torch) (0.2.8)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from requests->torchvision->open_clip_torch) (2023.7.22)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from requests->torchvision->open_clip_torch) (2.0.6)\n", - "Requirement already satisfied: idna<4,>=2.5 in /home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from requests->torchvision->open_clip_torch) (3.4)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from requests->torchvision->open_clip_torch) (3.3.0)\n", - "Collecting mpmath>=0.19\n", - " Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)\n", - "\u001b[K |████████████████████████████████| 536 kB 14.2 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting nvidia-nvjitlink-cu12\n", - " Downloading nvidia_nvjitlink_cu12-12.2.140-py3-none-manylinux1_x86_64.whl (20.2 MB)\n", - "\u001b[K |████████████████████████████████| 20.2 MB 14.3 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: MarkupSafe>=2.0 in /home/saksham/Documents/lancedb/env/lib/python3.8/site-packages (from jinja2->torch>=1.9.0->open_clip_torch) (2.1.3)\n", - "Installing collected packages: pillow, networkx, filelock, 
triton, nvidia-curand-cu12, nvidia-nvtx-cu12, mpmath, sympy, nvidia-nvjitlink-cu12, nvidia-cusparse-cu12, fsspec, nvidia-cuda-nvrtc-cu12, nvidia-cufft-cu12, nvidia-cuda-cupti-cu12, nvidia-nccl-cu12, nvidia-cublas-cu12, nvidia-cudnn-cu12, nvidia-cuda-runtime-cu12, nvidia-cusolver-cu12, torch, torchvision, huggingface-hub, protobuf, safetensors, timm, sentencepiece, ftfy, open-clip-torch\n", - "Successfully installed filelock-3.12.4 fsspec-2023.9.2 ftfy-6.1.1 huggingface-hub-0.17.3 mpmath-1.3.0 networkx-3.1 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.18.1 nvidia-nvjitlink-cu12-12.2.140 nvidia-nvtx-cu12-12.1.105 open-clip-torch-2.20.0 pillow-10.0.1 protobuf-3.20.3 safetensors-0.3.3 sentencepiece-0.1.99 sympy-1.12 timm-0.9.7 torch-2.1.0 torchvision-0.16.0 triton-2.1.0\n" - ] - } - ], - "source": [ - "!pip install open_clip_torch" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "be5844c4-a7a7-49cb-b2bd-0253de814161", - "metadata": { - "id": "be5844c4-a7a7-49cb-b2bd-0253de814161", - "outputId": "7ea0aefa-74d4-447b-f14c-b8c6e389068c" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "OpenClipEmbeddings(name='ViT-B-32', pretrained='laion2b_s34b_b79k', device='cpu', batch_size=64, normalize=True)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "clip" - ] - }, - { - "cell_type": "markdown", - "id": "ab96cbca-3dbf-4934-81b7-f8836e18509f", - "metadata": { - "id": "ab96cbca-3dbf-4934-81b7-f8836e18509f" - }, - "source": [ - "### The data model\n", - "\n", - "We'll declare a new model that subclasses LanceModel (special pydantic model) to represent the table.\n", - "This table has two columns, one for the image_uri and one for the vector generated 
from those images.\n", - "The embedding function defines the number of dimensions in its vectors so you don't need to\n", - "look it up.\n", - "\n", - "We use the `VectorField` method from the embedding function to annotate the model\n", - "so that LanceDB knows to use the open-clip embedding function to generate query embeddings that\n", - "correspond to the `vector` column.\n", - "\n", - "We also use the `SourceField` so that when adding data, LanceDB knows to automatically use\n", - "open-clip to encode the input images.\n", - "\n", - "Finally, because we're working with images, we add a convenience property `image` to open the image and\n", - "return a PIL Image so it can be visualized in Jupyter Notebook" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e2e7f4b-788a-4396-9c79-7e5ede47e6a1", - "metadata": { - "id": "4e2e7f4b-788a-4396-9c79-7e5ede47e6a1" - }, - "outputs": [], - "source": [ - "from PIL import Image\n", - "from lancedb.pydantic import LanceModel, Vector\n", - "\n", - "class Pets(LanceModel):\n", - " vector: Vector(clip.ndims()) = clip.VectorField()\n", - " image_uri: str = clip.SourceField()\n", - "\n", - " @property\n", - " def image(self):\n", - " return Image.open(self.image_uri)" - ] - }, - { - "cell_type": "markdown", - "id": "74b54c67-0ee0-47e7-8b72-97fec7ce4140", - "metadata": { - "id": "74b54c67-0ee0-47e7-8b72-97fec7ce4140" - }, - "source": [ - "### Create the table" - ] - }, - { - "cell_type": "markdown", - "id": "89f4b38a-4636-4ad7-b3ca-0aa46aba6afc", - "metadata": { - "id": "89f4b38a-4636-4ad7-b3ca-0aa46aba6afc" - }, - "source": [ - "First we connect to a local lancedb directory" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f68cff2-0fdb-4748-ba4d-5e65e5a2b4f4", - "metadata": { - "id": "9f68cff2-0fdb-4748-ba4d-5e65e5a2b4f4" - }, - "outputs": [], - "source": [ - "db = lancedb.connect(\"~/.lancedb\")" - ] - }, - { - "cell_type": "markdown", - "id": "cbdf5a63-8217-4110-8aa9-42946c1a0026", 
- "metadata": { - "id": "cbdf5a63-8217-4110-8aa9-42946c1a0026" - }, - "source": [ - "Next we get all of the paths for the images we downloaded and create a table.\n", - "Notice that we didn't have to worry about generating the image embeddings ourselves." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "196dfc99-aa6e-48a2-a7ca-c1e2db1ac674", - "metadata": { - "id": "196dfc99-aa6e-48a2-a7ca-c1e2db1ac674" - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from pathlib import Path\n", - "from random import sample\n", - "\n", - "if \"pets\" in db:\n", - " table = db[\"pets\"]\n", - "else:\n", - " table = db.create_table(\"pets\", schema=Pets)\n", - " # use a sampling of 1000 images\n", - " p = Path(\"~/Downloads/images\").expanduser()\n", - " uris = [str(f) for f in p.glob(\"*.jpg\")]\n", - " uris = sample(uris, 1000)\n", - " table.add(pd.DataFrame({\"image_uri\": uris}))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ac08735-602f-4dfc-be47-486265851ad1", - "metadata": { - "id": "0ac08735-602f-4dfc-be47-486265851ad1", - "outputId": "8c027412-1a1a-449c-c641-6b2db4c2cb92" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vectorimage_uri
0[0.018789755, 0.11621179, -0.09760579, -0.0268.../Users/changshe/Downloads/images/leonberger_14...
1[0.021960497, 0.06073219, -0.1625527, 0.021481.../Users/changshe/Downloads/images/havanese_63.jpg
2[0.0074375155, 0.084355146, -0.027461205, -0.0.../Users/changshe/Downloads/images/english_cocke...
3[-0.01220356, 0.020815236, -0.08587208, -0.027.../Users/changshe/Downloads/images/shiba_inu_143...
4[-0.010112503, 0.14021927, -0.14588796, -0.046.../Users/changshe/Downloads/images/saint_bernard...
\n", - "
" - ], - "text/plain": [ - " vector \\\n", - "0 [0.018789755, 0.11621179, -0.09760579, -0.0268... \n", - "1 [0.021960497, 0.06073219, -0.1625527, 0.021481... \n", - "2 [0.0074375155, 0.084355146, -0.027461205, -0.0... \n", - "3 [-0.01220356, 0.020815236, -0.08587208, -0.027... \n", - "4 [-0.010112503, 0.14021927, -0.14588796, -0.046... \n", - "\n", - " image_uri \n", - "0 /Users/changshe/Downloads/images/leonberger_14... \n", - "1 /Users/changshe/Downloads/images/havanese_63.jpg \n", - "2 /Users/changshe/Downloads/images/english_cocke... \n", - "3 /Users/changshe/Downloads/images/shiba_inu_143... \n", - "4 /Users/changshe/Downloads/images/saint_bernard... " - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.head().to_pandas()" - ] - }, - { - "cell_type": "markdown", - "id": "09db8145-04b1-40de-8b9d-09eae5c77d24", - "metadata": { - "id": "09db8145-04b1-40de-8b9d-09eae5c77d24" - }, - "source": [ - "### Querying via text\n", - "\n", - "We also don't need to generate the embeddings when querying either.\n", - "LanceDB does that automatically so you can query directly using text input.\n", - "\n", - "The pydantic model we declared for the table schema also makes it really easy for us to work with the search results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d88c6664-638f-4c9a-be18-434e38168061", - "metadata": { - "id": "d88c6664-638f-4c9a-be18-434e38168061", - "outputId": "320dac93-abc3-4bb4-daa3-f1cd424076fa" - }, - "outputs": [ - { - "data": { - "image/jpeg": "", - "image/png": "", - "text/plain": [ - "" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rs = table.search(\"dog\").limit(3).to_pydantic(Pets)\n", - "rs[0].image" - ] - }, - { - "cell_type": "markdown", - "id": "93bed78e-0c3f-498e-afe1-344979a8ed5f", - "metadata": { - "id": "93bed78e-0c3f-498e-afe1-344979a8ed5f" - }, - "source": [ - "### Querying via 
images\n", - "\n", - "The great thing about CLIP is that it's multi-modal.\n", - "So you can search using not just text but images as well." - ] - }, - { - "cell_type": "markdown", - "id": "5898dbc3-57bd-4899-b825-b9b3d4652531", - "metadata": { - "id": "5898dbc3-57bd-4899-b825-b9b3d4652531" - }, - "source": [ - "Create a query image using PIL" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8dc886e8-c6bc-4db9-a4ad-02fdb1a9b6e2", - "metadata": { - "id": "8dc886e8-c6bc-4db9-a4ad-02fdb1a9b6e2", - "outputId": "e0c86001-3f88-467f-8276-1b35591ab0e5" - }, - "outputs": [ - { - "data": { - "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAFOAfQDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDx83O1Tjr6Vn3E5kbnNIZCRzUbZNZJCuQMcmm81IVphFaAA61agXOCarhDV6BdsYNRJ6DJ0XFKZfLekzUM5y4rO1xvQsNcbhgGqsh3UoGRxQRTSsRcg281PD1xUeOaVeDTYjTgUGr0acdKzoHxWjBIK55FosxwA9fyokiC0qTKtQzzjHBqE2UJkIanhudvQ1jTXmMjNMhuScDJrTkbQrnQvc7lPPNWodFu76ze6t4WkRGCuqjkZ5B9xVHw/aSarqkNov3pG2DNfR2h+HLTS9NjhWJfMCAMcVUYpBa584T6fLBI6SRMrr1BGMVQmhGK+hvEvhWz1EM7wqjEYEijB9q8i17w5Ppc7h0yoYhWA4I7VTi1qiWrbnAXUe1qgxWve2xYFgOlZYjJOBVp6CGhaXy6lETDnFKFxTuBCE5rRtEGBVUJ0q9boe1TJ6AacCDirWABVSHcKsF8iuWTAUgHtUDwb+1WAMirNvFnmpUhoyH0x3+6lV306VDgp14rrNgVOlTaZpT6tqMMCIW3PtOPoT/StVJl8iK/g
jwRc6/qBLxssEbKWJHbOD+leyQeB9Hs7Vke2Qq0jMBj7o4AH5AV0OlaTbaJYLDBGqkgFiO5xUF/McBScCtdtxqKPK/Efg6A3LvZnaD8uz09cV540ctpcNE2QynkHtXuN9Hu3uOoOMH6da4rX/Dq3ZS5iAWUggj+8OtVyqS03Ikmnc42O4IHJptxcbkxkmmXls9rcyRMCu0nGe9VS2M5NZcthXKc6ksSetLBCzc4+lShPMcDFaNtbjaOKTdgRTFrx0qOa2wOB+FbJQBe1U5cBuetSmDKcUCjrUjRqB0xTgQOlDsCKoVilIoqAkBvanXMwVsCqZm561qldAi1xULnmozcYFRNMWNFi7jy3NNJqPJNLzTRDHcU9HKniowangj3uOKYrF+zcsc4rQJBFQW1sVGaklygrKSTY0VbnG2sadxk1fu7kKDk1iPKXYmtIIbEc5NMzSmm1qIcDzQaQCnYoENxRTwOKKLiLBhIqMj2rTmiC9qqtEM1lGdyioUJpY4CTk1aCZqZI+eKUqlgKpiwOlKrbUxV/wCzFh0qpPAVzipjNN2GM38elRsdzZ7U0A4qRRWoNiiilFGKRA3HtSqPmFLinhKQyeNTgEVMspWmQjKUrjArFq4xWucYJNQy3WRxVef7tQde9XGCC4SuXNSQZMgB6ZpgWr1jB5kowMgfeA6getW9gPU/hDpPn6691JHuWBcnI6Hsa90Dben/AOquD+F+m/YvDr3TIA8z/eHdR0rs3kw3XFQjaK0JZo1mQg9K5fWNKiuEMMqbkY9x0rpUkzTJ7cS89hVJ2Y2ro8C8QeFZLKciMExk8+2K4yaxeCViVO3dwcetfQ+t6es4cFc+tcXqXh2G5gdQoDEVq4cyvE5pXi7M8wjt9wximS2Iz0Nbr6c1s5VhjHX8KiaMYBx1rgnUlB6jRiGzI6Z/GrUKFcDFXjCM5pTCA2RUPEX0KGBCMUYIerCrQ0eTmhO4iHJUVLDdBTgmmSDHFU5Mg570JAnY3VuBIABya9R+Gej+WZL6Rc740ZPbqK8l8M2kuo61BbEna5IJ9ODX0f4dsF03RLeAKFKpj9c/1renHqap3L1y+ExnrXO3zln3BuAOp9f8mti7k7+lc1dSsEYYJXjGe5zWkmaJFdtryOpPJOB+X/66rTQCQxqRznGe2KlA4Lkn5AOfXHJP58fhUqMrkDI3EEfnUpg0cp4l0EToxCfvVXCn1HevM7y0e3vJVYHaHOM9/wDPNe83FoZrebON4HB9a838T6YpgMiJ84bBx6Vvbmjfqc01ys5aygDHcelaLBQpxVGLdEuD6Zp8twFjOTXM0UkOlmCqcmse4ugXpl5ehIic1g/bGkl68dquFPqJm6Lmoprk44qgk2QOaVn3Cr5REEkrO5JNN3EU7bzSMtaAMLZOKVeaaaVWxQxonSMmn7MClhYYxUzDIrNsZWZe9XNP+aQCqkny5zUtlL5cu4nvRuhHTJgKOKgu8eST6URXIK9aqXs4CH5uDUJDMG8cs5Uc1TMbgZK8VqWVqbuYtjjNbR0cFOFrXm5dAscfTgpNad/pxhbIH6VUjTtVc10KxEEOelLj2q35ORURjwaXMS9CID2oqXb70UriNi5Udqplc9quN8w5pmyuOMrIogSEk1o2ln5jACmQQFz0rf0q2AYE1jWq2WgxU0lEjBcZPpWXqdkqocCutnAArnNXlVY2zSottlJHGsuCabmleTLE+pqMNzXqIViUGnimKakApMligZqVAKaoqQGpAmi4BpsmcUBsCmM+amwyrN0IqHFSytk1GMVqloSOUHPv6V0fhqwml1APHjcjYIPTHesK2Aa5jH+0K73wrp88Guo4X5dwRxjg+tKWxUUe76BALLQ7WALtCp0+tSTSfPjtT0Pl26rjgKOKqSOC2R0rJs6EixHKc9avxMHUetYysM1ft5GUj0oTHYi1KDcWIHUVyt7AUcNjgrg129yu+PisC8tfMT6GtYTszKUbo4TUNMScygKMsB/9e
uV1PTntZ9pBCg4zj1r0OW3cXQUrw3H4ipdR0JJ5FUqDjn8CM061KFVGHK0eTMCnBGKaefrXaXfhEvOQuVTqfasW88N3Vu42qWBTgD1H/wBavOlgqkdVqg5u5kIuO9SMQo96f9kuFLHym+UjP41CVcgqV5IJH4U4xa3RorFeYja3rjNVo4TcT+X3PA9yela0dj9peyC7mWdtjHHQNgL+tdpo3gSVoovOhViGQspyDvXAP6Fq1UQ5bh8ONAkS6W4dTkhJBkdM8/1NeyyMIo8Dt0qlpukQ6eqpGBhcqD7Z4/SrF22BWyVkaxVtDNvJWNtM68nadvPftWBcyvkRxYa4kYBRjiNemTV/UbloraZIvv42pn++en5VlM8ek6X9ounLS7Rk/wATHpx9T/OobNEhk7i3jMYO9hhWPYe31NQ28nkxQGQ/MWJP5VjzXEs91GspK+VmSTB4T2H6CmT6gGjE6n/VNjr0wcdam5XKdaknmSKCcAjB/Cub1uyzM8fVTzn2rYt3LqXXOcgkHtkZpupWrzDd2IyTXRSlZmFSN0cFfaQDC+Fxxnj27VxF80kW5SDwT+lerSR5RkYdeK5nV9FDAeWvzkHBHr3NbTpp6o5k2tDyu7neVyDwKrqDnNXry1ZLlo1XndwBTZLZvtgt0XLjAwPXvUrYsbFnHA4HWpRmpZUWECBCGIPzt6n0FO8krGCRyT09qTERhaGX2pw6UpHH1pAVZFxUYHNW3QUxYhmi5SHwIauBDjioI0II4rQjTK9Kykx3KMsRI6VXRGBwBzWw0XHSq5gwScURkIrrM6DBqvdTsyEVPKcdhVGY9c1otRs6Dw8gMYz1rq44sp0rh9EuxG+0nFdnbXilBk8VnO9xrYztbsh5O4CuVEW2Q12mpXKPEQPSuPuPkkJFKLBjscVXlFP83ioHfNWkTJDaKQUVViDYxhealRMqKYRkYrQtrfcq8V5spWRYkJC8dK07OYR1AbE8MBxSriI7TXNPUC9c6igTrk+1cVrV880hUcL7VvXR+U4zXMX68muvCb6lGaZKVWqE9aUHFepYRdQ1OnOKpRvmrcR6VDJZOOBTS2KUnioHbg81NgJTJmo2fioCxz1pjMT3qkguPZyzUo5qMZJqeKLceTj607iLdsizuoKMrE4DLzzXufhGxEkENy2BIyjcMcbh3rxrS7PdeIu5lRiCHAzivoLwwh+wwbhGxK/fTo1Q3cuB1YZXiwww2KyZyI5CCMqa1nhwoKkjI6GsXUiYwWNYydjpirkqIrchsVcgYjg1g21w7HMb8dwa2LeTdjPyn8waSY2jUVtye4qOS3XB460sWT2q0iZxmtEQzLj0tGkZmGf4lqU2ahVyOQMVp7dq8YqB+TkdKtaEGNLaRhixH3hg1mtYp9oHy5CkflXQzx7lYAcgg0z7OvmBsDJ6n61am0S4pnPT6FA1sW2AZI59QOlYVz4TS6B2IVfjGB9Sf0rvTD5y+Wg71p2unxwom5QWUdabndWZPs1c5XQ/BdrYwIZFJIRAvsBj+ozXVhEjbIAHNTtgDAqldP5algay0WxokWhICT9Kz7+TMZwfc1Wa92g81XFz57lSe2KzcrmiViK5QGRZG7LvP1rl9ZlknuVVdzbTkKPXr+ddTOQIweBwP0/+vXn/AIk1WSyIgs8G6kGNzDO3PJY/n+Zz2qGXEiu54rSNbbIa5fBcA8A56n/ZH6n2rGkuoYRNbO5VJDsVm7uCG5+oBGfemRIQu1C0k0zCWaU/NhB90fj1x7iszWSwUMVaNfMKxh+pxnc38qSLsesaE8dxbhlI2mNGHOf4f8MVqypvt2TaQwGB/SuI+GmpC4jktGxmJMLntg/4GvSRGPN3eqdauJlJHBX6eXOFA6VSu0LWspUfNsIH41va1bFJHcL05rHQjADV205XVjknE87vtEkiL3hTAUHAPc9v8+1YbWkmnW8zcm9nO3gcoD/XtXsclmklvI6xiWXGI06D61ht4elW6iiSJZruaTc7k
ZUfQeg/zmpkrMSOM0fwpPPcbZ0IWNQ7fzA/lWdqyfZ5Zk3AuTswvRR1x9a9Z1dV0DRJ/JxJdOu0HHLOeOn515bfWE7yRRv98sWc+5NQN6GKmRIufWrN5GsbxovZefzqRrF2uwiKWy+B6U6/iZtQdUBPQD8P/r5oBFIJmnLEc08LzgCpAKlsqwsUWCCavxqFWqiGpA5rKWoWJmOTgCmMny0KCTUwjOM5FSBlzwk8AVm3MToDkc10yxDDMR0rKvUyxOOldFJXJlKxixOyMCDgitu11SQKFJ5rLmiwAwqNSVORVyj0YkzoJLxpByeKpy/NVaKfIwaeWJrLlsO5XkJQ49aauSae65bPWhVqwuKF4opaKQrG1EN0oFdNplmZAOK5m0y1ymPWvQtCiXapYdq86pGw4u42WxCQHjmuYv1ETZHrXeXoUQsK858Q3Yh3DvnFRCHNoXYq3Nwu3qPzrnr2YHOKJb5n4Bqk5LNk9a76NHkE2REUhFPxSYrqEOj61cjPFVUHNWUNSxMlJ4qF25p5PFRmpsISkxTqco5oAVI8kVaRNvUUkBZQVBA3DHIyDV2FPNYoyKNqliM4PHp71nJgSaZNJBdxsrlMHqK918Fai1zZIsm0MOhHevFrWJV2XUeHiDYd8fcP+0O1es+DL7NuqFFOOjDFJFwPTcgxDp0rC1RQ6Fc4I7GtdH/cjjjHWsXVBuGQcEd8ZrOodUDmGN9Y3XmwRbl6kbcg/X0rsNMkjuoVcDaT1WqWnQGTasqKpHTuD9DW9b2aJhkXaetKEXuVOSLUUeBip1GOKRV46U8rW5gNILDjrUTR8kY4Iqyo4+tKwGDTsBTWIZZz16Y/Co0gZhhQTxVzG5jjoTzUyIEHA60WENgt1hUYHNTGm7vekY4pgRyVRuFLIynqRxV9uAT61XZR80jH5QKljRzt5CyoxB+UZxWbbXOycE8Z4NdFdoGjKYySPyrltWg+x5kUYUnOQe/XArGStqaxd9B99fhF5bC5O4+grziaSPUL6a/u1Lg7n8ktw3OApPYdAT9a7DUt8tm7xDcwBJX+8PSvPdazp+mrLEd6XEgVM84A6A/+PH61K1ZS0FvtZv5mPkv5cYO4+WNnmseNx7hewFYs07vMZbiQs+MZLZP0FRTytFaNIXJaaYLk+ig/1P6VmSvjjkMD+daKI7np/wALVBub+dDkBgPpjH+NewxNuTHpwK8K+E14U1y8tGJ8ueIEZ7Mv/wBYfpXslreAxZZuQSD9R1pbMh6kOtQq8Dk9B+tcUx/eEd89K7+6Uz2p29SOvpXAXQWO7kCdM4HvXRSephURdtbgLyRnHQdq07KeG18yY4aZ+relc+jkdP8A9VWFlwMA118ikjl5rMpa7d+dOZ5NiQwj7x7t2A9T/KuW0/TJL28kvpkfywCQ8g49BxXUXFsLiUOyAgfdBrSQR21puYfOOEUDofWspU7FKVzi20iUXiokfljqS3U/X0rKurBZJysPMCjDzN0c9Tz6V2DQSTSbiPkGTLIzdf8AZA7Vi61FHOdjSLBZRDlV+9K3p7DNZuLRSkcgyq0vlwjKZOZMdfXHtVUVuGwmkV5pF2RkbRgY4Pt9KyLjZ5hCjao6AelZtFpke6pFfiowgpG471LQFhZcVZSTPU1l+YFNL9rCjrUuI0a4kHlle9ZtzjmqzX5HeoZL3cK2pq25nNDZh+6NU8VO8hk46CkC1pKV2KKshigg8danVjjpSKlTrGMZqGOxE1NyKlaPFRkUgGk+9FLRSGdDp6BSHPXNdrpd0qw5zjiuIgkCqPpWpaakIlKs2B2rgqK4RR0moakPLI3V5v4kuPPnAB4zWpf6qHLYauZu5/Ol9q2oQs7lNlYCkYc08UYrquIZtpNtSYo207gMVealB4pmMUAUXEyTNNzSE0lIQ6pFqEGpBQItRuEI4BHcHvVqPyJSF8wp6b+cfQ1nBsU4PUtDOltIr61uBdpH5in77J8yuOnOK9R8JT27ohRXRuhjYdPoe
4rxSzupbSYSQSyRN6of85r1LwXq5uyqSCNpO7BdhP8ASoehpTetj1pJ2WId19fSq7J579OKSEu0YwK0LWDgEgAnriotdnReyH2tsqAfLj6VoooHHamImBxUwArRIzuPVeMU7pTPu89u9OJ71QhfpTGPOPelzTcbnAoGSRrtB9aVulOPy1G/TFAhm4hc9qcrF256AZqJuUpS23I56c0hjnOM5Pf9KgmbKYxkDtSscj9SajYl9qAUAQvCGjLO2BnJx371ha+E+wtHtyegUDJ+lbtyXYYjJyo5OQKy2gj2l5Srbly25uMdc/TgVDXQpM4hJJVCvIpEbAjnvj0rkfEsKCxuowS8QKzJ7FT/AIE16FrYE4ZoEby1GDIF7dgP8K831oy7J4WjdQVZdrDnGOM1mlZml7nIXrl4rVVyQAWwPVj/APWqlcOC5CnPAwauSjzN4Xj90qAfjVVovN1GOJO7qo98VuhHc6Vs8NaFbarjEzTiQkjnYPlCj3bIH4t6V6RdX8EWlpPBL8lxJuDA8hST0/AV5Z41uBH9h0mI/LbQea5HYkED9Mmlj1W8k0G1tIQ7SBQka+wxyfwrFxbVx3PY9W1uDT9JjhhbddzrtjXqR0BY+wyOfWuTlhKzgsSSRU2n6bItxHNdF5ZWVZJZH6DH3VA7DOT79TU12UaZtq8Dge5rWk9TKoUZDgYAwPeiN+MVHOWLY9OoFRqSOua9Cm9DimtS8rAkZ7USSAjBJY9gKrq/FSZGOgrQzK00pWPhAD2XNYc0TSTM7fvpuoA+6g966BoVckttAqheQI0RV32RfxAdTUuIJnNaldM0AjVss5y7H09BWPJblFJdeTzljit65eztUzFFukPQkVz9y1zduWWF5F/vN90VzygaxkU3YA5B49qhaQY6066jmwNybFA4HrVF2ZEJKnPQcdaxcGaqSCaT3NVGkOfvGkkZycHj2qIg5qkguTBs96cKhUEVMooESJxUq1CKduqWMsA1IGwKpebg07zh60WAsswqImojLk0gfNFgZNRSDpRTJLa3RROtV57x26N+VV2k461FnNZqCBD2kZhjP60zHfrThzS4qhjcUlPxSEUANBp4FM6GnqaqwC7KQrUgNGM0WFchK0m2ptvtThH7UgK+004CrIjpGiFK4WK5NPjVm6KSB1wKQrg1atnn3gRMVx/dwKGwNPStMXUGCnIx3/xrvfD/AITubeaOaNpE57tuH41z2g3M8bIHCSc4UjGa9a8P3InhVMDfjoylT+tZORtCKOk0+LMCK5IcD1zW1CgC8VXs4Sq/dI+tX1Qfj6iqSLbFC47U8L3HelAK98il6GqENI60h4+lPPb+dN4yPSgBOnHenR84pMcj2p4wBSAVjUEhxwDTi/PXvVWWTjIPShsESsyoMHnApoHQt1OaaF3N83AGCf8ACnucnbngcnFAETSCSUpHztGWP0puGzt3FQcAnvn0HvU6oI1ZUClnOTnufU0xlKDJx7YPJpAQOwSMjIz3Has6YKTvWEu5PDsA2e/APH4npVuUBlK4Jc/MVT+p/pkVl3smdqKuF/jRCCX9if8ACkxlAym5LyCLzACRvdyQufQAc/WuL8XR24gkBlEkuz7gj2Lxz9T+Jrsprq6lG2ODbEvAUgnj0x2rI1LRxcRSF7fDkAFSw59valYadjxUxMgaRmXMgIXHQe+fqf0q34WsY7rxRBgb44w0jZ9FH+fzre8Q6Bf29uwW3ZVAOGH3R/jVPw1ZvpWk3txOpS5nXy0J6gEgf/Xqm9CkyPWYGuNYaSVGJvHKof8ApkpAbHuSNo+jV01pp1vpEcJlJmuJFN3dkDhEU4jiH1Y7m9doHSuf1WRx4nsJtuLdItkXPpzn8zXQpeLJEMgEOyrg98dP15qH0Qzo7a/k+ymSU5d/mwT396qSXBJUZBZuMis5bnzZTHCdyJwzfzq0uwzKqndKOuBwtaU9DKepNeBY4hgc+tUFbn0rUu4h9n3Nu9yeKyBkPggiuuEjm
ki0hqVWqBD0qUfWtkzJomzmmNCjjkLj35oB96d1HNXcixXaG3A3MEfHbbmqNyYnyuFXHYVqPGHGMj8qpy2ZIwCqigRzV5EpyY7bcB1Y8CuZvoSzsUXLeg6Cu2vNILDc8gA9C3+RWBqFioXAYhPrkms5oqLscXNE6k5OfXFQhM9RWzcW4yQFcL3JFVHjjXgI5PqeK5WmjZSKYQCngU847cUg9qQ7jcU1qkHSmkVLGRc0YqTaKMU0MiI5p6HFKQKQcGmIsA8UVGG460UgK5JNOBp/kNtz2pjAjrRdMdhwapF5qtuwakR6TQywBmkK0qNTm6UICswpAcVIy54FKluxPNO4gU5NToOKI7fB6VMI/ak5EkZUdaULUhX5aFFK40PSMYzUciVYDDGCKa+KlsspeXuOMjPvVm2hKuBtBb0I60gj3twM+wPNaWnWsvmAGJxnke1ZyYjoPD5SWZYpLSI/mCa9m8N2sZgQRqyBRwDnA/A1xHhHTyZUd0JcHgkgDFetabEUiA+UAdgelEYmsdEXo4yoAx+NT8gdPrSKFIpwGMYJrUBFx3NO64pCQe+D703cRigY8jgimYpwOaRhjpQA3OKa0mB+PFBOelQSNhh/tc0rhYazEsdvr/8AWpqr5l0y87I/vH/P+eacn+rzjln+UUx28tNiNglizN15P+elIY938vOBuYncfduw/DinxrjGeXHLc5pu2NQoQFUUcknk/j/n0p5kO4BcKB07/p0/OmIcGLEAkKTnAzk/596ayHaWUjJJyWGTUFxciAZBUcj5mySfwA5qsZ3eMs4IXB4c+WD9AaLhYSeZYsgZK55xkhfy/mTWXPM88jBEZVA5JwGP8/51ZOnQyOJSIiAMrwGIzyead+7swRHAXJzhSm0E49e9TqMzFjuC+YppAoBJywcZ7Y5z+OKrXaPJGqNfEOeMSEYz6YI9fer/ANr1FkIlYxnOcBQB/wDX/Gs+9LzgRRTqBnkgZx9duOaYjmNcjvlhdksJLjaDsEUwIPvtHJz7YrmdN0+S4bdextbRrIGdGGF45xzz17V3jW0kYLtNJOcdWfOP0JFRCVwx6srDoG+ahoaZ5hr8gjntWO559zqSV27FLEgY9elWrW9jt7cSYiaQH5TNkqCevHeumnjsnaVVsnDyHcXZec+pOc/hVNfCkc6uzwBhnOUPl/jgZA/KpKuU7a5klJAZyCckqPmc9yT0FasWoRWiBcqp/uqcms/7DDFI0XmNKiNtYDKlT6Eetbll4c0y8gLQ3TxTnkLMcjP4VcWiJFSa+luEIUbV9zkmq6Ag8irE0AtHZCQ3bKnOf61XDAnjP4muiLMWi0nrUy8VWjPHrU6VqmZNEwI9aeCO1RgZ9KcAfxrRMhocT7c1DMjk4+UHuT2/CplYr0OPcUYz3FUmS0Y1zZeaCXYSHtu4UVjT6ZEMtJMNx/2s4/KuukhDDJxge1Up38kfuYRI3qSBQ0mScVc2K8lCxx3Gf5VjT2lxlgsTKO7yda7e+N6VBMiwBhkAAZxXL6gJBJgzu2eoUZJ+p7VjOKLi2c/IgjbbsJPq3+FQls5HGPars1tPISFjbB6ACqzWxiJ3kA+gNc7NUMFLigCnhakobtpCtTbaay8U0K5ARTCMVK3WozVAJmim0UgNtbZtnTj6VQuoCh6V1z2oSPpzWBqSAcDrmuSnO7NnE55hlsCpI1NSCP5zUojx2rpcjNiLxSjLcCnbKkiTkVPMA6KHNXo7celPt4VbBNX44QBWcpFpFLyAKiZAK03VcVTmXnipTE4lGQ0sYyKc6mrVrbFgAOpqm7ISREkRYgKCT9KsDTZHHQ/lW/p2locZBz7DNdDBo8W0Fi3I4ITNYudyuU4JNKKOGYNj1HGK67Q00qOeL7ZqUYPaMQliD7nFbcOh2zECYEn+8QBj6AV1uiaLYREPHCu/Od2Aa0gmNIvaO1tIq/Z4OD/EyYzXXW0QCKSo/Cqlpbqq4C4HrWnHwvGcVqkWKQ3bIoG7vTt9A
5PGPypgMYZ5700Hn3qdkyvr9BVZsq+O9JgSDjmnE/LTccVFI+MAdfSgYE/MoOADUE+fKHHJGB+PFStyobHSmygM6ADuKQyOZtsQOSAATxxioC0ccKySHbwAM9u/A9elWnUZUHoT+gqhPKQWBGSjA5x93uT+lJiLBl2qZXXa+eFZslfr2H0qo2okuyxSY2nBYAE//r/Cs+41D7TIIB855GzP3j1/Id/wFEcrovl4DSg5ZVTdtH5gDI96Vx2Jplt+PMlkmlbIBeUnb+AGR+NMtp4hIVW0tsA/NvGGPv8Ad6Ux5JpF4mZih+VCgGMH0H16mnFnMSO4UMDwZcrg/lx7ZOKANL7WgU4IxnOA2049vlHFRS3coJzEuWHAMoAAHrVdbq7Q8swjOcNE2WH9P0q4lxmI5Jlx1JTn64H+FO4iqqSyH98WcdkjmDdfoP1qlcTRw7mKtBGnQMoAB6fe5zV1olkb93aBx0ZREOnseD/hVS4ssqWkRA6rgAvsx/hTEY82pSHYqTS8HOV4P4MwJFZUzDa8rQySENyRLn/0Ecn8K0ZbeJLgqzwSA+k2D9M9vrVXyUVZBlUznhZFc/Q8H+YoAqCSKZiylY3BxsIyT7EnvVkvd3ZRLWVFxhTOwyufoOB+tVJtPdfnhcKOMkqwB+hA4/Gr2lrsnDSzZYdGAOPzBOahlFmbT9SuLVYr+C2eM8GWLBPB6kccZrHm8MmKYyJuKk4aNBjHoRn+ma7mKC2SHc14hfG4JL8xH0xzWfdluDG8ZjVc4UHePXAz0/WqTEzgrqK7sp2gleKUKBlcgkA9MjHFV2IYfc2t7dK6KZEundpFjkIypkxlfxzyKz5NKbHyHGOxHFbRZlJFCMlT71ZRsjFQlWjO1l7/AIVNGoI64rZMzZMp96dx70BMcEU4Rn1wPWtEzNoTn/8AXS5I7D8qftQg4Y59T0FJtXHDMfwqkyWhhBbrk1E8QOcIMDk1Y2jqM0qqSpGQCfXsO9VcmxiXUMYViU5/n6Vh3V7YwuU8rzHHBCnp+ldjMqupVV/Huf8ACsW50xufKhRR396Gm9haI424vZZmZY7dY4/Vev1NZUqg/wAK49Oma6a8012BV8f7q8D/AOvWJd2iwkgEe5Nc04vqaxaM0g9lA+lIBg81N5QXnBPPFMIwxz171nYoTpTHOaeTUL0gInNQk09zUZB71RSAn/OKKMUUwPQrosiHcCK5q/YFjmu5vYFktyGAzjiuA1JHS4ZT90GuCktTebKiLk5qcKMUxMcGnluK0ZgxjAZp8WAajJpvm7TQkwRqwyBatLPngViLc5OM1oW0qjk8mhxNUzQHvUUiginhwaY7ACpQyBYt7gVu6dadOKzbJA75966iwiCqKipIcUaun22zpxkYrZjgz0J/Os2CTauRjA6k8Yq5BqVpGwjZhI5PGM8VhFOTLehoWulyzXIORGhPJ612WnWfkrgIqr+tZOkI8uHKhQT26GuqtkAUDvXbCNkQWIIOOTz2qzhsdeKVR04Ap2eOgrUCAjjnNSxjAyP1FGNzDgYqRiEGT+lIZBIwwf17VUEgaTGf/rVX1bWrOwgZ5bmCLH8Ur4ArK0fX7XVbgJFPDJIQWBibKsB6f4VDkrlJOx0ofgZ61E3ztkdR2p5G5RnrTYkO8k+nNMQjfLsBHXORSlfnDewpZF5Rs/w/lQ3C+3IzQMjcjaehG7H51zPiC9EJeLDYkTG8DonRj9Rkcd635GwGHqeorjvF1yEt3jD/ACudoHUE4xn+X60nqBTsNSELOIod88r7OucsB0J/z+ZrrLKIEr5rRljyQTk574X/ADgYryQeL4NFUyFovM/eAmVC2wsc7sDk8buKs2vxJ0jUiJZ9T1C1WTKtA8O1ByVV1dWBY/dJVuPvYxxh2sK9z1yWaBRjcWUdWx936nFQTE8MHDKRgjcRge3b+VcLY+IbCW8NvBPZ36MV+zTwzbc/KM7i3KNu4+YEcj5hnm7Y69az3ZtgZoZwCXgmzHIvt
nPOD6fqKLisdPHPAgVmV0PqeVJ+ozj9KWW7iJC/Lu4Af19ielY8sqbxLbTOMnJK4Bx3xjg/Q1NEzMdzYB9Q2M/1/CgDVkSMREFS3JyiNnJ68d6z7iGLOBZS7QSd4G45/Pj9aiaXohnIDHB3J938ulXYFUruhdS2ckklmP0UVIzPksDcID5tyq46EA8fTOPX3qqIAsuxkdznG3BjPtzzkfSumLxiLDykjox8ogH2HvVJjbZ5tg3ORuiPGfbr+IpgZyafDIm6JQ/ZoWDH8M44/KqtxZRWd0sih41bor4+X/gXcfWt+Ca3kPmbTGem4jb/AD61Hq0EF9Bs2xPgbirrjI7EN2xQ1dAiNJbme2H2WZcjrgc/Uc4qrKzQZaRMhj82Bkg+oA5H4VQgvX0648l/NTOVAkQfd7YP8Q/I1pXcZaEXKIsiIAzgnblT6HqP/wBf0qbjMO+uI7ib92yq6nmQIFZcdsDH65qtGj7WIcOPTPX8K055LWX94Ip2K/KylQxX/EflWbcz20QBGCM4YEHePcZ/lWsWRJGNdRBJCyHHPKnqKbG2ewzS310kzYViQO7DG4duaiiz34NbRZk0XUJ9KeeR1+lRJgjk8e1SYA5Xge9aJmbQDAOM0ZPTGaQk+n0o68dDVpktDsqOvFOwgRd27LDOFGelMBA6dac6MgzMGUt0B4PsSOwp3FYjkkzwgIHpUDLuHNTMB0HXvUbIe36CtEzNox7+2LK2SqjtxzXIagTFJjZuPrjNegvDPKGWOJnOORtzWFqEXlFlFqqSY5DDmoqK4Rdji3ZurDBP6VUPWtDUEbecsyH0x1rL3EMcmuVm61HmomFS5yKY3SkBXZQDUTVK5qFjQNCjFFR7vbNFMZ6y2XGTXJ6/Gol3DvW82pI8Xy9cVzGr3PmPtHXqa4oJ3KctDKV+aeXzUAOD1pd9b2JHFveomOaaz0m7NCQCr9+r9uTnrVFRzmrkDgU2UjVjbio5ZgOB1qATfLhfzpvasrajbNfTj8wrrbLAjzXHaacsBXaWKZjA7msKq1NIskmZpAIw+M9FUcn8K2NHtI7cLNcRgeiZyTWfcXEelJ5iZa4YdB2/wrPtL6VrmGa/lUIE+SJDkj61VOFtwbPXNJWSRQSVAzwo7V09ugVR61yXhicXNvGQy7ccBTkAV2US4XIzj1rqQEgzjilNAI+tJ1zTAcBk+lNnjV4yOaASDUh5XrSA8a+NFg0dhYSKrSWxctNDjIbYytgf8B3flXKfDa+u7bxLaRxAPZPMzKYkG0sw2545wVUcdseua908QaNa6zY/Z7mNiVO5GThkb1B9a5LRvAGm6Xqsc9vF83mb2yxPI6frzWctrI0Xmd4o3ED9alcbFAFLEnlR5fr3pspJCgDk9/Qdz+Rq7EXEYjeqfnn0pk65Vh19B6e9KwXeZW7dDUUshDsFwW6kt2/+vQMozShC7Hqc9PwryT4haqbSdipcktkgdMnp7fj/AI16L4k1CDTdMnZpSZQpYYGQCBySc8n279PWvH9TifXWnlm3MygsqKAuBgnA465xn8aUdwex5rqt+9zIQT1Jzz71eN3Pf6fZQpd/u4IRAYWb7oDFs89jnP51oWXg+5urgRGMu5wcIOWHfA+uOK7nSvhZaFwzTxpGrou6SPerbsfMAemeQD69a1lJIhI87exE6JHBl7iQgKYxkL6np6dh1r07w98Kr+70yK41PU7y3njbMUEmfMjAHy5Ycr9BnA44Neg6H4F0rQ/NlaFZ5pNu9pfTH93gDkdveuhuSkUZG4xqozgLgYrK7LdjjYtMv9LVhcTyXQAyHBwSPUtjn8easiLzIPkuVgkYkiNnO4f4frU19cW084lLJKy52nzJMg9sc4/zxTdHtUn3u0GSflXqmPbgjj60rBcgtoriO4CXRcgHg4XaPfmujitSYySAzN9wEAb/AMf/AK1RvaJjEaopJ4y+Rn2BJ/pV+xAgjHmYXsCAOfaiwXI47FThsLu67k6/TjrU6RLGh
QY2D/lmRkUye6iZvkk3N04bJ/DFLDHcyqC7oxXkFGKk/hnp7UwI3SBziJtr578fmeh/EUqW4ZWVlAZcnpgqfw/pSyt82GEoboGDHI/Ajn6c0i3Eiphgsig/eT+HHt/TNAjnNZsHVfNiXKjBOwgmMepH8S57jPuKq2GpsIP9XjGchT8rDuO4/pXTtLCwZCQf76np06j0PNc1qNiLK5eeHCKeoAwD36dD+OD9etK3UdyrdxRyI6pcMkh+aJz0x3U4647fiKwL+S4EhjnKvIo++o4YeoP9a23EU4KqRk/Mqnj34NYWorIchstjt3H+fSriSzLlkVm+7gnqQRzUkOe1VmBDDnIPpVmHpWiZDLaEfjTwrA8Z/CmoTjP6ipQSRx371omZtDeR17+lLj3AHvS59xRjtn86pMkTOOQefUdaYWx3O40pPYYpnXrx9apEsCD6c005FKeKTIq0yWhjklT8wH1rNvSQmQob3WtMsD2qCWEEEr+VXa6M2cTqS+cGBiw2e54Nc7JHHGxBZmPfbzXfahAefl2r7CuQ1GyVCWjckVzThY1hIzDICMKuBUbvUpiJ6jB9arSAgkVkaEMj81ESTT3GaQLTGNC0U/FFMLmyLuQLtDVVlcnOck1GZKazZFYpCsRsxBppc4pSpJpChqgG9TUig/hSIvOcVKvFDGhyqKlQDPFQ5xT0f5qQFtBUh6VDG1THkVm1qBo6TjzO554Aru7WX7FbgkL57jgddg/xritIkWzIujgtG/7tW/ib1+la0d8XXczkseST3NLluyk7F3ULnbGxzlm43HnFYVuEubwhzkFsiR2Izj271JfX6rG2RuwOB6modEinvL3eqJngb3G7HpgdPwpqFkO92e/eERFHYoE/eDjDquF6dBXZhiwGfy9K5DwtaiytIvMkd5WGSztkjP8AKuujIKj0qo7GhIOlJk0uRimknoKoBwPrT8en51EvX2qVeRQBBInWmxRAEseBUr8nGaTLDvxSC4SnJK+vpUTEAg4BGOcUjMMk5wR6VCWZ8AKAo496YDieBtAOOR6CqjnaxOd7n+6hcn3wP8alllWMrGWcseiqmAfaqd0+2EEu6+vOc/h3qRmZe+U58i+aHBBx58eRx3HG0Y9vWuIudPERLZgMMjEfupAwGe/HbnoTXXyTKCApt4Tvw7SRBlduD13rtIz3HpWPcKbhsI4JGflidWBB56HnHf8A+vTQjn7e1WK/hdiwZSoLE/Mvb8cHB969E09ImgVd4BA2gk7gCOcdPu5OP+BCuFcmOTaPlcYLIw5YDgfjWxpOpcFZDgY3FgRuToN2c9AdufY5NEgR1clw3GfMjOD+7ViCpzjIPcEge4P65t3duwkIlMqgDIkUMPY5AyD9MVObizaELOrRBmJUkEqrnrjuvPUHj8gaw9QuFgPzbsA/KHb7v0YcH17fnSQEMss11IU2SJk8hGZg35nj8/WtXR4EQ+W0jYPBUtkE/XP6Vyb3KOcKSUb7xDBz7Y/XvW/pETywbklmMYycHcq+pzwcUMEdYVYpiA5z13AAEe2CD7UIoi3l7dcEdW3SFvbHIx71XhgQoqtsdW5V4QSv6HmpnhRUC/I46YZiP/r0hkUzxwt5qQMZP7uVDH2AyM/n/hVmC+guiwCsrjtINpH+fpUYiVIg7KoXsSd361VuL4wXKp5kTJIvAZQCD2w33W+nWlew9zVIAkWMvyei7+ufY/0ondI/mVNrEDcVX5lx6juPeqSR/axtuYw3OVXj8wajdElG9ZGdOjMrE7SOO/Q9j69DzincQlxEtw2SZIZ1/jiOQQe+On4Vh3eougNvdRhjnBIHBGeD7HJ/DOK2GgY253EnykIDqnzKBzj/AGl9uvpWPq0DSR4bYZmUNFMGJVycYyR1BGQe/SgDEvLaS0lDQkHPzr3Vge4/z61WuGW52zgEZHOOCD61oxTGeMBWY7m3KJBl4pAOV9z16cNjPXNRMkaXKuAEhmOSAd2w98evr7j8aqImc9LF9
4kAgdfb8P61FEu1sZ5HUHitu6s7ecgKgjuMHaCMrIB159en1yOmDWMXZGCScqBhT1x/n/IqriLiBvwqQKzHA59qhjYnoKsKxI6A/WtEzNjCGBORz0o8qQgsq9D1I6GrDuibfMTIIyCGyR9fWmtMrMQ0W5OyhitUmS0QGP8AiJCnuCelRsDn5Tke9WfkcKqM55wAy/MPYEfeFVSACQGB57VaJaEJYUw5PQ08lge1NJJ6gfhVpkMjyQaXPeg8cYpK0TIaI5rZJl5Az61zep6Tty0TEk9RjNdRmmPGj9VFU4qRF7HmV1YlWOHyfQVlyoy8MDXpWpaJFcqWjXa3sa5S80SaMncnAPrXLUpNbGsahzJXmjZitCW1eM42BB6mqzJj3rGzRpcr7aKkI56UUxhjNOCUBCB0qRFJPSouNiCOl8qpxGcdKcF55rOUySJLfI4FRywtGMlTityyszKRtHNacujFoTuAPFZ+11LSOJZu1Ir4Iq1qNi1rMeOKpYzXQmmhl+Js1aQ9O5qhC2BU/mkc5qLAXnuB5oAPyqMCni6ZAef1rJeTmm/aHxgHP1qkiWajXHmEfKCxPAPH4mum8PXUdrOiqjSzEjleC3so7fXrXCea+7JP+FdToJaCVHVna7mXKkfwJ/e9ie1E9So6HvfhpnKqZWVpjjKofki9vc+pruIuVHeuC8Fx/wChoqqFTIwB1Y9znvXexkBfSpjsajicf/Woz7UhPNIcAdaoB68nnFSEgDOKhRsUPIADyMDuaEAjvjqRn3qKSQDheeagaXcSykntxUEjlt24soA5I4zz/M0ASeYzsyBsMTyR2+lMEuwqpZVXB+8/J9BVSW7Ee0NPEgPRUPUH8evXtTkRogy+YxVlOOQuPXnFIB88oji3kqeOSTgfXjPb86ypbh5FxsDNk5LdPYc/zxTJlTLmJiDgnadzBgc9vTPGOlYtxKylQGji24IJ4IPQg8nIx2PvTSuJsJpSrFVbY7YdQYlxjvj5iD6YB/LFZhkjmuD5kUfHOFOR69/mH1/Xmlubpm/dyojNkNnblX/D39eoqrGxaRgcbBjHcqB71Vibl1kjdA2FIByUkyxweRyOxGDnA/GmRwmKVXi8xfm3KA27HuD36nj3qBbiWJsgkJubCn5goPJx7HOcD609XkBLKrAexzj/ADx9aLBc2YdQmijH+kfKvy4C5R/YqOB+lULzU1mLkDdnAIC9frx+oxVU3BYkg4yMNgfKw9PY0qF2QFcYA6E5BHse1LlKuV0VZX3DIPuOv410ujkQup37SvBKt2Pr7e9YsR2MxGVZfvBkByPcHg/WtiwlgZwfLWNlHZiv/fJ549jUsaOrWJGQyMWIPUZLZ/r/ADpYmiZg0PloQCcMF5+jf59wKz4LmIL9nYFYyOc8lPqPTOOn1rVi8u5RgysJP40Dcg/3gw6/X86kok+d1yo2SDqVHDfh/SsnUI1KOJY0ZiM7w23jpn6A4rUCso2gncOhUYz+Hr7VDON42DbgnjIyAT29s9KT1QLQp2V2xj8i9QRMOVcHjr+g9+nrilvoyJGuElEV0vyybu/pu9R7/wA6ZdW+6FViLCeMbogD98jqv1xxg8HPoarQ6gHZYidpICxsQCBnkBu4HHBP0PrR5AS22qxzxEkmC4iyZYwc/d+82B6dcd1ORVfUo1l093g2r5bGXKcqCeSOP4XHzKexFVbyKVLiG5twyTq24bTjdjggepAOQfcipLa8+yR7Ui3I4JMSnb1OSE7DJycdj7E4YjnHmxdNgHMgyefvnqM4755BHQ81eQreweUCRI3PzDgt2b0GeQai1W0jikMlt80asWU7cADOcdeCDwVP16c0QEBDhAcNyM8+vB/p3poTKk5aNisiO6KQ0kBA3qSMB0PqMdOjDg461gTlPMwCvXcrjo49cf59DXW3K+ZEQJGG9OGVuRnkHHY/4dawLmG4jmb97lZCT7Nn+Rp9REEJ6dqtxoZGAUjdjgHvjtVOPryMGrKEjPAPF
aIhkkjSDLKWweCBxz9Kr48zlCSc8+1aTNG0IJdWOFwcche4YdyD07/UEVXncSxje2JEUFtuCGUng+/arTJaK5O0BSwJznI6CkcMzFiSxPJz1Pv700kjoQR7ije4XG7K9celUiRpbHVePWmkKeaU89Dj6VGUI5yTVJk2FPy+9MJB7YNNLkNjHFKWq0yWh3bIpAwpoOKQ+taJmbRJu9ar3VolyhUHaT3qTJoBI6Voncho5DU/DtzuLRtuHuKwJtOeDiSNzj0HWvUeH4as++037Qp2Yz71jOgnqhqbR5oYGz/q8exorp7jw5eGUkFT/wABorP2LNPaI5ww1JHDlulWmQYp0IAkH1ry1NtHTY1tP0cTxjcuc+1GoeHnjTfGOR04rr9Cs1khUgZrcl0oSxEFe1Y82pXImed6BGvO8YYHGDW7IAR0qlfWTafdkpxzSrfBk5607X1QJWOe8R2yhSwHauQxg11fiO9Qr5YbJIrlq66SfKJjkbFKZOKZimtWlhCPJk0qmosc04NVWEW4dhbc+Si8kevtXY+FIpr67Ms2IYCeWH3mx2H4cVx1oN7j5cgHoBkmu+8NhxtZI2J6FtvyxD69MmolsNHuHh0osKLEoGBgnPCj0FdSpIArjvDZaKCN3JLPyqdTj1PvXYRj5RupI0H89aYTzyalOMYqMqBk4oZSFXkHaOO5rNvtVtoXEPnJ5h5C5BJrl/iF46Hhy1XTNP2vqs6hzuPywxnPzN7nBAHsT0HPid14j1C6eQ3EjvvzvION2fb+tVyuwr6n0Z/aNqI/MSVZMlQoU5JzjAA/GodQ1BRFIsSiQq20KsmASeMZCsPUfX3r54l8ZakX3vMXbGCHwwPuQev0/GpIPHMgXypkCx7crHHHhdwA4wG9d3zAqcBeTS5ZBdHu/mhpDtuGBBDuIVUgZxjKnv05z37dppLxZQqW2HjHLSM2xV+p68/QfrXm2i+K/PthLHclJVJC750TzQVx5h5w/wCGCpGDuHNTrrs0ewrLK/UOCR97uQQWA/zihJhdHWXl1LJvhmaPY5L5845+vA445rHeScMQZd6n+Lucd+mP0qg+oSyoWkkBJ7Iox+eP16+9NSRmYM7Nyf4m61qombkSOSzHcST6g8GoyY1yWXcB/GOfw4qQMCnGcD2qNlOcjP1B6/nTEORlibYTww4P96oXYFwyg7h1AOCPwqdXUp5ZZcD1/wA8UxzuHZhjkHt+NSMfFvEocAjI2kdh749DV6LAJZkKyD5uO/rWaGWP5cZU9AQaPtI6AuD15OcfjSuOxs29xCp2kAehB5H07fhinu8echUDEcDhcj2PSsI3yk4fIJ7g5zU0FwzrsaToOKljRqR6k8D4fLbeoJIz/n611WiX0EuPLm3KwH7s9Yz04PpXn08/3lcgEDhhTdM1B7O/iuInIZT8wB6j/wDXWT3ND16Qbhx94dD/ACqCV0cRy4IBO1sjO3P94fWksLgXlkjhgG2DJA7HoR+P8iKjuHEaTs+R5f8ArB/s46/T/wCvTYht0VjGXGVBGSeqehz/AJyCfSsW/ijMhmjbIDFSe6H0PqMjr7DuOdhWE8bwyZEkYKNzw6H+v+R1rAvVe1bczN5Un7qZgeo/hb6gihgOkkcWw+YrsPysOdv/ANb/ABAqveFPskcpQ7PvSKOQVyMlfRkYg+vI9aJJ/wBwpAUkhkkQHgOvBGPQg/r7UsbpsdVUGFyCVPOGIwev94DH1WgRnyne5DsDMhB8wj76kcFiOSNvQ88DHpSQRBwTjB+6yemDg/iMj86bOgUvb8CSIfJxwVJzx7Z5x749KktpFZm3KB5kavhhnDKP8Mj6cdhTQEFzDJ/ZbKjKGjbMbk8Z64+uR+p9awZCC7sEjRWbJCHgH6A8CuoMvlx+VO6mKRXiIIyHyAyNkcn+IHjP4qK527tY0lEkHyxNx5cjgtG3pu6NzyD3/Cn1AiUDOSKlUjaOv5VENw57+1SKzDnK/ngirRDJkRmP7vczegXNR
yK+4b0KkHqwxg/WkywHU4PoaR2cfMHY/U1ZBFzyAAD6UmeehFTM4mGW++B19RUJOON36VSYmhpbH0pN64NBJIwQKi3fNgj2p3FYe4DqeahU8YNSAkcHrTMjPUVVyQHXg/hTueho4pTxnNWmS0JjNN5FLz7YpD7ZrRMhoXPPH60B/emnNNY4HSqTJaJvMFFV9/0oquYVjgSTimbiD1IIpGkzTN1eDGNjuPTfBt4s1ugbGa9AigRo88GvIPCN15TBc/xV6nZXytCBntWL0djRHNeKLJcMwHNeXatdS2c2BnmvXfEDK0ZOe1eReJyvmDHZq1oq7FI5+WV55C7kk/yptITilrsMxMU08inGmmgBhFNxUnFIRQFi1YvMjgRFtxzgL6V3HhoXdzcw2au7biHlycJweB74rh7cjgl9ijg46mvSfCIEMAYL5CM4XcRl3PU4/D8OaUtgW57FoMSxoNuSxGNx7CupiPA+lcto0qmJFVcKDhsHO0DsT69K6WA8AnioRoWeSAc0zliozwzAZ+ppWyxweB3zURlYsGXGAQQT3x7elMZ8zzeJbbV/EepXuriYi5umfZGFY7VbasYzxwqqOe2T16suL3w59kuCok83d+4dZPNRsEFsZAI69TzxwBk1W+IPhebRvFmrfYIy9rHd/KM4ZBIBIFI7r85APfHOK5SVysjxlSuVBweox2NaPViJLi5jYkIce55qnuJPDZNVZS+R2GOPepI3BRQAxf8Ai4zn0q1oQ9S5G8kYCxu43fNhTjnpn616RpErSWcZbfKxXJZj6/gP0zXn2nQyT3qR7cbgUw4xyQcf4j3Fehac8NvbouBGoXtwefoPrzVEs17d3JAJ4PHJ61Y81I9u9SewO3I/z+FV48oPlY8jJ75FKjrNkRY3A9Tg/rSAuK7P/qywbOBnmnvIq4EhyfTH9KrCEMq+cDn1V8c1YSVd6x+Y2exddw/Ejn9KllIUXUiD5WwBzjavH5inCWSTkiN89CYkOPx203yGJ2wMrsOSAdzD8Dg/pQv3sODgdcDHNSMa5xw0SYz1U7T+nH6VzmuavDpsZZHkDn7qYBz9CP8AAVo61qf2SEpBGs10R8kAcK7H/ZBPzfQZPsa8u1J4r+5dp7h7e5DYaG7i27T0IDL0/FRQlcdzorLxRd3M4+RChOCprqbTUPMQbsr3B9D61wOlafc2rRvOhKs2VcMGRv8AdYHBrtLVRtBHQDJFQ9xo03cjBySp+8OuPXHpzziqiSMlxjPHQ4pHkaNcHlcd6pLcE3oUd6VhnrnhrUwtmisT8qluO+MAg+xBz9RW9dFXlIyQGQnIGTkenrx271514cvDFKqOcAEcEexH+Fdpb3ys1plhkJzzyxTI/wDQTUsaGu0kJt5QACSI3AHBHbnt06/T602+Hnxbx8xmUEHHUjncR69PqKtajH52ny+SSGX50PcN1GPxFZ8E/m2ahsDGRgfwtwQf1x7celLYZhykB0IH3u3XaQCNv5dPXipreUorCTlCCpweD0I5HTnGPQ1Lc2p+1OcYEiiRP98c8fU8f8CpqxhYXUH5TtP4EZ/9loQitMiyo7rky259OXXtx3PUfl0pkeTGXVvnjfcpznjrn35H+c1K67bwsByflcD165H4Yb6g0tmJJbWVcDfGS7EdDjOSD9O1WhEcvlSSoknlxRSlopFY/KM5CsD2Kk5HTisWWSYRIkyAvsG7cnBHoeBxwf16Hmr+pc27MRmKUAfXv+fbNZ7Ss4AYtKEGAHz93OSOvHrweD0otqIrjAb5QQp7E9Pxp6lidoP4E0jqFfIztPqOlN3AkgjGe9WhMk4z0wRwaOgwckUHBwSeR3oUgMVGCOv1q0QyEjJ+UkHrjFJvI+9+YpzBW68MDwfeo2Voxu/hPpyPwoEDkEDn8ajI96Dz7Z70zfg4I6egp3CxKRkYzUfft9RS78nb3FDDJzj8qokAM96VXIOGxQOee9B5HSqTJaFP/wCqkyfofejORx1FNJ4watMlo
Qv+FRMw+v0psjEE4FRF89qfMKw8gk8MKKaXb0FFO4rHnXm+9KH5rOExNSJKc8mvO5TqOl0i7NvcA54Nd3p2s4UDd+teWQz7SOa3tO1EZAJOcVzzh1GmdjrOrK1ux3dB615hq9w81wWOcZ4rorq4M30rA1SMNCW7irpKxLdzJ3ZpytUG+nK3NdNgJjTTSg5FNPNIBucd6M0hppPP/wBemBfsWLTAKilgRgt0HvXoGjyt58EUTkO3CdyF/vfXqcmvP7DO9XZ0jjVv48kD3wBya9R8J2llFOlzvuHkc7wLhFGQORnB6Dr0H0qZDR6rosYt7WBAu1No2r3wOSxP1rqIOEHc+tczpG642ysSfO+cn1XoAPb/AOtXRxScZ5xUo0JHJc7B/Fycf3R/jTHYAg4Gc9v8adHkqSR1x1NNmxt55J4Pv7CgZ5x8VPDl84HiXSIllntrdobyHaW3Q8lZABj5kyffBB7V88TM00hkDb3UbjznOOv6V9kpNiLbg7kIzt6n0x/KvL/GHhXSpWuZbazt1u9pYyRxgNEMkbiAeeQ5JP8AdPYVakTY+fipDELkjsDWhaRXWoSwWdpDl1GD5aDJyehPcknABPJIHcCte28NTxeJbeyktzLGT5g2sAuCSMEn/aUjnGSMcV2tpHY2ieSbKwRp12AxqEViSHCv6L8pX2DHOSAapysJI562s7S0QQFoHc7ykrnkRnHyOeRnjIK4/LeBpWpcRkTKTJuAXKg7sjIH19u/YmtO/wBMeWW5+W6ja2Pm25ThwpOWQf7Sn5wMgjD4xnNUFQmErB5RkAwzRj5HH3vlB6Hvt4HXbtOVRxkS0XYYyLgCKUeVjlBxz7GraSL8xABx1PGT9f8AGqiMbhlLgb25Pq5HQH39D36emLCYL+Yisrjqc4YdqGwSJQ32hHSUMFz19R/SrCny1AiVicYGMf41UyspfaWUjg/4j29fSrEaARjcSQORk9fepuVYmUkkbzjPTdjFZ+ua9BpNtvmAlc/dQt8x9wR8w/OqGveK49MJhhMctwMh1Y8A+/8An864q5urPX2aSWVopWPIcgqv9APf5fq3SmlcRckvLLxFDJNLbJLPHxJBJL5b4zgMj4weSAQ2OSCB8xwtpawaiXt/Pmcx4UQX6iO4iOOFV2+SQf7LY46betYsVpeafqkEaxbySy4b7sikYZSfcZB4zz0rsbHT0tY0gIeeJB+63t8yqeRg9vccjI6U5abAhIrB7GRkXKFj82UIDY7Mp7+x5HbI5OvbKpxwsZI9flP09Px4+lNt2ZVEUw8614G3OCnpgn7p64HTr2qYwtBz9+F87HAwGA9uxHcdvoQazLIriM/cIIIHeqZtwCrjIYH9KuyScbGJKKeD/EufT/D+RpkREhKk52n8xQI2LI+XMJkPcEj+db/2xokjkTrG+4D+YrnLA7SyN0xjB74rRScNE6Z5ADZP06/oaJISO5F0PJX5t25Rn8D/APXqoiGOWVSQEZ1PHOCWx+XI/wAisK3v3EMa5/h6dcYraFysjsMHGxWwe45/UFf5VDKFvY8IjLkmWIqB3DL/AFxUM7COGOZgQrBdzAcnlgf1qxM++SYM2VjwT+I5/XtSX8YFuE2j5oC0b56N1x9DgfpQBTnieC5iuF2tgLgryG44I9aisXSFZAMeWXDROenPAOfqMH6j0qzAgaxw2M/vNnHBA5A/LFUrcD7PNbqcqJCpJ5IDAYb+WfX8KoRFMEa0WB0Ko0zqc9VOMgj8wD/hWJOqwzAqxIIGc8Ecf55rVvnJ08sZFEkZTzBncMjKk5/FazWYSRNbvj93jy2UdM84+nX6H2JFNAQuME5FQgfMVOeOR7VNndtBP3uPxqFvlZW9ufaqJHjsR+tKR3HFNA5wc7T19qQ5yMHJ6g+oqkJiyFTknr3qJwMMM9e/rSyMQdwHJFQsxwp4x39qdxWGYKsAT+Xek5U+oPrUpwwHUHtimOOnpQAzo3H1xUqsGX0NIRuGMcjpmkHByPxql
oSx+NpDDp3oPBHcU/8AHgjpUYIK96skCAfTNRyYweMEU9hng8j1qF+O35UxWIWJzxzUZyT6U5y4+lR89yKVwsOC/Sik4op3Cx5JvINTI+aiZME0inBrntc1LqP71bt7gxuDnpWfG1TDpUNXBnQidZFyDWZqU4KFAetUhPIgwGwKictIeTUxp2YFbHNPApdtKF9q1EOWlxTlWnY9qQyEimEVYK+1MK4pjLOmIXnBVcsDy7dF+nvXpnh2NpZwHOxCdpJHzSZP8q82tJTvT5yiA8BR3+nrXo3hB8XQfaSIM7dxyS54/PmpkCPYNPmCCOGPHmPgEA9B2+ldBEAqAZyfX1rnPD8JjtxLMQZZOWbuf84roEbOKhGhYBwox6VGeSSei9Kd0prD5T60MaKkysSXRsHGAcfhXI+IY2ltGW4tXaJ1KsEztdSMHGMYJUlQSfl3fWu1ZeBnnGCfwpxty8KpjkcEHoQeo/lU6laHm+meF5tevbm7uzNCZm+ZwPlYo7MFweSNxY+ue4KiovFUXhzw1LaWd9ciS+ZUEgBLkKuGV5BjHBUkA/MQDz1z6ZdQ20On3IniV4mj2upH3weNp+tfPfjfT7ZdSluBO8s0jAtuyzHjaAM89MAZ7Ur2eppTpOabXQkm8RWsskai5T/RWMYZhkTRAsiq/rt+ZQ3Xa3bdioRq1pJJ5ElwEd2ByB1YjJ/E4P8AwLkdTXP33he9sommuIwPMUl+emf/AK+Krx6Fezyr5sEiyNGiH5SGGPuk+/StE13JdKXY7WO8s5mCi7QSYyyg7TkcEr+IyPr7VONRsrgIY7iMzSgbdg++TkEY9ypx74XuK4670Wd5I5JEaKYrtJHGMHt+Z/St3xNa2lnZ6ZnbEGEoB6dNhwP5/UU07ilT5XZlhtdtyQYQZWB+Ujofp+H9a5nXvFFy87W8ACxg7g6HPzDr+vUf0qDUdXzbCexHMrESzbcbX6naO24fNn/ex0rnshN20DB6jHBrRLqzOTS0Rea4hvo0S6DLGeI5+WeE9drd3TuP4h26EGs9pcQ3SwInznlGQ5VlPG4N0Kn1/wD1VcstHuLpT5reVG4HGMt6g/59a6/StIjtoViCkxjJBPVSeuD745Hf60OaWxKiyroOlSW0QW4Yy5GCh5AA6Ae3+Riun8kJaJsUbQSoJHbrj8806OKNEBEfQdRSeacOD9w7cqef8movcZAhZDkDPGD7+2KQzMm7ABQnJUH9fr/+qlkcwtkAlcA9M8eo74/+uDzVd3MrBo/vDp2zSAczc4yDg9R3HUU6BWI447CiIF1AYcrx07df0qwE2jIGMf1pgWYnCAuevWpklCPuB4xtNUI5CRk5A3YPtipEjbc6How4P0pCNWOUqSAeQ5wBW1bT75ImQ54BA9j/AJ/Wucticbm+p/AVtacxSSLHWP5VJPX0pWHc2hOHluHVsbo9xz2wR/U1NeENDHEcAGFojgZI3L8v6gj/APXms20bLq3IZ1II6dcZq68mLu2xyQNjAdsY7fUD8yO9KwENjNmzYlN375JDg9QzYbHvxn61Qk32uouwkAWRNoccoSPlB9wRgEH3qa3KwReYhOEk+cH+JQOR75DkflUE52kJlmT5mQsPcA5/IHPYimgC48s3Vy0ZGy5iBAKghGBUFcHqpxjHbjrxnIEeJUzjBXy2AOcY6f8A6/61cuWA/eclZD5mOmD0YfpTLpd9zJ/fJz07nv8ATPX3Oe9UhFCUEIrY5B5H480kg2zSRnDDqCO/erMqh0JXuN35/wD1xVWQghWB5Ax+XSqECjcQwOOxz2qNv3btHjBB3AfX0qZevsx5FQ3GUlH93t7UxDLhQYA/Ud/aqwBZSDyParKnKEHpzioFXDYx1HBFAACfunqOQT3p7H5NwPB60zdkKw/GnjByvX0poTHEZTPpTdu5QehIwfrTkB2Y9OOaSPA3L07iqJHAkx89cVAeGyDg+lTNxgdPQ1FIBuyR171QhCx5HIPaomY4wRxSsWU/y
NN3sfencViMtzj9KYevSnSAZzyKYQw70hi0UzPtRRcLHlrrntUe3mpX4JBqW3jByTz6Vz3sWQIcGpgasGJW6rVd0MbY7UXuAp5puM0c9aTNUhAVoC04HIpwxmkxAoxTsD1o4pjNjvSAk4phFNDil3ZpjNHSrKG4nQvdrCdw7E//AKq9G0G6sLKMfYhI4X5RK2PnPcKvuc815bFJtIC9+vPX0Fd94TgeeSORmIIIWMBevqR9OamRSZ7JpErvbxbsF2XczEYH+RXRQnPJ59Aa5nTZ0kXy42DYIDHryK6GE7Yweue9SjQt7vSkJz0601eeKGYUmNEiIuDu6d6mQ8D1qsjfifSphxzn60IZl+JGJs4UBIV3cEgZIYISP03V5RF4fN3rA1K6+WIP5kMR5aU/wk/3V746n2HX2LUbSPULN7aR2jzhlkXrGw5DD6fyzXkuq+NbCHUL60vJUhura4eKfbCyAsrYYrgdCeR359qiSd7nZh6kVHlbNiCzt7yQyvGJUiccEZBbqB+HX8qvWGnwz3M886JjzV5I6bRySfqetZ+meI9JfSBNZ3ULRFyVBcKUz/CQcHr079ucZrD8R/EDTbG0+zQXMcs+4lo7chsn3PQfj0qbO9jbnja9yv4z1G3a5HljKxsFyozx3PsABXF+KfEFtqSWNrbHetqxkZwcDd2APf37Vganrl5qckhnn2ROSfKj4X6E9TVNIJZF/do7AdSo6c8fhnFdVOnZann16qnK62JoTmZ7VX2x3AAVSejDmM/UHK/Qmn6XCtxdb+AqdA3ZuvSoTa3Ec2wQOkp4CFckH/HNdPoGlFXkmeJiXYkZAwO5A79c1b2MFuathanC/Lwe4OMVvQQOFA2o2PXrS2lrtXlNueuRV+cLDGARyeeOKxNCrIxRdo2hj2xVGWVdpx944788UtzcDOAxPuB3qmZw7EkFSecHpmmKwrTEjbjAB4z2P/1+/wCFLDtZhgAAHp6+mKI1Ge/PGD2P+FSxw4BK9jwKALUabWJx/FzUqIG3Y5B4wKiicbvU81Kg2KxA6MOaZI5YcCTgckn9KnEQBKj8DQeRjnGQOKkzgAn1waBBCg3Y7YA/nWhAf3eM44+mf8iqcXJJ9yOParMZ+T0IxigDUs3LX8ALYUglj+Gf6UglIvo2xlAxIGfRcY9jzTIlMbbwVPykA9O/P6fzqGVtkpZRyMMM+pPT/PpRYaJ5MJFLtPmIz7gQOuSSDjt6fhVNZAJ4gGBRXEYY8Bg4PP5ip2kVLedmJKLJ168cgH9Sf/11ntGPLaBivIIyTwGz94H6j8qQDnVjCiZ43MBgdM9c/wDAgaLkg7XGQ3DD29KlhzOZUbIkJLfRuh//AF0yZgsigcgDC/nz+mKpCGygRwgj7zxkDHYZOfyORWcRllwAcjGB9Kvy48kJ2Vjj6MP/AKwrPHJ5/iGcjtVMRJt/d5HbBFRz8kAj5Tg5qRTnCtjpTJRyB6cUxEB4XnuQDTE4JU/wmhzgFv0+lLgFuOjDBoAYBtJ9O1D5VQyjpThyo+mKTPy0ASDkcdxmh8Y3fgDQhwBj1xQwwmO38qskRjuVfUHmos5BB/8A1U8npnoeKifGCR1FMRG7MOcgj3pueowMe1SZ3DB61Bgg+uKQCtntUecDnNO3c4IB96Rjx0oAjyOxxRTs56pRSGeZ3cJSbGKW2B5XvW7dWBaQkr0qk1psPAxXBGupRG0RhGI6VBcJ84BrSUMF5Az61XliJbJoVVXArJDntRJYnbkAitSwhUtzWo9ujxkYFP21mNHGOjRnmkVua1NQtghOBWMflciuiMuZCZYLcVDI2BShs01xkU0BEJDnrUqOTUJFSR4HX8qoDUsYoS3m3DHy1PRRksfSu78P3TzXcSxRiNiDtQuASBz+AGPxrzuKaQuFGCCMbQK6/wAPwzyagtpCUF0P9c/XygPT3x/n0iQ1uewaBMsjx2UXzLGN0rqQQg9Pdj/WuxibJzkAZwPauP8AD8MFppyR2wPls
dzSjuOgx9e1dNFIEznIb+76egqEaGiG688dqQtkjioUk8znHAqQHkDtSZRNGO/apSajBAwB1x+VLnHHegBkoLqQSAuOh715t4w8AQ6vrB1ZIXl84lp4I5BFJI23aNrngAkAngnjtmvTDjvyfWqk0Hmhirc/4UXsM8M0/wAKaTZWd1JrejXJv181rdZZi8bNlTEhxgOxyy9hgEnAG483P4ailln+zxxRrLdNbpAh8x418tGDBuAwDHHYnDc9q97vrG5Eewor7H+U+2K5q5sHTbm0IKNnhejBgR+mR+I9KOdoOVHnem+Fgqq/kIPN2bJd4PlSFSUDZGPmB6H05zyDabw3BMXX7PFaM/I2KSsbHho2XdnZnPfK9AzAqD002nXn75Eim2n5QCDkjuP5MPdQeooh0rUrmQZgmdm4yV/iAwD+QwfbHoKrmFY5220C3DSMyKVA/wBW/JT1U8fMP9rA9wODWnBaqoGw8gY2k9fx/wAa6a08H6nJIu9RCdu9SW57frzXQQeErWzXzHbzH5wcYC+hx/Sk5gonIpClvB504VT/AAjuaxb+7aRjsXcv5V0GtWx805+hHp9K554yhOevf3rNSuzRxsjOLMuGIyD1I7U9lSVB1Vuo/wARUu3aXXtjIojj4xj3x9a0TIZDbljKVbqp4NaMe0FR75NVYI8SHHrgVYYYkPoOlMlkkEW1yR/eyfxqxGBgZ6Dk/hTUOMH1IzUkS5QHjJJpkjzlSpHUH+VSJhlKnu3+f5URrvt1+m3J/DFEPM7Ie3P9KAHgYOR/z0zn+dSw55AGT0/I0mwgMT03H8BUkOEcgfeJJX+v5f1oA17OImXJB+WIsPcsCP6VSuZFQs3P+s4YDPbI/Dn/ADita1AigE5x/qlPPGSq8isK8xueLghCuTz1wQf8/wCNDYJC3bhLeWBeQFVyc+vB/Rh+dQkZmyTu6ZJHbn+lI7+ZI5YcM2GJ5HsfyA/L3qSFd0nzccAfyzQgJrEEtJO2SNrDLd25z9RwKglIaVEO7cCeRjp1P4VctP8Aj2YjjDKuD75J59f/AK9Z5ffKrLkgnYuR2qkIbIwDZU5GM9OlUV4UeqnH1FTuf9YB/APzBqokmCCTx0NMRZGGUDoc8H0NRM5DtkZ54FSRcgKeueKhl5l4P6VSERfx7fQ/zojO0U1iRLu7E8VIBldw/EU0DEVf4TwRwaYM78diKf8AddW9sGmP8sZYdVOaBEgyCR7ZFJIRuA7HrTmxxjHqfpUUw3ISDwDxVCEbhdpyQeh9KgHBAzUquHTDcEVC42t14PtSAXlWyAcUHGeh+tId+OD9DTSdw5O1qYrDJgM7hwfamj5hzTpAeCVH4U3nbkAA+1IY0hgf/r0Uo96KAHS6fG6FlwTisK8swhJx0reju1KcN2rKv5hya+fimizGKgNg1HJDkU4vukzUuQRitbtCK9sSkmK1kkyvNZgjJkyOtXMOkfSm56gUtVK+WT7VzDn56278s/XOKxJeGxiu+jsJirTjyKiVuafmtgGEe1AHtT8ZpQvbFUMntDJ5myE4kf5dwPKj29K9I8P6OtisFlOfIlnXDwoQZG/uqfQE/oe9ee2cj27h4VG/+8ecV0ekzfZd1zKMkKfvMcvn19F9fWspMEz1rT9UjmljiQY2qXJAwpA4yB+groLOV5JNvYdT2J9vwry3wxeXOq3RkY7YywOOmVHb/PavVtNiWGBRgZIJwfepRombCHChe9SDg1DG4CbjyT04p5bPFNjLCt055pQ/pVcSdhT1bjNIZKzcUxD8vTmm5zSFwowDSGDDcWB5yMDFTLBFnGxeuagQlue/arCHCjB59TQgGS2sG3cUGBz09OP5VIsaIM7RuDk8DoQaQtuP86bux39qBjo8BmOOR8q49B/+uiVR5Zzjp3pF4Zv0ptw2E96QXOL1q1JkY4zzXL3NuVbOK9ElijmkKuOGBH5Vgahpg3uVHQ8jNRa2pdzjTB83THGMU1YsE4HatmW0KyN8p/8A1
VXNvt6irTIaM4JtkH51KEBUe7VM0JLjinCPbNFGB0yx9sdKpMlojYbAPYc/hTh8kGeuFp7rlnbqAMD+dKY/9GbrkDH6U7isWLQ7wIwMhiuKsW0Cm+Ab7rYDnHSoLHKSjaSGGGUjsRyMflVyWT5vOjUfvSjKAehG3P4Z7e9FwsW2sQS4U8oUBxzz0P8AMfrVVrVkQSAjKuyKfUgZP6L/ACrVjZTPctnhCGyP4uQyn8sCoHRnACA7vMYgkZw4Xg8eu0/nRcCWGXbbQR4yofycDnKhDn9f6Gsi4BeVzIeqKCRxyAR/MZq5bOs9xIpXCeWZFC9QxG1lz9APyp8loZbxozwWCM2ccEbg/wDQ0AZ0MRfzhg/Ky/yqzFBthWRjjeRt78HnNWtLiElvcMwO4uIzgclcA/mM4/Krl/AqqsS4G1eSOAD2/RD+AqkxMxpn2QrCMDdktk9GPTn61QcmOfGcMuOD7df14qa7kCRtIG4Kbwcc46Lx9aoq+J55XPEa4HPUj/8AWOadxWGCQPHM45KgZ+nX/wCtVUEjHGRnnNIu6N5Y+QwbH+P9afEM7lHUfqKYFiI7efyqNm3SuRjGAKViEQEDAHpTV+V8dc8mqJIXJwv+NSqMKV9uKRlyOe3NPHLKT1IwapCGN/qh+dLjLnHIIpJhhVHvzT+oB78EUxDCQIwDwMYpnKkg+340y+m8i3kYDJzhR15NQWUkjxvFNnzIn25J/Gpc1zcoraXFddkhxkfjSliwzQXWRcqQw9RzmmAkPjnmmMcSQTjims24cjmoJ5nFzFFHwxOWIGeP85pn2hmnm2fcRDwBnJqHVSdgsWHIZQR0PWmL05IxUMk5ht1DANKy5I6fjSSbzII4yqsV3En9KHUQJEhIB65opIMyRBmTnocUVUdVdAcwmqMkfOc4qCW/aY45xUUUO9ck01odrda81RiIuWy+Y4A71uRaWrRhm79KyNOX94D6GuqtTvgGe1ctT4iytZ6TFlnPboKdNaxshwK0Im2BgPXNUriXDsAKcY3A5HWIvJJ4xXLyHLmuo8UTYCgDBI6/jXLd69KgrRJYgpwakxSHg1uK5IGp6nmoByaljG44oGaduq4BOSfTFa1uAIigjMks3XPpms+0TIC8HPrXSaYojcSbUL9F44BrjlK8hnY6BEmnQpPIu9ydlvAoCgdtx9+td9pxkZQ7tvc9MHgV574bP26/mu5Xci3IVB05PGf0Nei2iiCAswySvy47AVrEtGkrDcM8mh5R0yPb3rNn1ARRM5UlcbuOppbGRp0+0ydTkqB/CBVXKNUHAp278hVZZSVLdhSiUv07etIZY3EdKTGTk1GGx15pwO7k/lSGSr0H6AVMpAFQA7eB16mhpCD7UAWNwAOevqaiaUBifQVDvZj196By+KGwJ921ckn3zVeaUEKOMn+lMnlxETzxVSV2SIOxySAMdualuw7DGkCzlhyVGM/hz/L9aqlzInOe7fWrKwh35OQMjmpIrZTjPTkmkMzDZJIAmBuI6+9UJ9MYkhQT7VtHiTeBySTU9vEPKVjjk4x6YoQHIT2piYjbjB5qutuVkdyOQpH51193ZpLKu7GSST+XH86zpbBRLNtOBu4/P/8AVVCMFYBj8c0ghzER6kk1uPYCIN83II6VWaBRGx77gufwJqiShAoRyfQev+fWnrllhUkYV34PQAqP6ipGQRrK554UY+mP8TTxGI2Cg/Kkm0HueM/1pXAvwzLJqbhV2xvIivkfeH3T+mfxqOIlIVPUEZkJGOASrD9TTbLiJZCoLNKc89SM9asGNVt1HXMMcf6c/wDoX6CmBXtYGWcjBy8u5j0wC+Mfj/WtGAqdQilBDqwj8zjPB6n9DWbHJI2nSgP+82qxbGMlUZgf++lH61du5fLv0+RSztlRjACnt/MfjntyCE0RGaIRZCs8jEkc5woB/U0mrTg2tzL90yEgj0XO0DP0H6mksHaImQsSEV0P/ffJHv0/KsjUrlrq8ezclUe2ZiR/e
Dbs+2M8UwM2e6Ml9KkpIV2AA/ujAUD8M5qncFow8HXOScHqN/8A9ai7mVpmkCkESsCM8A7RjH4YP1HpUDNukyevCkgYyKLgIrb1PPOOKs24zIPUZB96qhdjN7jJFWEfYCR1PH61SEySUAx/Lg7eRinIu6UsPSjA2v7H+dLAMnP1/nWiIY0cnj6Cgd/YU2Bt0bP3GQKVW3ID61SZJHLyOe560/ny+OoxTDyoP1pS2Ldj64FNAUbwG5vo40kZUVfMOByDnAqnIHjjudrMQZdruTlsf5NaDnHzgDI4PvUFwkk6KEIwG6N3/wAmuecdHLqUiAu4eEQRssO0+WAfvH1NMuWne3RZEZWPy4AyWP8ASrMA8qMRZyB1/nVhc4LDtTVNuOrC5nb5Fu3aOP8AeZAwecLgd6ktQUiKmN1bOWc9D9Ktucx5HFRI3yk468VUafLK9xXK1yq7cAfO7gE01y0dwWVS2VAXA71YGCBxkgnbntQew9aHC7uv6/q4XGR5iiVOcgc/WinYYelFUlZWEf/Z", - "image/png": "", - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from PIL import Image\n", - "p = Path(\"~/Downloads/images/samoyed_100.jpg\").expanduser()\n", - "query_image = Image.open(p)\n", - "query_image" - ] - }, - { - "cell_type": "markdown", - "id": "d8ca1269-755f-4b8f-b86d-d18e956a7cb6", - "metadata": { - "id": "d8ca1269-755f-4b8f-b86d-d18e956a7cb6" - }, - "source": [ - "Pass in the query_image to the search API" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbefdd03-dc28-4827-a44e-27522689448f", - "metadata": { - "id": "bbefdd03-dc28-4827-a44e-27522689448f", - "outputId": "8598d21d-cdcc-4c0c-c136-ab6441dcafca", - "scrolled": true - }, - "outputs": [ - { - "data": { - "image/jpeg": "", - "image/png": "", - "text/plain": [ - "" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rs = table.search(query_image).limit(3).to_pydantic(Pets)\n", - "rs[2].image" - ] - }, - { - "cell_type": "markdown", - "id": "193835d9-e9c7-41ae-b2e1-4edda989b87d", - "metadata": { - "id": "193835d9-e9c7-41ae-b2e1-4edda989b87d" - }, - "source": [ - "### Persistence\n", - "\n", - "Embedding functions are persisted as table metadata so it's much easier to use across sessions." 
- ] - }, - { - "cell_type": "markdown", - "id": "2732399d-7850-4d67-9f42-f60e1629861e", - "metadata": { - "id": "2732399d-7850-4d67-9f42-f60e1629861e" - }, - "source": [ - "For example we can recreate the database connection and table object" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ed91086-cecd-4acf-893f-200b9e3099dc", - "metadata": { - "id": "0ed91086-cecd-4acf-893f-200b9e3099dc" - }, - "outputs": [], - "source": [ - "db = lancedb.connect(\"~/.lancedb\")\n", - "table = db[\"pets\"]" - ] - }, - { - "cell_type": "markdown", - "id": "b807fd59-48dd-419b-8c3e-705ccf63e077", - "metadata": { - "id": "b807fd59-48dd-419b-8c3e-705ccf63e077" - }, - "source": [ - "We can observe that it's read out as table metadata" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "53156aa2-be02-4c80-9a54-b66f9f0ee7b8", - "metadata": { - "id": "53156aa2-be02-4c80-9a54-b66f9f0ee7b8", - "outputId": "d90d51e7-2069-416e-b611-cee592e6f2cf" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'name': 'open-clip',\n", - " 'model': {'name': 'ViT-B-32',\n", - " 'pretrained': 'laion2b_s34b_b79k',\n", - " 'device': 'cpu',\n", - " 'batch_size': 64,\n", - " 'normalize': True},\n", - " 'source_column': 'image_uri',\n", - " 'vector_column': 'vector'}" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "\n", - "json.loads(table.schema.metadata[b\"embedding_functions\"])[0]" - ] - }, - { - "cell_type": "markdown", - "id": "29b64e40-8364-4efb-a0e6-2692af946f7c", - "metadata": { - "id": "29b64e40-8364-4efb-a0e6-2692af946f7c" - }, - "source": [ - "And we can also run queries as before without having to reinstantiate the embedding function explicitly" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c994859-31f1-41cb-ae86-8f41ddc707f7", - "metadata": { - "id": "7c994859-31f1-41cb-ae86-8f41ddc707f7", - "outputId": "2ce1d522-0802-49bf-fbb5-e1d09a3443bb" - 
}, - "outputs": [ - { - "data": { - "image/jpeg": "", - "image/png": "", - "text/plain": [ - "" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rs = table.search(\"big dog\").limit(3).to_pydantic(Pets)\n", - "rs[0].image" - ] - }, - { - "cell_type": "markdown", - "id": "61a44b94-ca9e-42c4-82fc-73ab2f3cf424", - "metadata": { - "id": "61a44b94-ca9e-42c4-82fc-73ab2f3cf424" - }, - "source": [ - "## LanceDB makes multimodal AI easy\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "650b92d6-b666-49ba-ba46-d7a5ac077c6f", - "metadata": { - "id": "650b92d6-b666-49ba-ba46-d7a5ac077c6f" - }, - "source": [ - "- LanceDB's new embedding functions feature makes it easy for builders of LLM apps\n", - "- You no longer need to manually encode the data yourself\n", - "- You no longer need to figure out how many dimensions is your vector\n", - "- You no longer need to manually encode the query\n", - "- And with the right embedding model, you can search way more than just text" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f96da468-d20d-4a18-9a58-a5d99789e5f4", - "metadata": { - "id": "f96da468-d20d-4a18-9a58-a5d99789e5f4" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/src/notebooks/LlamaIndex_example.ipynb b/docs/src/notebooks/LlamaIndex_example.ipynb deleted file mode 100644 index 887b585e..00000000 --- a/docs/src/notebooks/LlamaIndex_example.ipynb +++ /dev/null @@ -1,538 +0,0 @@ -{ - "cells": [ - { - 
"attachments": {}, - "cell_type": "markdown", - "id": "2db56c9b", - "metadata": {}, - "source": [ - "\"Open" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "db0855d0", - "metadata": {}, - "source": [ - "# LanceDB Vector Store\n", - "In this notebook we are going to show how to use [LanceDB](https://www.lancedb.com) to perform vector searches in LlamaIndex" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "f44170b2", - "metadata": {}, - "source": [ - "If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6c84199c", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install llama-index llama-index-vector-stores-lancedb" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1a90ce34", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install lancedb==0.6.13 #Only required if the above cell installs an older version of lancedb (pypi package may not be released yet)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "39c62671", - "metadata": {}, - "outputs": [], - "source": [ - "# Refresh vector store URI if restarting or re-using the same notebook\n", - "! 
rm -rf ./lancedb" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59b54276", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "# Uncomment to see debug logs\n", - "# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)\n", - "# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", - "\n", - "\n", - "from llama_index.core import SimpleDirectoryReader, Document, StorageContext\n", - "from llama_index.core import VectorStoreIndex\n", - "from llama_index.vector_stores.lancedb import LanceDBVectorStore\n", - "import textwrap" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "26c71b6d", - "metadata": {}, - "source": [ - "### Setup OpenAI\n", - "The first step is to configure the openai key. It will be used to created embeddings for the documents loaded into the index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67b86621", - "metadata": {}, - "outputs": [], - "source": [ - "import openai\n", - "\n", - "openai.api_key = \"sk-\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "073f0a68", - "metadata": {}, - "source": [ - "Download Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eef1b911", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2024-06-11 16:42:37-- https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 75042 (73K) [text/plain]\n", - "Saving to: ‘data/paul_graham/paul_graham_essay.txt’\n", - "\n", - "data/paul_graham/pa 100%[===================>] 73.28K --.-KB/s in 0.02s \n", - "\n", - "2024-06-11 16:42:37 (3.97 MB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]\n", - "\n" - ] - } - ], - "source": [ - "!mkdir -p 'data/paul_graham/'\n", - "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "f7010b1d-d1bb-4f08-9309-a328bb4ea396", - "metadata": {}, - "source": [ - "### Loading documents\n", - "Load the documents stored in the `data/paul_graham/` using the SimpleDirectoryReader" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c154dd4b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Document ID: cac1ba78-5007-4cf8-89ba-280264790115 Document Hash: fe2d4d3ef3a860780f6c2599808caa587c8be6516fe0ba4ca53cf117044ba953\n" - ] - } - ], - "source": [ - "documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()\n", - "print(\"Document ID:\", documents[0].doc_id, \"Document Hash:\", documents[0].hash)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c0232fd1", - "metadata": {}, - "source": [ - "### Create the index\n", - "Here we create an index backed by LanceDB using the documents loaded previously. LanceDBVectorStore takes a few arguments.\n", - "- uri (str, required): Location where LanceDB will store its files.\n", - "- table_name (str, optional): The table name where the embeddings will be stored. Defaults to \"vectors\".\n", - "- nprobes (int, optional): The number of probes used. A higher number makes search more accurate but also slower. 
Defaults to 20.\n", - "- refine_factor: (int, optional): Refine the results by reading extra elements and re-ranking them in memory. Defaults to None\n", - "\n", - "- More details can be found at [LanceDB docs](https://lancedb.github.io/lancedb/ann_indexes)" - ] - }, - { - "cell_type": "markdown", - "id": "1f2e20ef", - "metadata": {}, - "source": [ - "##### For LanceDB cloud :\n", - "```python\n", - "vector_store = LanceDBVectorStore( \n", - " uri=\"db://db_name\", # your remote DB URI\n", - " api_key=\"sk_..\", # lancedb cloud api key\n", - " region=\"your-region\" # the region you configured\n", - " ...\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8731da62", - "metadata": {}, - "outputs": [], - "source": [ - "vector_store = LanceDBVectorStore(\n", - " uri=\"./lancedb\", mode=\"overwrite\", query_type=\"hybrid\"\n", - ")\n", - "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", - "\n", - "index = VectorStoreIndex.from_documents(\n", - " documents, storage_context=storage_context\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "8ee4473a-094f-4d0a-a825-e1213db07240", - "metadata": {}, - "source": [ - "### Query the index\n", - "We can now ask questions using our index. We can use filtering via `MetadataFilters` or use native lance `where` clause." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5eb6419b", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index.core.vector_stores import (\n", - " MetadataFilters,\n", - " FilterOperator,\n", - " FilterCondition,\n", - " MetadataFilter,\n", - ")\n", - "\n", - "from datetime import datetime\n", - "\n", - "\n", - "query_filters = MetadataFilters(\n", - " filters=[\n", - " MetadataFilter(\n", - " key=\"creation_date\",\n", - " operator=FilterOperator.EQ,\n", - " value=datetime.now().strftime(\"%Y-%m-%d\"),\n", - " ),\n", - " MetadataFilter(\n", - " key=\"file_size\", value=75040, operator=FilterOperator.GT\n", - " ),\n", - " ],\n", - " condition=FilterCondition.AND,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "ee201930", - "metadata": {}, - "source": [ - "### Hybrid Search\n", - "\n", - "LanceDB offers hybrid search with reranking capabilities. For complete documentation, refer [here](https://lancedb.github.io/lancedb/hybrid_search/hybrid_search/).\n", - "\n", - "This example uses the `colbert` reranker. The following cell installs the necessary dependencies for `colbert`. If you choose a different reranker, make sure to adjust the dependencies accordingly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e12d1454", - "metadata": {}, - "outputs": [], - "source": [ - "! 
pip install -U torch transformers tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985" - ] - }, - { - "cell_type": "markdown", - "id": "c742cb07", - "metadata": {}, - "source": [ - "if you want to add a reranker at vector store initialization, you can pass it in the arguments like below :\n", - "```\n", - "from lancedb.rerankers import ColbertReranker\n", - "reranker = ColbertReranker()\n", - "vector_store = LanceDBVectorStore(uri=\"./lancedb\", reranker=reranker, mode=\"overwrite\")\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27ea047b", - "metadata": {}, - "outputs": [], - "source": [ - "import lancedb" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8414517f", - "metadata": {}, - "outputs": [], - "source": [ - "from lancedb.rerankers import ColbertReranker\n", - "\n", - "reranker = ColbertReranker()\n", - "vector_store._add_reranker(reranker)\n", - "\n", - "query_engine = index.as_query_engine(\n", - " filters=query_filters,\n", - " # vector_store_kwargs={\n", - " # \"query_type\": \"fts\",\n", - " # },\n", - ")\n", - "\n", - "response = query_engine.query(\"How much did Viaweb charge per month?\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc6ccb7a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Viaweb charged $100 a month for a small store and $300 a month for a big one.\n", - "metadata - {'65ed5f07-5b8a-4143-a939-e8764884828e': {'file_path': '/Users/raghavdixit/Desktop/open_source/llama_index_lance/docs/docs/examples/vector_stores/data/paul_graham/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-06-11', 'last_modified_date': '2024-06-11'}, 'be231827-20b8-4988-ac75-94fa79b3c22e': {'file_path': 
'/Users/raghavdixit/Desktop/open_source/llama_index_lance/docs/docs/examples/vector_stores/data/paul_graham/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-06-11', 'last_modified_date': '2024-06-11'}}\n" - ] - } - ], - "source": [ - "print(response)\n", - "print(\"metadata -\", response.metadata)" - ] - }, - { - "cell_type": "markdown", - "id": "0c1c6c73", - "metadata": {}, - "source": [ - "##### lance filters(SQL like) directly via the `where` clause :" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0a2bcc07", - "metadata": {}, - "outputs": [], - "source": [ - "lance_filter = \"metadata.file_name = 'paul_graham_essay.txt' \"\n", - "retriever = index.as_retriever(vector_store_kwargs={\"where\": lance_filter})\n", - "response = retriever.retrieve(\"What did the author do growing up?\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ac47cf9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "What I Worked On\n", - "\n", - "February 2021\n", - "\n", - "Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n", - "\n", - "The first programs I tried writing were on the IBM 1401 that our school district used for what was then called \"data processing.\" This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. 
It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n", - "\n", - "The language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the card reader and press a button to load the program into memory and run it. The result would ordinarily be to print something on the spectacularly loud printer.\n", - "\n", - "I was puzzled by the 1401. I couldn't figure out what to do with it. And in retrospect there's not much I could have done with it. The only form of input to programs was data stored on punched cards, and I didn't have any data stored on punched cards. The only other option was to do things that didn't rely on any input, like calculate approximations of pi, but I didn't know enough math to do anything interesting of that type. So I'm not surprised I can't remember any programs I wrote, because they can't have done much. My clearest memory is of the moment I learned it was possible for programs not to terminate, when one of mine didn't. On a machine without time-sharing, this was a social as well as a technical error, as the data center manager's expression made clear.\n", - "\n", - "With microcomputers, everything changed. Now you could have a computer sitting right in front of you, on a desk, that could respond to your keystrokes as it was running instead of just churning through a stack of punch cards and then stopping. [1]\n", - "\n", - "The first of my friends to get a microcomputer built it himself. It was sold as a kit by Heathkit. I remember vividly how impressed and envious I felt watching him sitting in front of it, typing programs right into the computer.\n", - "\n", - "Computers were expensive in those days and it took me years of nagging before I convinced my father to buy one, a TRS-80, in about 1980. 
The gold standard then was the Apple II, but a TRS-80 was good enough. This was when I really started programming. I wrote simple games, a program to predict how high my model rockets would fly, and a word processor that my father used to write at least one book. There was only room in memory for about 2 pages of text, so he'd write 2 pages at a time and then print them out, but it was a lot better than a typewriter.\n", - "\n", - "Though I liked programming, I didn't plan to study it in college. In college I was going to study philosophy, which sounded much more powerful. It seemed, to my naive high school self, to be the study of the ultimate truths, compared to which the things studied in other fields would be mere domain knowledge. What I discovered when I got to college was that the other fields took up so much of the space of ideas that there wasn't much left for these supposed ultimate truths. All that seemed left for philosophy were edge cases that people in other fields felt could safely be ignored.\n", - "\n", - "I couldn't have put this into words when I was 18. All I knew at the time was that I kept taking philosophy courses and they kept being boring. So I decided to switch to AI.\n", - "\n", - "AI was in the air in the mid 1980s, but there were two things especially that made me want to work on it: a novel by Heinlein called The Moon is a Harsh Mistress, which featured an intelligent computer called Mike, and a PBS documentary that showed Terry Winograd using SHRDLU. 
I haven't tried rereading The Moon is a Harsh Mistress, so I don't know how well it has aged, but when I read it I was drawn entirely into its world.\n", - "metadata - {'file_path': '/Users/raghavdixit/Desktop/open_source/llama_index_lance/docs/docs/examples/vector_stores/data/paul_graham/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-06-11', 'last_modified_date': '2024-06-11'}\n" - ] - } - ], - "source": [ - "print(response[0].get_content())\n", - "print(\"metadata -\", response[0].metadata)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "6afc84ac", - "metadata": {}, - "source": [ - "### Appending data\n", - "You can also add data to an existing index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "759a532e", - "metadata": {}, - "outputs": [], - "source": [ - "nodes = [node.node for node in response]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "069fc099", - "metadata": {}, - "outputs": [], - "source": [ - "del index\n", - "\n", - "index = VectorStoreIndex.from_documents(\n", - " [Document(text=\"The sky is purple in Portland, Maine\")],\n", - " uri=\"/tmp/new_dataset\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a64ed441", - "metadata": {}, - "outputs": [], - "source": [ - "index.insert_nodes(nodes)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5cffcfe", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Portland, Maine\n" - ] - } - ], - "source": [ - "query_engine = index.as_query_engine()\n", - "response = query_engine.query(\"Where is the sky purple?\")\n", - "print(textwrap.fill(str(response), 100))" - ] - }, - { - "cell_type": "markdown", - "id": "ec548a02", - "metadata": {}, - "source": [ - "You can also create an index from an existing table" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "dc99404d", - "metadata": {}, - "outputs": [], - "source": [ - "del index\n", - "\n", - "vec_store = LanceDBVectorStore.from_table(vector_store._table)\n", - "index = VectorStoreIndex.from_vector_store(vec_store)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b2e8cca", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The author started Viaweb and Aspra.\n" - ] - } - ], - "source": [ - "query_engine = index.as_query_engine()\n", - "response = query_engine.query(\"What companies did the author start?\")\n", - "print(textwrap.fill(str(response), 100))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/src/notebooks/Multivector_on_LanceDB.ipynb b/docs/src/notebooks/Multivector_on_LanceDB.ipynb deleted file mode 100644 index 2b133a9e..00000000 --- a/docs/src/notebooks/Multivector_on_LanceDB.ipynb +++ /dev/null @@ -1,667 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "KV1BD1ptEchv" - }, - "source": [ - "# Multivector Search: Efficient Document Retrieval with ColPali and LanceDB \n", - "\n", - "Modern documents—PDFs, scans, forms, invoices, or scientific diagrams—rely heavily on visual elements like tables, figures, and spatial layouts to convey meaning. Retrieving context from these documents poses unique challenges: \n", - "- 🖼️ **Loss of Context**: Plain-text extraction destroys critical visual relationships (e.g., a table's structure or a diagram's annotations). 
\n", - "- 🧩 **Multi-Modal Complexity**: Layouts combine text, images, and structured elements that require joint understanding. \n", - "- 📏 **Scale vs. Precision**: Balancing pixel-perfect accuracy with efficient search across millions of documents. \n", - "\n", - "### **Why Traditional Methods Fail** \n", - "The traditional method is a brittle, multi-stage pipeline where visual context is eroded at every step. Retrieval becomes a \"best guess\" based on partial text. Usually, it will involve the following steps:\n", - "1. OCR Text Extraction - extract raw text from scanned PDFs/images.\n", - "2. Layout Detection - use models like LayoutLM or rule-based tools to segment pages into regions (titles, tables, figures).\n", - "3. Structure Reconstruction - use heuristic rules or ML models try to infer reading order and hierarchy.\n", - "4. Optional: Image/Table Captioning - apply vision-language models (e.g., GPT-4V) to describe figures/tables in natural language.\n", - "5. Text Chunking - split text into fixed-size chunks or \"semantic\" passages (e.g., by paragraphs).\n", - "6. Embedding & Indexing- use text-based embeddings (e.g., BERT) and store in a vector DB (e.g., LanceDB).\n", - "\n", - "\n", - "## **Our Approach: ColPali with XTR for performant retrieval** \n", - "ColPali (Contextualized Late Interaction Over PaliGemma) enhances document retrieval by combining a vision-language model (VLM) with a multi-vector late interaction framework inspired by ColBERT. In this framework, documents and queries are encoded as collections of contextualized vectors—precomputed for documents and indexed for queries. 
Unlike traditional methods, late interaction defers complex similarity computations between query and document vectors until the final retrieval stage, enabling nuanced semantic matching while maintaining efficiency.\n", - "\n", - "To further accelerate retrieval, we integrate XTR (ConteXtualized Token Retriever), which prioritizes critical document tokens during initial retrieval stage and removes the gathering stage to significantly improve the performance. By focusing on the most semantically salient tokens early in the process, XTR reduces computational complexity with improved recall, ensuring rapid identification of candidate documents.\n", - "\n", - "We used the [UFO dataset](https://huggingface.co/datasets/davanstrien/ufo-ColPali), a dataset with rich tables, images and text, to demonstrate how to efficiently retrieve documents with ColPali and LanceDB.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9N9tRap0YYp4" - }, - "source": [ - "## Step 1: Install Required Libraries\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "collapsed": true, - "id": "gpdJn4hACfKH", - "outputId": "cbd70419-10be-4e4a-ba71-ac0d69a8ddbc" - }, - "outputs": [], - "source": [ - "!pip install lancedb colpali-engine datasets tqdm" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ksA8HdigzAUX" - }, - "source": [ - "## Step 2: Load the UFO dataset\n", - "\n", - "The UFO dataset has 2243 rows in total with an embedding of 128 dimension each. We show an example of the document to show how complicated it is with text and images blended in the document." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 966, - "referenced_widgets": [ - "d775b534a39344cba9e195a01c51be75", - "90e52ab802524ded8984c7b78a418a03", - "ae669a2953964f39b25dbd2e7751411d", - "98b6780cc7844590826e5cd88c137ea1", - "c8535071d9844f799906c05600c20054", - "e2278f45421341bb8b1734cba0af03cb", - "005c8d29152442a49c19a0a83af59e7a", - "a0bf2b988497416f84be6a5dc885f737", - "6d96275900c6476aaab29a63b6d60633", - "756d72d9a4ba4ee5b769f1195b102baa", - "11a92b5b3ceb4bd6a1999cbf782b1e64", - "b9a22198fef8401ebf0bc2e8c82375a1", - "ec250244f0b646319d681b05a04d82fe", - "e09850af523c4686a7a0dce2dc5bb5ff", - "6154a5ea5cd54dd19f0d068c776cb557", - "01d711bc69e84f41a72e0ce9a2b478ed", - "05b4d493f761440b91bc0854f04e40d7", - "e6b3bee55adb42779ddf515be4046558", - "93f5eb6c41634347832f80dd87c7a170", - "e4516c525f794c2b9b5a262cf7dae8a6", - "5e7ab3f1efae4dd491d503ac66b4d8ae", - "ac080da8dc854206a2569d976b1e678c", - "85af8ed925b64edda995c303bbd056dc", - "9e29e04a66d04dad8c20d27e6b728c08", - "672f5f6abe484d189fb3935ed8dfedbe", - "6d1be941a7d747159a14b9ed0d17c870", - "05e78c7519de4f8d89b26f48dc939653", - "f0e80b24db734aa58da30af62d1b4a45", - "615c1d314b3e4af893d6ac7a1bba947d", - "0796ad794d8246f7a6b68a10dab2507a", - "08ba8c88373440f8902641a9ca2af659", - "95d85d6556e047a4b923cc2464f85038", - "0ded705820f14618a77cb022c5f7e60d" - ] - }, - "id": "LlclgxJPD5RX", - "outputId": "ada253b5-2a73-4a66-f20e-7ff0725db89a" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", - "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", - "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", - "You will be able to reuse this secret in 
all of your notebooks.\n", - "Please note that authentication is recommended but still optional to access public models or datasets.\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d775b534a39344cba9e195a01c51be75", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "README.md: 0%| | 0.00/1.20k [00:00" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from math import sqrt\n", - "\n", - "import pyarrow as pa\n", - "from tqdm import tqdm\n", - "import lancedb\n", - "from datasets import load_dataset\n", - "from colpali_engine.models import ColPali, ColPaliProcessor\n", - "import torch\n", - "\n", - "dataset = load_dataset(\"davanstrien/ufo-ColPali\", split=\"train\")\n", - "dataset\n", - "dataset[333][\"image\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JNOEkERhHPj2" - }, - "source": [ - "## Step 3: Load the ColPali model\n", - "Note: select \"cuda\" if you are using a Nvidia GPU or \"cpu\" if there is no GPU available. Mac users, please use \"mps\". This step can take a few minutes." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 209, - "referenced_widgets": [ - "cde94d99d80b4cd2840d059915fa013c", - "68ad5106210e41ab8e734654efb72a15", - "c97d2a0f056a489796e2a3d94d215e7a", - "357592d483cf455bb8c1827206c4095e", - "2e4c2c91a55245948ab081f28c1b82f8", - "0ba339486c664527bdc054b9b263788c", - "f3ffd4ad251444b5b3853892283647eb", - "36c8a4e9678441ddaab31058055bd1f6", - "e1c7d6755a0b441789f11e8036f64887", - "aca80906222b44f3bc07f853d1bd8bec", - "5553411ece384eab80f386ff96baf699", - "6566b65210dd43999016b0df0849fc73", - "a553e15b721c48ddb97a4598d86666a9", - "e1a8b2ca2f4646f8845418b16fc59d5a", - "807f767e339a46b294722598a81c3e59", - "70fa11934f7248ebb747ec3c596fc701", - "d19ede522db8485a90856d53faa8c9d0", - "bb52b41e3c8c41318818e67b56fc65c5", - "f62b475cd41c4851a81b975d83b52749", - "d5de815e1a8248a4b9575fa5598c5c62", - "bfa1711e640c4201a11df8d9afae050e", - "09d915c9edf549f6b232cc8ce7cfc996", - "cdd4ecaaecf245f7af640c257b9a2766", - "da332a65a37749339228b607235d9c7a", - "7fc947f4987c4fd98d40bdbccfb5258e", - "13a43832485f48baa1ba6ba4fce79a6d", - "a9b76d6b46304e6e9ad0c6453b2ed586", - "6f469e077ca84d23982f613370e39cea", - "2026e624f025456c883bff0c8049018f", - "ef20ca0fb9de4cab8565c9d97caa8b95", - "5158feb04bf64f83acccae97a0865dc3", - "d876b1afc539414e89faa4c006bf1817", - "4189dce33b624109981b95ac07772da7", - "f9d3b4ce592f4ab9b02d8b9edff02009", - "ede10e47e8e44e3e8e01e1ece68d5153", - "350b217283524a57bd99046cedf5ba40", - "3b71fe200a0649a193c756a64795caa1", - "2df22e0c5104439bb7016684af2410ed", - "3f6a812bdcf24e268a503c7ae301c9cd", - "5404823a8c46495a9270c01f73b4ef7c", - "dbf17f8bcc6c4b38957e68182f8ca8ed", - "60940513638a4cf19b11c1ac6c4ebb92", - "6797a69ac9994287b925f85bebe75d94", - "73190fa8b6ff419790fea7ad08f39fe6", - "ae63c7933e4f4d4e96444bc62278c8fa", - "fd374e9970944060a5ebb8945f40c011", - "ac58985e76c847409cca99656873af12", - "c6329d230cf840fc9ec6f218dc138f36", 
- "6ec05172da164cdebf0f3b7f7ad8f261", - "832e39123e7748b8a0f38f146748a257", - "489bddd5e8144339b564790884cac871", - "55110dbbd8e84f5cb4ffebe941773fcd", - "61dbfaf2b1074ec9852a3f0fa0f4852b", - "9df60a8d17ab4d68a985ff9fc4646a19", - "0b024509a62441b7ae9820c8ae3c1ffc", - "d86ef06f81b54514b5f39355adc03a25", - "b47747512c2e4da3be712bb365d83639", - "30cc200421da4851b748dbf99a5ba170", - "c3d9356deb674b15ad984f23f4503262", - "61027258a71d4dfdb3c8529c84eadcdd", - "0bbe0ea286a140e38271cb2cf941788c", - "c835a0d14e7a41d8bec9fbffe5825cc2", - "ea756961fae74bad8f72c962c052167b", - "adf0fd690527498faa8467c966ea516f", - "cc1e5b6f7d114c48a5ac586ed5726990", - "4e326b3e73c64666976e24301297927c" - ] - }, - "id": "ilFhcjuoEOs-", - "outputId": "1db488c2-b574-45f2-a38a-c913b597829e" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cde94d99d80b4cd2840d059915fa013c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Loading checkpoint shards: 0%| | 0/2 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "for image in image_results:\n", - " plt.figure()\n", - " plt.imshow(image)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 5a: Embed the UFO dataset and ingest data into LanceDB\n", - "\n", - "Note: This step will take up to 2h when running with T4 GPU with a `batch_size=4`. You can increase the `batch_size` to accelerate the process if there is more memory available, e.g. `batch_size=32` requires 60GB of memory.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# this would take 40 mins for first run on Apple M3 Max, may be longer if you are using CPU\n", - "batch_size = 4 # low it if you have a low memory GPU\n", - "with tqdm(total=len(dataset), desc=\"ingesting\") as pbar:\n", - " for i in range(0, len(dataset), batch_size):\n", - " batch = dataset[i : i + batch_size]\n", - " images = batch[\"image\"]\n", - "\n", - " # encode the images\n", - " with torch.no_grad():\n", - " batch_images = colpali_processor.process_images(images).to(\n", - " colpali_model.device\n", - " )\n", - " image_embeddings = colpali_model(**batch_images)\n", - "\n", - " real_size = len(images)\n", - " multivector = image_embeddings.cpu().float().numpy()\n", - " multivector = pa.array(multivector.tolist(), type=multivector_type)\n", - " data = pa.Table.from_pydict(\n", - " {\n", - " \"id\": list(range(i, i + real_size)),\n", - " \"vector\": multivector,\n", - " }\n", - " )\n", - " table.add(data)\n", - " pbar.update(real_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 6: Create an index on the multivector column\n", - "Note: LanceDB Cloud automatically infers the multivector column directly from the schema. 
If your dataset contains only one column with a list of vectors, no manual specification is required when building the vector index—the system handles this implicitly.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "num_rows = table.count_rows()\n", - "table.create_index(\n", - " metric=\"cosine\", # for now only cosine is supported for multivector\n", - " num_partitions=int(\n", - " sqrt(num_rows * 1030) # 1030 is number of embeddings per document\n", - " ), # it's recommended to set sqrt of the number of embeddings as the number of partitions\n", - " num_sub_vectors=32, # higher for accuracy, lower for speed\n", - " index_type=\"IVF_PQ\",\n", - ")" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/docs/src/notebooks/diffusiondb/datagen.py b/docs/src/notebooks/diffusiondb/datagen.py deleted file mode 100755 index f27f6a31..00000000 --- a/docs/src/notebooks/diffusiondb/datagen.py +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env python -# - - -"""Dataset hf://poloclub/diffusiondb -""" - -import io -from argparse import ArgumentParser -from multiprocessing import Pool - -import lance -import pyarrow as pa -from datasets import load_dataset -from transformers import CLIPModel, CLIPProcessor, CLIPTokenizerFast - - -MODEL_ID = "openai/clip-vit-base-patch32" - -device = "cuda" - -tokenizer = CLIPTokenizerFast.from_pretrained(MODEL_ID) -model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device) -processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - -schema = pa.schema( - [ - pa.field("prompt", pa.string()), - pa.field("seed", pa.uint32()), - pa.field("step", pa.uint16()), - pa.field("cfg", pa.float32()), - pa.field("sampler", pa.string()), - pa.field("width", pa.uint16()), - 
pa.field("height", pa.uint16()), - pa.field("timestamp", pa.timestamp("s")), - pa.field("image_nsfw", pa.float32()), - pa.field("prompt_nsfw", pa.float32()), - pa.field("vector", pa.list_(pa.float32(), 512)), - pa.field("image", pa.binary()), - ] -) - - -def pil_to_bytes(img) -> list[bytes]: - buf = io.BytesIO() - img.save(buf, format="PNG") - return buf.getvalue() - - -def generate_clip_embeddings(batch) -> pa.RecordBatch: - image = processor(text=None, images=batch["image"], return_tensors="pt")[ - "pixel_values" - ].to(device) - img_emb = model.get_image_features(image) - batch["vector"] = img_emb.cpu().tolist() - - with Pool() as p: - batch["image_bytes"] = p.map(pil_to_bytes, batch["image"]) - return batch - - -def datagen(args): - """Generate DiffusionDB dataset, and use CLIP model to generate image embeddings.""" - dataset = load_dataset("poloclub/diffusiondb", args.subset) - data = [] - for b in dataset.map( - generate_clip_embeddings, batched=True, batch_size=256, remove_columns=["image"] - )["train"]: - b["image"] = b["image_bytes"] - del b["image_bytes"] - data.append(b) - tbl = pa.Table.from_pylist(data, schema=schema) - return tbl - - -def main(): - parser = ArgumentParser() - parser.add_argument( - "-o", "--output", metavar="DIR", help="Output lance directory", required=True - ) - parser.add_argument( - "-s", - "--subset", - choices=["2m_all", "2m_first_10k", "2m_first_100k"], - default="2m_first_10k", - help="subset of the hg dataset", - ) - - args = parser.parse_args() - - batches = datagen(args) - lance.write_dataset(batches, args.output) - - -if __name__ == "__main__": - main() diff --git a/docs/src/notebooks/diffusiondb/requirements.txt b/docs/src/notebooks/diffusiondb/requirements.txt deleted file mode 100644 index 4baee306..00000000 --- a/docs/src/notebooks/diffusiondb/requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -datasets -Pillow -lancedb -isort -black -transformers ---index-url https://download.pytorch.org/whl/cu118 -torch -torchvision diff 
--git a/docs/src/notebooks/embedding_tuner.ipynb b/docs/src/notebooks/embedding_tuner.ipynb deleted file mode 100644 index 81972814..00000000 --- a/docs/src/notebooks/embedding_tuner.ipynb +++ /dev/null @@ -1,1437 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "gpuType": "T4" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Improve retrieval performance by Fine-tuning embedding model\n", - "\n", - "Another way to improve retriever performance is to fine-tune the embedding model itself. Fine-tuning the embedding model can help in learning better representations for the documents and queries in the dataset. This can be particularly useful when the dataset is very different from the pre-trained data used to train the embedding model." - ], - "metadata": { - "id": "rYMbEXANHZ0B" - } - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "collapsed": true, - "id": "6T7bwebVquFE", - "outputId": "55bea6d1-631f-409e-9b7b-cb441d26102a" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 12.0.1 which is incompatible.\n", - "datasets 2.20.0 requires pyarrow>=15.0.0, but you have pyarrow 12.0.1 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "%pip install llama-index-llms-openai llama-index-embeddings-openai llama-index-finetuning llama-index-readers-file scikit-learn llama-index-embeddings-huggingface llama-index-vector-stores-lancedb pyarrow==12.0.1 -qq" - ] - }, - { - "cell_type": "code", - "source": [ - "# For eval utils\n", - "!git clone https://github.com/lancedb/ragged.git\n", - "!cd ragged && pip install .\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6RRNyCDJDEcQ", - "outputId": "bbcb0689-e82f-4593-f53c-77c3443a929d" - }, - "execution_count": 22, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Cloning into 'ragged'...\n", - "remote: Enumerating objects: 160, done.\u001b[K\n", - "remote: Counting objects: 100% (160/160), done.\u001b[K\n", - "remote: Compressing objects: 100% (103/103), done.\u001b[K\n", - "remote: Total 160 (delta 70), reused 125 (delta 41), pack-reused 0\u001b[K\n", - "Receiving objects: 100% (160/160), 38.15 KiB | 9.54 MiB/s, done.\n", - "Resolving deltas: 100% (70/70), done.\n", - "Processing /content/ragged\n", - " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "Collecting datasets (from ragged==0.1.dev0)\n", - " Downloading datasets-2.20.0-py3-none-any.whl (547 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m547.8/547.8 kB\u001b[0m \u001b[31m13.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: lancedb in /usr/local/lib/python3.10/dist-packages (from ragged==0.1.dev0) (0.9.0)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from ragged==0.1.dev0) (2.0.3)\n", - "Collecting streamlit (from ragged==0.1.dev0)\n", - " Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.6/8.6 MB\u001b[0m \u001b[31m54.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: tantivy in /usr/local/lib/python3.10/dist-packages (from ragged==0.1.dev0) (0.22.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets->ragged==0.1.dev0) (3.15.4)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets->ragged==0.1.dev0) (1.25.2)\n", - "Collecting pyarrow>=15.0.0 (from datasets->ragged==0.1.dev0)\n", - " Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.8/40.8 MB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets->ragged==0.1.dev0) (0.6)\n", - "Collecting dill<0.3.9,>=0.3.0 (from datasets->ragged==0.1.dev0)\n", - " Downloading dill-0.3.8-py3-none-any.whl (116 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m20.6 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting requests>=2.32.2 (from datasets->ragged==0.1.dev0)\n", - " Downloading requests-2.32.3-py3-none-any.whl (64 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m64.9/64.9 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets->ragged==0.1.dev0) (4.66.4)\n", - "Collecting xxhash (from datasets->ragged==0.1.dev0)\n", - " Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m29.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting multiprocess (from datasets->ragged==0.1.dev0)\n", - " Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: fsspec[http]<=2024.5.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets->ragged==0.1.dev0) (2023.6.0)\n", - "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets->ragged==0.1.dev0) (3.9.5)\n", - "Requirement already satisfied: huggingface-hub>=0.21.2 in /usr/local/lib/python3.10/dist-packages (from datasets->ragged==0.1.dev0) (0.23.4)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets->ragged==0.1.dev0) (24.1)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets->ragged==0.1.dev0) (6.0.1)\n", - "Requirement already satisfied: deprecation in /usr/local/lib/python3.10/dist-packages (from lancedb->ragged==0.1.dev0) (2.1.0)\n", - 
"Requirement already satisfied: pylance==0.13.0 in /usr/local/lib/python3.10/dist-packages (from lancedb->ragged==0.1.dev0) (0.13.0)\n", - "Requirement already satisfied: ratelimiter~=1.0 in /usr/local/lib/python3.10/dist-packages (from lancedb->ragged==0.1.dev0) (1.2.0.post0)\n", - "Requirement already satisfied: retry>=0.9.2 in /usr/local/lib/python3.10/dist-packages (from lancedb->ragged==0.1.dev0) (0.9.2)\n", - "Requirement already satisfied: pydantic>=1.10 in /usr/local/lib/python3.10/dist-packages (from lancedb->ragged==0.1.dev0) (2.8.0)\n", - "Requirement already satisfied: attrs>=21.3.0 in /usr/local/lib/python3.10/dist-packages (from lancedb->ragged==0.1.dev0) (23.2.0)\n", - "Requirement already satisfied: cachetools in /usr/local/lib/python3.10/dist-packages (from lancedb->ragged==0.1.dev0) (5.3.3)\n", - "Requirement already satisfied: overrides>=0.7 in /usr/local/lib/python3.10/dist-packages (from lancedb->ragged==0.1.dev0) (7.7.0)\n", - "Collecting pyarrow>=15.0.0 (from datasets->ragged==0.1.dev0)\n", - " Downloading pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.3/38.3 MB\u001b[0m \u001b[31m12.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->ragged==0.1.dev0) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->ragged==0.1.dev0) (2023.4)\n", - "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->ragged==0.1.dev0) (2024.1)\n", - "Requirement already satisfied: altair<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit->ragged==0.1.dev0) (4.2.2)\n", - "Requirement already satisfied: blinker<2,>=1.0.0 in /usr/lib/python3/dist-packages (from streamlit->ragged==0.1.dev0) (1.4)\n", - "Requirement 
already satisfied: click<9,>=7.0 in /usr/local/lib/python3.10/dist-packages (from streamlit->ragged==0.1.dev0) (8.1.7)\n", - "Requirement already satisfied: pillow<11,>=7.1.0 in /usr/local/lib/python3.10/dist-packages (from streamlit->ragged==0.1.dev0) (9.4.0)\n", - "Requirement already satisfied: protobuf<6,>=3.20 in /usr/local/lib/python3.10/dist-packages (from streamlit->ragged==0.1.dev0) (3.20.3)\n", - "Requirement already satisfied: rich<14,>=10.14.0 in /usr/local/lib/python3.10/dist-packages (from streamlit->ragged==0.1.dev0) (13.7.1)\n", - "Requirement already satisfied: tenacity<9,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from streamlit->ragged==0.1.dev0) (8.3.0)\n", - "Requirement already satisfied: toml<2,>=0.10.1 in /usr/local/lib/python3.10/dist-packages (from streamlit->ragged==0.1.dev0) (0.10.2)\n", - "Requirement already satisfied: typing-extensions<5,>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from streamlit->ragged==0.1.dev0) (4.12.2)\n", - "Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit->ragged==0.1.dev0)\n", - " Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.3/207.3 kB\u001b[0m \u001b[31m22.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit->ragged==0.1.dev0)\n", - " Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.9/6.9 MB\u001b[0m \u001b[31m63.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: tornado<7,>=6.0.3 in /usr/local/lib/python3.10/dist-packages (from streamlit->ragged==0.1.dev0) (6.3.3)\n", - "Collecting watchdog<5,>=2.1.5 (from streamlit->ragged==0.1.dev0)\n", - " Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m 
\u001b[32m83.0/83.0 kB\u001b[0m \u001b[31m11.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: entrypoints in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit->ragged==0.1.dev0) (0.4)\n", - "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit->ragged==0.1.dev0) (3.1.4)\n", - "Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit->ragged==0.1.dev0) (4.19.2)\n", - "Requirement already satisfied: toolz in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit->ragged==0.1.dev0) (0.12.1)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->ragged==0.1.dev0) (1.3.1)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->ragged==0.1.dev0) (1.4.1)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->ragged==0.1.dev0) (6.0.5)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->ragged==0.1.dev0) (1.9.4)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->ragged==0.1.dev0) (4.0.3)\n", - "Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit->ragged==0.1.dev0)\n", - " Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=1.10->lancedb->ragged==0.1.dev0) (0.7.0)\n", - "Requirement already satisfied: 
pydantic-core==2.20.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=1.10->lancedb->ragged==0.1.dev0) (2.20.0)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->ragged==0.1.dev0) (1.16.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets->ragged==0.1.dev0) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets->ragged==0.1.dev0) (3.7)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets->ragged==0.1.dev0) (2.0.7)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets->ragged==0.1.dev0) (2024.6.2)\n", - "Requirement already satisfied: decorator>=3.4.2 in /usr/local/lib/python3.10/dist-packages (from retry>=0.9.2->lancedb->ragged==0.1.dev0) (4.4.2)\n", - "Requirement already satisfied: py<2.0.0,>=1.4.26 in /usr/local/lib/python3.10/dist-packages (from retry>=0.9.2->lancedb->ragged==0.1.dev0) (1.11.0)\n", - "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit->ragged==0.1.dev0) (3.0.0)\n", - "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit->ragged==0.1.dev0) (2.16.1)\n", - "Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit->ragged==0.1.dev0)\n", - " Downloading smmap-5.0.1-py3-none-any.whl (24 kB)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->altair<6,>=4.0->streamlit->ragged==0.1.dev0) (2.1.5)\n", - "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in 
/usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit->ragged==0.1.dev0) (2023.12.1)\n", - "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit->ragged==0.1.dev0) (0.35.1)\n", - "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit->ragged==0.1.dev0) (0.18.1)\n", - "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich<14,>=10.14.0->streamlit->ragged==0.1.dev0) (0.1.2)\n", - "Building wheels for collected packages: ragged\n", - " Building wheel for ragged (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for ragged: filename=ragged-0.1.dev0-py3-none-any.whl size=24662 sha256=d086741b289188a92153223fdb65db69f9297a523c7874746fd1669f7d3f9c07\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-q327t6y_/wheels/aa/3f/b0/d70e6f86074491db9b0bc7431c11f0138f2ed2359151509cf7\n", - "Successfully built ragged\n", - "Installing collected packages: xxhash, watchdog, smmap, requests, pyarrow, dill, pydeck, multiprocess, gitdb, gitpython, datasets, streamlit, ragged\n", - " Attempting uninstall: requests\n", - " Found existing installation: requests 2.31.0\n", - " Uninstalling requests-2.31.0:\n", - " Successfully uninstalled requests-2.31.0\n", - " Attempting uninstall: pyarrow\n", - " Found existing installation: pyarrow 12.0.1\n", - " Uninstalling pyarrow-12.0.1:\n", - " Successfully uninstalled pyarrow-12.0.1\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 15.0.0 which is incompatible.\n", - "google-colab 1.0.0 requires requests==2.31.0, but you have requests 2.32.3 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed datasets-2.20.0 dill-0.3.8 gitdb-4.0.11 gitpython-3.1.43 multiprocess-0.70.16 pyarrow-15.0.0 pydeck-0.9.1 ragged-0.1.dev0 requests-2.32.3 smmap-5.0.1 streamlit-1.36.0 watchdog-4.0.1 xxhash-3.4.1\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## The dataset\n", - "The dataset we'll use is a synthetic QA dataset generated from LLama2 review paper. The paper was divided into chunks, with each chunk being a unique context. An LLM was prompted to ask questions relevant to the context for testing a retriever.\n", - "The exact code and other utility functions for this can be found in [this](https://github.com/lancedb/ragged) repo\n" - ], - "metadata": { - "id": "B_2S_b0c3pdp" - } - }, - { - "cell_type": "code", - "source": [ - "!wget https://raw.githubusercontent.com/AyushExel/assets/main/data_qa.csv" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4QFDh3jD3d1X", - "outputId": "642f53c8-a084-4c34-db6a-bfee35abbd28" - }, - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2024-07-09 20:37:46-- https://raw.githubusercontent.com/AyushExel/assets/main/data_qa.csv\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 680439 (664K) [text/plain]\n", - "Saving to: ‘data_qa.csv’\n", - "\n", - "data_qa.csv 100%[===================>] 664.49K --.-KB/s in 0.006s \n", - "\n", - "2024-07-09 20:37:47 (100 MB/s) - ‘data_qa.csv’ saved [680439/680439]\n", - "\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "import pandas as pd\n", - "\n", - "data = pd.read_csv(\"data_qa.csv\")" - ], - "metadata": { - "id": "AIF2zczc3kwW" - }, - "execution_count": 9, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Pre-processing\n", - "Now we need to parse the context(corpus) of the dataset as llama-index text nodes. " - ], - "metadata": { - "id": "_xV40VSy3twE" - } - }, - { - "cell_type": "code", - "source": [ - "from pathlib import Path\n", - "from llama_index.core.node_parser import SentenceSplitter\n", - "from llama_index.readers.file import PagedCSVReader\n", - "\n", - "def load_corpus(file, verbose=False):\n", - " if verbose:\n", - " print(f\"Loading files {file}...\")\n", - "\n", - " loader = PagedCSVReader(encoding=\"utf-8\")\n", - " docs = loader.load_data(file=Path(file))\n", - "\n", - " if verbose:\n", - " print(f\"Loaded {len(docs)} docs\")\n", - "\n", - " parser = SentenceSplitter()\n", - " nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)\n", - "\n", - " if verbose:\n", - " print(f\"Parsed {len(nodes)} nodes\")\n", - "\n", - " return nodes" - ], - "metadata": { - "id": "mzDZYUX4qxBC" - }, - "execution_count": 10, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import pandas as pd\n", - "\n", - "df = pd.read_csv(\"data_qa.csv\", index_col=0)" - ], - "metadata": { - "id": "eoLOdNO-4HbV" - }, - "execution_count": 11, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import os\n", - "\n", - "os.environ[\"OPENAI_API_KEY\"] = \"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\"" - ], - "metadata": { - "id": "EqsFZ5KYqzvg" - }, - "execution_count": 12, - "outputs": [] - }, - { - 
"cell_type": "markdown", - "source": [ - "Split into train and validation sets. We'll use the original df for val as that has different queries generated via a different prompt.\n" - ], - "metadata": { - "id": "zrwa35x96FLZ" - } - }, - { - "cell_type": "code", - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "# Randomly shuffle df.\n", - "#df = df.sample(frac=1, random_state=42)\n", - "\n", - "train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)\n", - "\n", - "train_df.to_csv(\"train_data_qa.csv\", index=False)\n", - "val_df.to_csv(\"val_data_qa.csv\", index=False)" - ], - "metadata": { - "id": "diHhY9Ipq9Uw" - }, - "execution_count": 13, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "train_nodes = load_corpus(\"train_data_qa.csv\", verbose=True)\n", - "val_nodes = load_corpus(\"val_data_qa.csv\", verbose=True)" - ], - "metadata": { - "id": "C7PKGtXPq_Fc", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 188, - "referenced_widgets": [ - "3c85bfeaccc84a47844c770fa1fb2511", - "7461a200b0634607ac479708e3cba537", - "156a3c94ba094fbf86e70681c69ca31a", - "4261378f06ed48cc8cef251cd2c096ab", - "59aeeeae529a440fab4c231501fce4f6", - "1308faaa9fa944b2b17e96e8cd9a9445", - "83c0d87febcc4dfaa95b4d3e2005a416", - "f503b1be4e2e42c8bf4460eea2f1bb07", - "8228bf5a569844d584003446649731a6", - "2255fb2d83734ef88843ffe47116da84", - "9f30b1969bf24b86b8deaa41ea7231f6", - "f55c2f3c448741819e618b44bc0b1976", - "b0bad294bb6443388b77f854c4f77569", - "812696b2a65c4ca281da45f286ab95cf", - "9a026ebe3c8b416e9c5c4d7dd05bba66", - "af4dfd45973d466cb5c78d002c723cd6", - "211ee4b118154b0a94cbc686fdf90c55", - "3929b1c14657468792c74c6610598af5", - "222f778312d745aebc6f1d33c651dca8", - "d65d92433f304e389e0ce8aa7baf7155", - "a81c89c5c5a64ad0938d8d1e9789838c", - "95a2e33e8d244d24813b386e60301a2a" - ] - }, - "outputId": "bcb428bd-5d02-444c-e456-22260402faa8" - }, - "execution_count": 14, - "outputs": [ - { - 
"output_type": "stream", - "name": "stdout", - "text": [ - "Loading files train_data_qa.csv...\n", - "Loaded 176 docs\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "Parsing nodes: 0%| | 0/176 [00:00\"Open\n", - "\n", - "The code below is an example of hybrid search, a search algorithm that combines FTS and vector search in LanceDB.\n", - "\n", - "Let's get stared with an example. In this notebook we'll use Airbnb financial data documents to search for \"the specific reasons for higher operating costs\" in a particular year." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "819fa612", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "819fa612", - "outputId": "f5593c76-573f-4a04-d0ce-aac7b7ee1466" - }, - "outputs": [], - "source": [ - "# Setup\n", - "!pip install lancedb pandas langchain langchain_openai langchain-community pypdf openai cohere tiktoken sentence_transformers tantivy==0.20.1" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "b6864d97-7f85-4d9c-bf05-e9cf9db29e81", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "b6864d97-7f85-4d9c-bf05-e9cf9db29e81", - "outputId": "6c6dd78d-3213-4bd8-9e74-5faba902e546" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "··········\n" - ] - } - ], - "source": [ - "import os\n", - "import getpass\n", - "\n", - "# Set your OpenAI API key\n", - "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "cfce9804-cd1c-48c3-acd2-e74eb4e290c7", - "metadata": { - "id": "cfce9804-cd1c-48c3-acd2-e74eb4e290c7" - }, - "outputs": [], - "source": [ - "def pretty_print(docs):\n", - " for doc in docs:\n", - " print(doc + \"\\n\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "efb22cec-5a06-46ac-91c3-53f9b9090109", - "metadata": { - "id": 
"efb22cec-5a06-46ac-91c3-53f9b9090109" - }, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import PyPDFLoader\n", - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", - "\n", - "# Load $ABNB's financial report. This may take 1-2 minutes since the PDF is large\n", - "sec_filing_pdf = \"https://d18rn0p25nwr6d.cloudfront.net/CIK-0001559720/8a9ebed0-815a-469a-87eb-1767d21d8cec.pdf\"\n", - "\n", - "# Create your PDF loader\n", - "loader = PyPDFLoader(sec_filing_pdf)\n", - "\n", - "# Load the PDF document\n", - "documents = loader.load()\n", - "\n", - "# Chunk the financial report\n", - "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)\n", - "docs = text_splitter.split_documents(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3c5ce69-0f75-44cb-9e49-9be665fc156e", - "metadata": { - "id": "d3c5ce69-0f75-44cb-9e49-9be665fc156e" - }, - "outputs": [], - "source": [ - "from langchain_community.vectorstores import LanceDB\n", - "from langchain_openai import OpenAIEmbeddings\n", - "import lancedb\n", - "\n", - "\n", - "embedding_function = OpenAIEmbeddings()\n", - "\n", - "db = lancedb.connect(\"~/langchain\")\n", - "\n", - "# Load the document into LanceDB\n", - "db = LanceDB.from_documents(docs, embedding_function, connection=db)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4bd12fb8", - "metadata": {}, - "outputs": [], - "source": [ - "table = db._table\n", - "table.create_fts_index(\"text\")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "d959a80f-d568-48f4-9d14-7367bcc1ce8d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 293 - }, - "id": "d959a80f-d568-48f4-9d14-7367bcc1ce8d", - "outputId": "b02f837d-7fa6-4ec2-b283-e170f5f67637" - }, - "outputs": [ - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"table\",\n \"rows\": 
5,\n \"fields\": [\n {\n \"column\": \"vector\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"ddcfa6b1-3de8-4933-a187-6aa7b7ae87b4\",\n \"47f5dd55-b3e7-4879-afba-5ca9eea7341b\",\n \"c391b1e1-6f66-41f2-82ff-18db5a218303\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Class A common stock, par value $0.0001 per share ABNB The Nasdaq Stock Market\\nSecurities registered pursuant to Section 12(g) of the Act:\\nNone______________\\nIndicate by check mark if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act. Yes \\u2612 No \\u2610 \\nIndicate by check mark if the registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Exchange Act. Yes \\u2610 No \\u2612 \\nIndicate by check mark whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12\\nmonths (or for such shorter period that the registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days. 
Yes \\u2612 No \\u2610 \\nIndicate by check mark whether the registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (\\u00a7 232.405 of\",\n \"As of June 30, 2022, the aggregate market value of the Class A common stock held by non-affiliates of the registrant was approximately $35.1 billion based upon the closing price\\nreported for such date on the NASDAQ Global Select Market.\\nAs of February 3, 2023, 408,928,427 shares of the registrant's Class A common stock were outstanding 222,400,067 shares of the registrant's Class B common stock were\\noutstanding, no shares of the registrant\\u2019s Class C common stock were outstanding, and 9,200,000 shares of the registrant\\u2019s Class H common stock were outstanding.\\n______________\\nDOCUMENTS INCORPORATED BY REFERENCE\\nThe information required by Part III of this Report, to the extent not set forth herein, is incorporated herein by reference from the registrant\\u2019s definitive proxy statement relating to the\",\n \"this chapter) during the preceding 12 months (or for such shorter period that the registrant was required to submit such files). 
Yes \\u2612 No \\u2610 \\nIndicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company, or an emerging growth company.\\nSee the definitions of \\u201clarge accelerated filer,\\u201d \\u201caccelerated filer,\\u201d \\u201csmaller reporting company\\u201d and \\u201cemerging growth company\\u201d in Rule 12b-2 of the Exchange Act.\\nLarge accelerated filer \\u2612 Accelerated filer\\u2610 \\nNon-accelerated filer \\u2610 Smaller reporting company\\u2610 \\nEmerging growth company\\u2610 \\nIf an emerging growth company, indicate by check mark if the registrant has elected not to use the extended transition period for complying with any new or revised financial\\naccounting standards provided pursuant to Section 13(a) of the Exchange Act. \\u2610\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"metadata\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vectoridtextmetadata
0[-0.0016961554, -0.03531899, 0.011809787, -0.0...5c66d086-0fed-4270-a91b-c2b67b3ed052Table of Contents\\nUNITED STATES\\nSECURITIES A...{'page': 0, 'source': 'https://d18rn0p25nwr6d....
1[-0.021446472, -0.021045355, 0.010823516, -0.0...ddcfa6b1-3de8-4933-a187-6aa7b7ae87b4Class A common stock, par value $0.0001 per sh...{'page': 0, 'source': 'https://d18rn0p25nwr6d....
2[-0.020018686, -0.014233166, -0.010991167, -0....c391b1e1-6f66-41f2-82ff-18db5a218303this chapter) during the preceding 12 months (...{'page': 0, 'source': 'https://d18rn0p25nwr6d....
3[-0.019061018, -0.0022632438, -0.011158161, -0...3e896a62-8631-4a54-86bd-ee2f69f3b373Indicate by check mark whether the registrant ...{'page': 0, 'source': 'https://d18rn0p25nwr6d....
4[-0.015733723, -0.012287037, -0.004055117, -0....47f5dd55-b3e7-4879-afba-5ca9eea7341bAs of June 30, 2022, the aggregate market valu...{'page': 1, 'source': 'https://d18rn0p25nwr6d....
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " vector \\\n", - "0 [-0.0016961554, -0.03531899, 0.011809787, -0.0... \n", - "1 [-0.021446472, -0.021045355, 0.010823516, -0.0... \n", - "2 [-0.020018686, -0.014233166, -0.010991167, -0.... \n", - "3 [-0.019061018, -0.0022632438, -0.011158161, -0... \n", - "4 [-0.015733723, -0.012287037, -0.004055117, -0.... \n", - "\n", - " id \\\n", - "0 5c66d086-0fed-4270-a91b-c2b67b3ed052 \n", - "1 ddcfa6b1-3de8-4933-a187-6aa7b7ae87b4 \n", - "2 c391b1e1-6f66-41f2-82ff-18db5a218303 \n", - "3 3e896a62-8631-4a54-86bd-ee2f69f3b373 \n", - "4 47f5dd55-b3e7-4879-afba-5ca9eea7341b \n", - "\n", - " text \\\n", - "0 Table of Contents\\nUNITED STATES\\nSECURITIES A... \n", - "1 Class A common stock, par value $0.0001 per sh... \n", - "2 this chapter) during the preceding 12 months (... \n", - "3 Indicate by check mark whether the registrant ... \n", - "4 As of June 30, 2022, the aggregate market valu... \n", - "\n", - " metadata \n", - "0 {'page': 0, 'source': 'https://d18rn0p25nwr6d.... \n", - "1 {'page': 0, 'source': 'https://d18rn0p25nwr6d.... \n", - "2 {'page': 0, 'source': 'https://d18rn0p25nwr6d.... \n", - "3 {'page': 0, 'source': 'https://d18rn0p25nwr6d.... \n", - "4 {'page': 1, 'source': 'https://d18rn0p25nwr6d.... " - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.to_pandas().head()" - ] - }, - { - "cell_type": "markdown", - "id": "667f4e4a-6ff1-4f1c-ad57-4a2a8b036670", - "metadata": { - "id": "667f4e4a-6ff1-4f1c-ad57-4a2a8b036670" - }, - "source": [ - "## Vector Search\n", - "\n", - "Average latency: `3.48 ms ± 71.6 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)`" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "8a5ab2de-6d75-4785-b838-ed6a825dfa6e", - "metadata": { - "id": "8a5ab2de-6d75-4785-b838-ed6a825dfa6e" - }, - "outputs": [], - "source": [ - "str_query = \"What are the specific factors contributing to Airbnb's increased operational expenses in the last fiscal year?\"\n", - "query = embedding_function.embed_query(str_query)\n", - "docs = table.search(query, query_type=\"vector\").limit(5).to_pandas()[\"text\"].to_list()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "5423d333-0f6d-4951-ab3f-6941ad30ba8a", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5423d333-0f6d-4951-ab3f-6941ad30ba8a", - "outputId": "79557d98-85d1-4a18-db42-10d7adffb7c2" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "In addition, the number of listings on Airbnb may decline as a result of a number of other factors affecting Hosts, including: the COVID-19 pandemic; enforcement or threatenedenforcement of laws and regulations, including short-term occupancy and tax laws; private groups, such as homeowners, landlords, and condominium and neighborhood\n", - "associations, adopting and enforcing contracts that prohibit or restrict home sharing; leases, mortgages, and other agreements, or regulations that purport to ban or otherwise restrict\n", - "home sharing; Hosts opting for long-term rentals on other third-party platforms as an alternative to listing on our platform; economic, social, and political factors; perceptions of trust\n", - "and safety on and off our platform; negative experiences with guests, including guests who damage Host property, throw unauthorized parties, or engage in violent and unlawful\n", - "\n", - "\n", - "Made Possible by Hosts, Strangers, AirCover, Categories, and OMG marketing campaigns and launches, a $67.9 million increase in our search engine marketing and advertising\n", - 
"spend, a $25.1 million increase in payroll-related expenses due to growth in headcount and increase in compensation costs, a $22.0 million increase in third-party service provider\n", - "expenses, and a $11.1 million increase in coupon expense in line with increase in revenue and launch of AirCover for guests, partially offset by a decrease of $22.9 million related to\n", - "the changes in the fair value of contingent consideration related to a 2019 acquisition.\n", - "General and Administrative\n", - "2021 2022 % Change\n", - "(in millions, except percentages)\n", - "General and administrative $ 836 $ 950 14 %\n", - "Percentage of revenue 14 % 11 %\n", - "General and administrative expense increased $114.0 million, or 14%, in 2022 compared to 2021, primarily due to an increase in other business and operational taxes of $41.3\n", - "\n", - "\n", - "Our success depends significantly on existing guests continuing to book and attracting new guests to book on our platform. Our ability to attract and retain guests could be materially\n", - "adversely affected by a number of factors discussed elsewhere in these “Risk Factors,” including:\n", - "• events beyond our control such as the ongoing COVID-19 pandemic, other pandemics and health concerns, restrictions on travel, immigration, trade disputes, economic\n", - "downturns, and the impact of climate change on travel including the availability of preferred destinations and the increase in the frequency and severity of weather-relatedevents, including fires, floods, droughts, extreme temperatures and ambient temperature increases, severe weather and other natural disasters, and the impact of other\n", - "climate change on seasonal destinations;\n", - "• political, social, or economic instability;\n", - "\n", - "\n", - "• Hosts failing to meet guests’ expectations, including increased expectations for cleanliness in light of the COVID-19 pandemic;• increased competition and use of our competitors’ platforms and 
services;\n", - "• Hosts failing to provide differentiated, high-quality, and an adequate supply of stays or experiences at competitive prices;\n", - "• guests not receiving timely and adequate community support from us;\n", - "• our failure to provide new or enhanced offerings, tiers, or features that guests value;\n", - "• declines or inefficiencies in our marketing efforts;• negative associations with, or reduced awareness of, our brand;\n", - "• actual or perceived discrimination by Hosts in deciding whether to accept a requested reservation;\n", - "• negative perceptions of the trust and safety on our platform; and\n", - "• macroeconomic and other conditions outside of our control affecting travel and hospitality industries generally.\n", - "\n", - "\n", - "Table of Contents\n", - "Airbnb, Inc.\n", - "Consolidated Statements of Operations\n", - "(in millions, except per share amounts)\n", - "Year Ended December 31,\n", - "2020 2021 2022\n", - "Revenue $ 3,378 $ 5,992 $ 8,399 \n", - "Costs and expenses:\n", - "Cost of revenue 876 1,156 1,499 \n", - "Operations and support 878 847 1,041 \n", - "Product development 2,753 1,425 1,502 \n", - "Sales and marketing 1,175 1,186 1,516 \n", - "General and administrative 1,135 836 950 \n", - "Restructuring charges 151 113 89 \n", - "Total costs and expenses 6,968 5,563 6,597 \n", - "Income (loss) from operations (3,590) 429 1,802 \n", - "Interest income 27 13 186 \n", - "Interest expense (172) (438) (24)\n", - "Other income (expense), net (947) (304) 25 \n", - "Income (loss) before income taxes (4,682) (300) 1,989 \n", - "Provision for (benefit from) income taxes (97) 52 96 \n", - "Net income (loss) $ (4,585)$ (352)$ 1,893 \n", - "Net income (loss) per share attributable to Class A and Class B common stockholders:\n", - "Basic $ (16.12)$ (0.57)$ 2.97 \n", - "Diluted $ (16.12)$ (0.57)$ 2.79\n", - "\n", - "\n" - ] - } - ], - "source": [ - "pretty_print(docs)" - ] - }, - { - "cell_type": "markdown", - "id": 
"8b0150fe-00dc-4aa0-9c8f-33cbf2ed5ac6", - "metadata": { - "id": "8b0150fe-00dc-4aa0-9c8f-33cbf2ed5ac6" - }, - "source": [ - "## Hybrid Search\n", - "LanceDB support hybrid search with custom Rerankers. Here's the summary of latency numbers of some of the Reranking methods available\n", - "![1_yWDh0Klw8Upsw1V54kkkdQ](https://github.com/AyushExel/assets/assets/15766192/a515fbf7-0553-437e-899e-67691eae3fef)\n", - "\n", - "Let us now perform hybrid search by combining vector and FTS search results. First, we'll cover the default Reranker.\n", - "\n", - "### Linear Combination Reranker\n", - "`LinearCombinationReranker(weight=0.7)` is used as the default reranker for reranking the hybrid search results if the reranker isn't specified explicitly.\n", - "The `weight` param controls the weightage provided to vector search score. The weight of `1-weight` is applied to FTS scores when reranking.\n", - "\n", - "Latency: `71 ms ± 25.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)`" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "d2aa5893-30c4-4beb-9dae-a55665bd82c7", - "metadata": { - "id": "d2aa5893-30c4-4beb-9dae-a55665bd82c7" - }, - "outputs": [], - "source": [ - "docs = table.search(query_type=\"hybrid\").vector(query).text(str_query).limit(5).to_pandas()[\"text\"].to_list()" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "8d6a99c3-92ef-4677-96bb-9b54a11a79fe", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8d6a99c3-92ef-4677-96bb-9b54a11a79fe", - "outputId": "72f74e97-efb9-4bf2-c612-3dbbf33312bc" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "In addition, the number of listings on Airbnb may decline as a result of a number of other factors affecting Hosts, including: the COVID-19 pandemic; enforcement or threatenedenforcement of laws and regulations, including short-term occupancy and tax laws; private groups, such as homeowners, landlords, and 
condominium and neighborhood\n", - "associations, adopting and enforcing contracts that prohibit or restrict home sharing; leases, mortgages, and other agreements, or regulations that purport to ban or otherwise restrict\n", - "home sharing; Hosts opting for long-term rentals on other third-party platforms as an alternative to listing on our platform; economic, social, and political factors; perceptions of trust\n", - "and safety on and off our platform; negative experiences with guests, including guests who damage Host property, throw unauthorized parties, or engage in violent and unlawful\n", - "\n", - "\n", - "“Initial Delivery Date”); provided that the Pricing Certificate for any fiscal year may be delivered on any date following the Initial DeliveryDate that is prior to the date that is 365 days following the last day of the preceding fiscal year, so long as such Pricing Certificate includes acertification that delivery of such Pricing Certificate on or before the Initial Delivery Date was not possible because (i) the informationrequired to calculate the KPI Metrics for such preceding fiscal year was not available at such time or (ii) the report of the KPI Metrics Auditor,if relevant, was not available at such time (the date of the Administrative Agent’s receipt thereof, each a “Pricing Certificate Date”). 
Upondelivery of a Pricing Certificate in respect of a fiscal year, (i) the Applicable Rate for the Loans incurred by the Borrower shall be increased ordecreased (or neither increased nor decreased), as applicable, pursuant to the Sustainability Margin Adjustment as set forth in the KPI MetricsCertificate\n", - "\n", - "\n", - "Made Possible by Hosts, Strangers, AirCover, Categories, and OMG marketing campaigns and launches, a $67.9 million increase in our search engine marketing and advertising\n", - "spend, a $25.1 million increase in payroll-related expenses due to growth in headcount and increase in compensation costs, a $22.0 million increase in third-party service provider\n", - "expenses, and a $11.1 million increase in coupon expense in line with increase in revenue and launch of AirCover for guests, partially offset by a decrease of $22.9 million related to\n", - "the changes in the fair value of contingent consideration related to a 2019 acquisition.\n", - "General and Administrative\n", - "2021 2022 % Change\n", - "(in millions, except percentages)\n", - "General and administrative $ 836 $ 950 14 %\n", - "Percentage of revenue 14 % 11 %\n", - "General and administrative expense increased $114.0 million, or 14%, in 2022 compared to 2021, primarily due to an increase in other business and operational taxes of $41.3\n", - "\n", - "\n", - "(c) If, for any fiscal year, either (i) no Pricing Certificate shall have been delivered for such fiscal year or (ii) the PricingCertificate delivered for such fiscal year shall fail to include the Diverse Supplier Spend Percentage or GHG Emissions Intensity for suchfiscal year, then the Sustainability Margin Adjustment will be positive 0.050% and/or the Sustainability Fee Adjustment will be positive0.010%, as applicable, in each case commencing on the last day such Pricing Certificate could have been delivered in accordance with theterms of clause (a) above (it being understood that, in the case of the foregoing clause 
(ii), the Sustainability Margin Adjustment or theSustainability Fee Adjustment will be determined in accordance with such Pricing Certificate to the extent the (A) Sustainability MarginAdjustment or the Sustainability Fee Adjustment is included in such Pricing Certificate and (B) the Administrative Agent has separatelyreceived the Diverse Supplier Spend Percentage and/or GHG Emissions\n", - "\n", - "\n", - "Our success depends significantly on existing guests continuing to book and attracting new guests to book on our platform. Our ability to attract and retain guests could be materially\n", - "adversely affected by a number of factors discussed elsewhere in these “Risk Factors,” including:\n", - "• events beyond our control such as the ongoing COVID-19 pandemic, other pandemics and health concerns, restrictions on travel, immigration, trade disputes, economic\n", - "downturns, and the impact of climate change on travel including the availability of preferred destinations and the increase in the frequency and severity of weather-relatedevents, including fires, floods, droughts, extreme temperatures and ambient temperature increases, severe weather and other natural disasters, and the impact of other\n", - "climate change on seasonal destinations;\n", - "• political, social, or economic instability;\n", - "\n", - "\n" - ] - } - ], - "source": [ - "pretty_print(docs)" - ] - }, - { - "cell_type": "markdown", - "id": "c4d3e0f3-8d96-47f5-ad1d-514475f1ae55", - "metadata": { - "id": "c4d3e0f3-8d96-47f5-ad1d-514475f1ae55" - }, - "source": [ - "### Cohere Reranker\n", - "This uses Cohere's Reranking API to re-rank the results. It accepts the reranking model name as a parameter. By default it uses the english-v3 model but you can easily switch to a multi-lingual model.\n", - "\n", - "Latency: `605 ms ± 78.1 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)`" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "ce2c43c7-1a96-4856-ad9b-28385164f187", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ce2c43c7-1a96-4856-ad9b-28385164f187", - "outputId": "5316fdc4-8930-45aa-af1d-6f1faef4e97e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "··········\n" - ] - } - ], - "source": [ - "# Free API key\n", - "os.environ[\"COHERE_API_KEY\"] = getpass.getpass()" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "4adbb3f1-4d21-427b-9bf0-3d7bebf68cf6", - "metadata": { - "id": "4adbb3f1-4d21-427b-9bf0-3d7bebf68cf6" - }, - "outputs": [], - "source": [ - "from lancedb.rerankers import CohereReranker\n", - "\n", - "reranker = CohereReranker()\n", - "docs = table.search(query_type=\"hybrid\").vector(query).text(str_query).limit(5).rerank(reranker).to_pandas()[\"text\"].to_list()" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "a071b3e7-3b8b-42e4-a089-4d6c4094873f", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "a071b3e7-3b8b-42e4-a089-4d6c4094873f", - "outputId": "2d9066f3-8290-431d-ae08-0d17dad805f7" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Increased operating expenses, decreased revenue, negative publicity, negative reaction from our Hosts and guests and other stakeholders, or other adverse impacts from any of the\n", - "above factors or other risks related to our international operations could materially adversely affect our brand, reputation, business, results of operations, and financial condition.\n", - "In addition, we will continue to incur significant expenses to operate our outbound business in China, and we may never achieve profitability in that market. 
These factors, combined\n", - "with sentiment of the workforce in China, and China’s policy towards foreign direct investment may particularly impact our operations in China. In addition, we need to ensure that\n", - "our business practices in China are compliant with local laws and regulations, which may be interpreted and enforced in ways that are different from our interpretation, and/or create\n", - "\n", - "\n", - "Made Possible by Hosts, Strangers, AirCover, Categories, and OMG marketing campaigns and launches, a $67.9 million increase in our search engine marketing and advertising\n", - "spend, a $25.1 million increase in payroll-related expenses due to growth in headcount and increase in compensation costs, a $22.0 million increase in third-party service provider\n", - "expenses, and a $11.1 million increase in coupon expense in line with increase in revenue and launch of AirCover for guests, partially offset by a decrease of $22.9 million related to\n", - "the changes in the fair value of contingent consideration related to a 2019 acquisition.\n", - "General and Administrative\n", - "2021 2022 % Change\n", - "(in millions, except percentages)\n", - "General and administrative $ 836 $ 950 14 %\n", - "Percentage of revenue 14 % 11 %\n", - "General and administrative expense increased $114.0 million, or 14%, in 2022 compared to 2021, primarily due to an increase in other business and operational taxes of $41.3\n", - "\n", - "\n", - "• Hosts failing to meet guests’ expectations, including increased expectations for cleanliness in light of the COVID-19 pandemic;• increased competition and use of our competitors’ platforms and services;\n", - "• Hosts failing to provide differentiated, high-quality, and an adequate supply of stays or experiences at competitive prices;\n", - "• guests not receiving timely and adequate community support from us;\n", - "• our failure to provide new or enhanced offerings, tiers, or features that guests value;\n", - "• declines or 
inefficiencies in our marketing efforts;• negative associations with, or reduced awareness of, our brand;\n", - "• actual or perceived discrimination by Hosts in deciding whether to accept a requested reservation;\n", - "• negative perceptions of the trust and safety on our platform; and\n", - "• macroeconomic and other conditions outside of our control affecting travel and hospitality industries generally.\n", - "\n", - "\n", - "Table of Contents\n", - "Airbnb, Inc.\n", - "Consolidated Statements of Operations\n", - "(in millions, except per share amounts)\n", - "Year Ended December 31,\n", - "2020 2021 2022\n", - "Revenue $ 3,378 $ 5,992 $ 8,399 \n", - "Costs and expenses:\n", - "Cost of revenue 876 1,156 1,499 \n", - "Operations and support 878 847 1,041 \n", - "Product development 2,753 1,425 1,502 \n", - "Sales and marketing 1,175 1,186 1,516 \n", - "General and administrative 1,135 836 950 \n", - "Restructuring charges 151 113 89 \n", - "Total costs and expenses 6,968 5,563 6,597 \n", - "Income (loss) from operations (3,590) 429 1,802 \n", - "Interest income 27 13 186 \n", - "Interest expense (172) (438) (24)\n", - "Other income (expense), net (947) (304) 25 \n", - "Income (loss) before income taxes (4,682) (300) 1,989 \n", - "Provision for (benefit from) income taxes (97) 52 96 \n", - "Net income (loss) $ (4,585)$ (352)$ 1,893 \n", - "Net income (loss) per share attributable to Class A and Class B common stockholders:\n", - "Basic $ (16.12)$ (0.57)$ 2.97 \n", - "Diluted $ (16.12)$ (0.57)$ 2.79\n", - "\n", - "\n", - "Our success depends significantly on existing guests continuing to book and attracting new guests to book on our platform. 
Our ability to attract and retain guests could be materially\n", - "adversely affected by a number of factors discussed elsewhere in these “Risk Factors,” including:\n", - "• events beyond our control such as the ongoing COVID-19 pandemic, other pandemics and health concerns, restrictions on travel, immigration, trade disputes, economic\n", - "downturns, and the impact of climate change on travel including the availability of preferred destinations and the increase in the frequency and severity of weather-relatedevents, including fires, floods, droughts, extreme temperatures and ambient temperature increases, severe weather and other natural disasters, and the impact of other\n", - "climate change on seasonal destinations;\n", - "• political, social, or economic instability;\n", - "\n", - "\n" - ] - } - ], - "source": [ - "pretty_print(docs)" - ] - }, - { - "cell_type": "markdown", - "id": "6630f0c0-6070-4ea7-a191-99092e69ca05", - "metadata": { - "id": "6630f0c0-6070-4ea7-a191-99092e69ca05" - }, - "source": [ - "Relevance score is returned by Cohere API and is independent of individual FTS and vector search scores." 
- ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "80dc61bb-929c-4fbb-b2cb-20c5d31bc65c", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 293 - }, - "id": "80dc61bb-929c-4fbb-b2cb-20c5d31bc65c", - "outputId": "d09dab34-7756-4c58-8731-5683b3ca7044" - }, - "outputs": [ - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"table\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"vector\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"a91b3506-39a2-4b19-8409-08333d83a1c6\",\n \"1694d5a5-7ece-40b8-8022-dc3fa9aaa05a\",\n \"fcc532b9-347b-4e36-8ae8-5a2a726bf574\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Made Possible by Hosts, Strangers, AirCover, Categories, and OMG marketing campaigns and launches, a $67.9 million increase in our search engine marketing and advertising\\nspend, a $25.1 million increase in payroll-related expenses due to growth in headcount and increase in compensation costs, a $22.0 million increase in third-party service provider\\nexpenses, and a $11.1 million increase in coupon expense in line with increase in revenue and launch of AirCover for guests, partially offset by a decrease of $22.9 million related to\\nthe changes in the fair value of contingent consideration related to a 2019 acquisition.\\nGeneral and Administrative\\n2021 2022 % Change\\n(in millions, except percentages)\\nGeneral and administrative $ 836 $ 950 14 %\\nPercentage of revenue 14 % 11 %\\nGeneral and administrative expense increased $114.0 million, or 14%, in 2022 compared to 2021, primarily due to an increase in other business and operational taxes of 
$41.3\",\n \"Our success depends significantly on existing guests continuing to book and attracting new guests to book on our platform. Our ability to attract and retain guests could be materially\\nadversely affected by a number of factors discussed elsewhere in these \\u201cRisk Factors,\\u201d including:\\n\\u2022 events beyond our control such as the ongoing COVID-19 pandemic, other pandemics and health concerns, restrictions on travel, immigration, trade disputes, economic\\ndownturns, and the impact of climate change on travel including the availability of preferred destinations and the increase in the frequency and severity of weather-relatedevents, including fires, floods, droughts, extreme temperatures and ambient temperature increases, severe weather and other natural disasters, and the impact of other\\nclimate change on seasonal destinations;\\n\\u2022 political, social, or economic instability;\",\n \"\\u2022 Hosts failing to meet guests\\u2019 expectations, including increased expectations for cleanliness in light of the COVID-19 pandemic;\\u2022 increased competition and use of our competitors\\u2019 platforms and services;\\n\\u2022 Hosts failing to provide differentiated, high-quality, and an adequate supply of stays or experiences at competitive prices;\\n\\u2022 guests not receiving timely and adequate community support from us;\\n\\u2022 our failure to provide new or enhanced offerings, tiers, or features that guests value;\\n\\u2022 declines or inefficiencies in our marketing efforts;\\u2022 negative associations with, or reduced awareness of, our brand;\\n\\u2022 actual or perceived discrimination by Hosts in deciding whether to accept a requested reservation;\\n\\u2022 negative perceptions of the trust and safety on our platform; and\\n\\u2022 macroeconomic and other conditions outside of our control affecting travel and hospitality industries generally.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 
\"metadata\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"_relevance_score\",\n \"properties\": {\n \"dtype\": \"float32\",\n \"num_unique_values\": 5,\n \"samples\": [\n 0.9790357351303101,\n 0.5007786750793457,\n 0.961605966091156\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vectoridtextmetadata_relevance_score
0[0.0034929817, -0.024774546, 0.012623285, -0.0...18d4a926-99d9-447f-8b57-264d7a148bd7Increased operating expenses, decreased revenu...{'page': 18, 'source': 'https://d18rn0p25nwr6d...0.985328
1[-0.0042489874, -0.005382498, 0.007190078, -0....a91b3506-39a2-4b19-8409-08333d83a1c6Made Possible by Hosts, Strangers, AirCover, C...{'page': 62, 'source': 'https://d18rn0p25nwr6d...0.979036
2[0.0076079983, -0.013340506, 0.018701892, -0.0...fcc532b9-347b-4e36-8ae8-5a2a726bf574• Hosts failing to meet guests’ expectations, ...{'page': 11, 'source': 'https://d18rn0p25nwr6d...0.961606
3[-0.008694107, -0.01993283, 0.014201017, -0.02...72b844e2-cc93-4495-bb67-c2c1a1fd6532Table of Contents\\nAirbnb, Inc.\\nConsolidated ...{'page': 72, 'source': 'https://d18rn0p25nwr6d...0.696578
4[0.005813433, -0.028278675, 0.018041687, -0.02...1694d5a5-7ece-40b8-8022-dc3fa9aaa05aOur success depends significantly on existing ...{'page': 11, 'source': 'https://d18rn0p25nwr6d...0.500779
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " vector \\\n", - "0 [0.0034929817, -0.024774546, 0.012623285, -0.0... \n", - "1 [-0.0042489874, -0.005382498, 0.007190078, -0.... \n", - "2 [0.0076079983, -0.013340506, 0.018701892, -0.0... \n", - "3 [-0.008694107, -0.01993283, 0.014201017, -0.02... \n", - "4 [0.005813433, -0.028278675, 0.018041687, -0.02... \n", - "\n", - " id \\\n", - "0 18d4a926-99d9-447f-8b57-264d7a148bd7 \n", - "1 a91b3506-39a2-4b19-8409-08333d83a1c6 \n", - "2 fcc532b9-347b-4e36-8ae8-5a2a726bf574 \n", - "3 72b844e2-cc93-4495-bb67-c2c1a1fd6532 \n", - "4 1694d5a5-7ece-40b8-8022-dc3fa9aaa05a \n", - "\n", - " text \\\n", - "0 Increased operating expenses, decreased revenu... \n", - "1 Made Possible by Hosts, Strangers, AirCover, C... \n", - "2 • Hosts failing to meet guests’ expectations, ... \n", - "3 Table of Contents\\nAirbnb, Inc.\\nConsolidated ... \n", - "4 Our success depends significantly on existing ... \n", - "\n", - " metadata _relevance_score \n", - "0 {'page': 18, 'source': 'https://d18rn0p25nwr6d... 0.985328 \n", - "1 {'page': 62, 'source': 'https://d18rn0p25nwr6d... 0.979036 \n", - "2 {'page': 11, 'source': 'https://d18rn0p25nwr6d... 0.961606 \n", - "3 {'page': 72, 'source': 'https://d18rn0p25nwr6d... 0.696578 \n", - "4 {'page': 11, 'source': 'https://d18rn0p25nwr6d... 0.500779 " - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.search(query_type=\"hybrid\").vector(query).text(str_query).limit(5).rerank(reranker).to_pandas()" - ] - }, - { - "cell_type": "markdown", - "id": "41147a46-7ef8-4266-9cec-08a992697de2", - "metadata": { - "id": "41147a46-7ef8-4266-9cec-08a992697de2" - }, - "source": [ - "### ColBERT Reranker\n", - "Colbert Reranker is powered by ColBERT model. It runs locally using the huggingface implementation.\n", - "\n", - "Latency - `950 ms ± 5.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)`\n", - "\n", - "Note: First query might be slow. 
It is recommended to reuse the `Reranker` objects as the models are cached. Subsequent runs will be faster on reusing the same reranker object" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "zsV14JRXB0Xs", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "zsV14JRXB0Xs", - "outputId": "4900098d-0dd9-4a5b-9bec-14bad0a81f23" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting rerankers\n", - " Downloading rerankers-0.6.0-py3-none-any.whl.metadata (28 kB)\n", - "Requirement already satisfied: pydantic in /usr/local/lib/python3.10/dist-packages (from rerankers) (2.9.2)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from rerankers) (4.66.6)\n", - "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic->rerankers) (0.7.0)\n", - "Requirement already satisfied: pydantic-core==2.23.4 in /usr/local/lib/python3.10/dist-packages (from pydantic->rerankers) (2.23.4)\n", - "Requirement already satisfied: typing-extensions>=4.6.1 in /usr/local/lib/python3.10/dist-packages (from pydantic->rerankers) (4.12.2)\n", - "Downloading rerankers-0.6.0-py3-none-any.whl (41 kB)\n", - "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/41.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.1/41.1 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: rerankers\n", - "Successfully installed rerankers-0.6.0\n" - ] - } - ], - "source": [ - "!pip install rerankers" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "91b06b43-c971-4177-b62f-f941bbbc2ef4", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 334, - "referenced_widgets": [ - 
"d7770b21d69c4c45b6779a0a79a8d9c2", - "054a0b2d45914048a8ec508d9803c56c", - "add7e408b1734ea9a1d046ce4c39ef9e", - "47d9fc2c4dc44cc8a4d0f4a75f942c8a", - "a503998424394f34ba6f7c112f1c6efa", - "55c408272dd2489a99c831ff20ec9340", - "045197c90d5a4ccf8c0221a4020235da", - "3de12712877c483e8bb984d6eb004847", - "b26c3c7727204c22bd53e0111d221b86", - "fa2e6681eb8a41f1ba4cb22d46e1b7bc", - "c1b1423adfe140ec992dcd1a4ad36b1b", - "55a4200d05054bb295dc66d24acb10e8", - "3b396c309a054c1b8d16dc3ee3a97483", - "56ca1e56bc534e0db28f7e353e58b994", - "75a5a250d8dd408295e2897c22486091", - "fb8ffbf0ab8d4f43a09ee3a11ccdb950", - "1951dc337f584cafac599de1b73288ad", - "9ecf864eed024658bd5201d04d462564", - "7a532995568348b7b608844198848c9f", - "44b7024056de482a858b87ea8c2188f1", - "9acf476e0a7c4df59abd7ef2972775bf", - "c18d77fb1b8541eb937bcf4308992645", - "880662def20c41d1a419774b19ca259d", - "fc696ae381a54f6780f676981716e0f9", - "61f71d9f955c460283524a992d0492e1", - "7b5c08fc36f24bda9c44a89595bce448", - "d4c9ee37d58440b8a69207c90df95736", - "5f2fdccfc6f0445dbd08d92b389d96fc", - "7b5540ac6b4c4fd783e37765f4c51d77", - "5a4c3f55d98d4a3e97bc3dfef3a55dcf", - "44ba4ebbc71d4226a22ec4fc23bc8768", - "24d18127b4404cddb12b4a611bd20be8", - "ceb13b8e32774db183607d1424b9474f", - "b1b529b66911404792162c32b26245a1", - "63473a0152974d22aaa9e9396ef3e728", - "5defb1cd326f4b709d465e2b48a9edef", - "cb2942f98e354d788ae32d01fc59f5d5", - "b4a3413fddb84b62878a66241dbd4efb", - "6fb0e36f0283419f85a2b842b5a2b921", - "b5d6eb4b788946a8bf02c7ddb868eb32", - "7f138f0139294632bb50d446c164c967", - "cde7e23ade984cae9271d5db2a34a197", - "0851ea7f08724be4aa539fc91d66c642", - "a3d614e482994e28bad9644a6787b8db", - "d32c13984f204801860c6ac4563061bb", - "b49348ffe6ad4c5988dbc951f0bd22fd", - "3176ffcda7554b989b0cd23ad340bd92", - "6879ceca561f49e487cf05a3e7920f60", - "df82d77a80c14b9382c8674447447327", - "1cce1e346ff34714a489da3df078a703", - "e0f6ca6fe4e14037a3d513fbce236449", - "5d93708be5a94059b98de774aa5daa71", - 
"cd3763632f854fe4ada2952ca09245fb", - "69b7322fdcbe49f8a667707e3e0a3249", - "317ed2e5ae444ac5ba4fc07d360644b0", - "a80141d12cb94a2ca9564dd32af3b261", - "622bc2e5edee47c494265d35d0287d39", - "4299ffb4264c46b281ec1dda63a67e5c", - "77d0016687644feabf563a66eb40da85", - "47f0601a33c24f06b3bb10c6cd8dd9b3", - "dfdb3742e4174ab7b67dd83a5ae99711", - "a900bdccbb5141699ab7954a83a06d18", - "09ea0032eddc47bb84bc7d7f3cf7a2d6", - "7f9eacfcf78b42aa91fbc2641050c91c", - "acb2518611a24967a85afb0b60c6e9f5", - "3c728cf1d7f842529116636c1e1c929a" - ] - }, - "id": "91b06b43-c971-4177-b62f-f941bbbc2ef4", - "outputId": "ad72e836-4832-4d25-f64c-e439ef9f37d1" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)\n", - "No device set\n", - "Using device cpu\n", - "No dtype set\n", - "Using dtype torch.float32\n", - "Loading model colbert-ir/colbertv2.0, this might take a while...\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d7770b21d69c4c45b6779a0a79a8d9c2", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "tokenizer_config.json: 0%| | 0.00/405 [00:00 pa.Table:\n", - " combined_results = self.merge(vector_results, fts_results) # Or custom merge algo\n", - " # Custom Reranking logic here\n", - "\n", - " return combined_results" - ] - }, - { - "cell_type": "markdown", - "id": "0606d4fb-96ef-4440-9363-f5461284d00c", - "metadata": { - "id": "0606d4fb-96ef-4440-9363-f5461284d00c" - }, - "source": [ - "### Custom Reranker based on CohereReranker\n", - "\n", - "For the sake of simplicity let's build a custom reranker that enhances the Cohere Reranker by accepting a filter query, and accepts other CohereReranker params as kwargs.\n", - "\n", - "For this toy example let's say we want to get rid of docs that represent a table of contents or appendix, as these are semantically close to representing costs 
but don't represent the specific reasons why operating costs were high." - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "dd1e8110-72c4-423c-90de-ce2b386742c1", - "metadata": { - "id": "dd1e8110-72c4-423c-90de-ce2b386742c1" - }, - "outputs": [], - "source": [ - "from typing import List, Union\n", - "import pandas as pd\n", - "from lancedb.rerankers import CohereReranker\n", - "\n", - "class MofidifiedCohereReranker(CohereReranker):\n", - " def __init__(self, filters: Union[str, List[str]], **kwargs):\n", - " super().__init__(**kwargs)\n", - " filters = filters if isinstance(filters, list) else [filters]\n", - " self.filters = filters\n", - "\n", - " def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table)-> pa.Table:\n", - " combined_result = super().rerank_hybrid(query, vector_results, fts_results)\n", - " df = combined_result.to_pandas()\n", - " for filter in self.filters:\n", - " df = df.query(\"not text.str.contains(@filter)\")\n", - "\n", - " return pa.Table.from_pandas(df)\n", - "\n", - "reranker = MofidifiedCohereReranker(filters=\"Table of Contents\")" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "f4e6b496-e0c1-4944-8a6d-127f566812d3", - "metadata": { - "id": "f4e6b496-e0c1-4944-8a6d-127f566812d3" - }, - "outputs": [], - "source": [ - "docs = table.search(query_type=\"hybrid\").vector(query).text(str_query).limit(5).rerank(reranker).to_pandas()[\"text\"].to_list()" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "5a29d0a2-793a-40a2-ac2d-2edda1102d6e", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5a29d0a2-793a-40a2-ac2d-2edda1102d6e", - "outputId": "2aaf2369-0aa8-463d-9b81-da686f8adac5" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Increased operating expenses, decreased revenue, negative publicity, negative reaction from our Hosts and guests and other stakeholders, or other adverse 
impacts from any of the\n", - "above factors or other risks related to our international operations could materially adversely affect our brand, reputation, business, results of operations, and financial condition.\n", - "In addition, we will continue to incur significant expenses to operate our outbound business in China, and we may never achieve profitability in that market. These factors, combined\n", - "with sentiment of the workforce in China, and China’s policy towards foreign direct investment may particularly impact our operations in China. In addition, we need to ensure that\n", - "our business practices in China are compliant with local laws and regulations, which may be interpreted and enforced in ways that are different from our interpretation, and/or create\n", - "\n", - "\n", - "Made Possible by Hosts, Strangers, AirCover, Categories, and OMG marketing campaigns and launches, a $67.9 million increase in our search engine marketing and advertising\n", - "spend, a $25.1 million increase in payroll-related expenses due to growth in headcount and increase in compensation costs, a $22.0 million increase in third-party service provider\n", - "expenses, and a $11.1 million increase in coupon expense in line with increase in revenue and launch of AirCover for guests, partially offset by a decrease of $22.9 million related to\n", - "the changes in the fair value of contingent consideration related to a 2019 acquisition.\n", - "General and Administrative\n", - "2021 2022 % Change\n", - "(in millions, except percentages)\n", - "General and administrative $ 836 $ 950 14 %\n", - "Percentage of revenue 14 % 11 %\n", - "General and administrative expense increased $114.0 million, or 14%, in 2022 compared to 2021, primarily due to an increase in other business and operational taxes of $41.3\n", - "\n", - "\n", - "• Hosts failing to meet guests’ expectations, including increased expectations for cleanliness in light of the COVID-19 pandemic;• increased competition and 
use of our competitors’ platforms and services;\n", - "• Hosts failing to provide differentiated, high-quality, and an adequate supply of stays or experiences at competitive prices;\n", - "• guests not receiving timely and adequate community support from us;\n", - "• our failure to provide new or enhanced offerings, tiers, or features that guests value;\n", - "• declines or inefficiencies in our marketing efforts;• negative associations with, or reduced awareness of, our brand;\n", - "• actual or perceived discrimination by Hosts in deciding whether to accept a requested reservation;\n", - "• negative perceptions of the trust and safety on our platform; and\n", - "• macroeconomic and other conditions outside of our control affecting travel and hospitality industries generally.\n", - "\n", - "\n", - "Our success depends significantly on existing guests continuing to book and attracting new guests to book on our platform. Our ability to attract and retain guests could be materially\n", - "adversely affected by a number of factors discussed elsewhere in these “Risk Factors,” including:\n", - "• events beyond our control such as the ongoing COVID-19 pandemic, other pandemics and health concerns, restrictions on travel, immigration, trade disputes, economic\n", - "downturns, and the impact of climate change on travel including the availability of preferred destinations and the increase in the frequency and severity of weather-relatedevents, including fires, floods, droughts, extreme temperatures and ambient temperature increases, severe weather and other natural disasters, and the impact of other\n", - "climate change on seasonal destinations;\n", - "• political, social, or economic instability;\n", - "\n", - "\n", - "In addition, the number of listings on Airbnb may decline as a result of a number of other factors affecting Hosts, including: the COVID-19 pandemic; enforcement or threatenedenforcement of laws and regulations, including short-term occupancy and tax laws; 
private groups, such as homeowners, landlords, and condominium and neighborhood\n", - "associations, adopting and enforcing contracts that prohibit or restrict home sharing; leases, mortgages, and other agreements, or regulations that purport to ban or otherwise restrict\n", - "home sharing; Hosts opting for long-term rentals on other third-party platforms as an alternative to listing on our platform; economic, social, and political factors; perceptions of trust\n", - "and safety on and off our platform; negative experiences with guests, including guests who damage Host property, throw unauthorized parties, or engage in violent and unlawful\n", - "\n", - "\n" - ] - } - ], - "source": [ - "pretty_print(docs)" - ] - }, - { - "cell_type": "markdown", - "id": "b3b5464a-7252-4eab-aaac-9b0eae37496f", - "metadata": { - "id": "b3b5464a-7252-4eab-aaac-9b0eae37496f" - }, - "source": [ - "As you can see, the document containing the table of contents no longer shows up." - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/src/notebooks/lancedb_reranking.ipynb b/docs/src/notebooks/lancedb_reranking.ipynb deleted file mode 100644 index dfaff205..00000000 --- a/docs/src/notebooks/lancedb_reranking.ipynb +++ /dev/null @@ -1,1507 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "b3Y3DOVqtIbc" - }, - "source": [ - "# Example - Improve Retrievers using Rerankers & Hybrid search\n", - "\n", - "## Optimizing RAG retrieval performance using hybrid search & reranking" - ] - }, - { - "cell_type": "code", - "execution_count": 
16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6gUUIxGP0n1Z", - "outputId": "0319735d-5986-470b-ad7a-3e6a9a4032f6" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m177.4/177.4 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m139.2/139.2 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m16.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.4/12.4 MB\u001b[0m \u001b[31m51.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.7/82.7 kB\u001b[0m \u001b[31m12.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m11.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h" - ] - } - ], - "source": [ - "!pip install lancedb sentence-transformers cohere tantivy pyarrow==13.0.0 -q" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DQSVI4GSjU0b" - }, - "source": [ - "## What is a retriever\n", - "VectorDBs are used as retrievers in recommender or chatbot-based systems for retrieving relevant data based on user queries. 
For example, retriever is a critical component of Retrieval Augmented Generation (RAG) acrhitectures. In this section, we will discuss how to improve the performance of retrievers.\n", - "\n", - "\n", - "\n", - "[source](https://llmstack.ai/assets/images/rag-f517f1f834bdbb94a87765e0edd40ff2.png)\n", - "\n", - "## How do you go about improving retreival performance\n", - "Some of the common techniques are:\n", - "\n", - "- Using different search types - vector/semantic, FTS (BM25)\n", - "- Hybrid search\n", - "- Reranking\n", - "- Fine-tuning the embedding models\n", - "- Using different embedding models\n", - "\n", - "Obviously, the above list is not exhaustive. There are other subtler ways that can improve retrieval performance like alternative chunking algorithms, using different distance/similarity metrics, and more. For brevity, we'll only cover high level and more impactful techniques here.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3ZCm3-Bog9g7" - }, - "source": [ - "# LanceDB\n", - "- Multimodal DB for AI\n", - "- Powered by an innovative & open-source in-house file format\n", - "- Zero setup\n", - "- Scales up on disk storage\n", - "- Native support for vector, full-text(BM25) and hybrid search\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "b1fzhbQc4O1u" - }, - "source": [ - "## The dataset\n", - "The dataset we'll use is a synthetic QA dataset generated from LLama2 review paper. The paper was divided into chunks, with each chunk being a unique context. 
An LLM was prompted to ask questions relevant to the context for testing a retriever.\n", - "The exact code and other utility functions for this can be found in [this](https://github.com/lancedb/ragged) repo.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f_qnH-Dfhi9Z", - "outputId": "1e22e1b1-a821-4ccb-ff30-1b2d6f8b824e" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2024-07-24 14:22:47-- https://raw.githubusercontent.com/AyushExel/assets/main/data_qa.csv\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 680439 (664K) [text/plain]\n", - "Saving to: ‘data_qa.csv’\n", - "\n", - "data_qa.csv 100%[===================>] 664.49K --.-KB/s in 0.03s \n", - "\n", - "2024-07-24 14:22:48 (19.9 MB/s) - ‘data_qa.csv’ saved [680439/680439]\n", - "\n" - ] - } - ], - "source": [ - "!wget https://raw.githubusercontent.com/AyushExel/assets/main/data_qa.csv" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "ZNNAUc6f7ILI" - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "data = pd.read_csv(\"data_qa.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 580 - }, - "id": "4Bp9Fdhz7QsM", - "outputId": "fdcbc090-d526-4dcb-98a2-c0d8090f295d" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " Unnamed: 0 query \\\n", - "0 0 How does the performance of Llama 2-Chat model... \n", - "1 1 What benefits does the enhancement and safety ... \n", - "2 2 How does one ensure the reliability and robust... 
\n", - "3 3 What methodologies are employed to align machi... \n", - "4 4 What are some of the primary insights gained f... \n", - ".. ... ... \n", - "215 215 How are the terms 'clean', 'not clean', 'dirty... \n", - "216 216 How does the size of the model influence the a... \n", - "217 217 What impact does the model contamination have ... \n", - "218 218 What are the different sizes and types availab... \n", - "219 219 Could you discuss the sustainability measures ... \n", - "\n", - " context \\\n", - "0 Llama 2 : Open Foundation and Fine-Tuned Chat ... \n", - "1 Llama 2 : Open Foundation and Fine-Tuned Chat ... \n", - "2 Contents\\n1 Introduction 3\\n2 Pretraining 5\\n2... \n", - "3 Contents\\n1 Introduction 3\\n2 Pretraining 5\\n2... \n", - "4 . . . . . . . . 23\\n4.3 Red Teaming . . . . . ... \n", - ".. ... \n", - "215 Giventhe\\nembarrassinglyparallelnatureofthetas... \n", - "216 Dataset Model Subset Type Avg. Contam. % n ¯X ... \n", - "217 Dataset Model Subset Type Avg. Contam. % n ¯X ... \n", - "218 A.7 Model Card\\nTable 52 presents a model card... \n", - "219 A.7 Model Card\\nTable 52 presents a model card... \n", - "\n", - " answer \n", - "0 Llama 2-Chat models have shown to exceed the p... \n", - "1 The safety and enhancement measures implemente... \n", - "2 In the initial steps of model development, the... \n", - "3 Machine learning models can be aligned with de... \n", - "4 The key insights gained from evaluating platfo... \n", - ".. ... \n", - "215 In the discussed dataset analysis, samples are... \n", - "216 The size of the model significantly influences... \n", - "217 Model contamination affects various contaminat... \n", - "218 Llama 2 is available in three distinct paramet... \n", - "219 Throughout the training of Llama 2, which invo... \n", - "\n", - "[220 rows x 4 columns]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0querycontextanswer
00How does the performance of Llama 2-Chat model...Llama 2 : Open Foundation and Fine-Tuned Chat ...Llama 2-Chat models have shown to exceed the p...
11What benefits does the enhancement and safety ...Llama 2 : Open Foundation and Fine-Tuned Chat ...The safety and enhancement measures implemente...
22How does one ensure the reliability and robust...Contents\\n1 Introduction 3\\n2 Pretraining 5\\n2...In the initial steps of model development, the...
33What methodologies are employed to align machi...Contents\\n1 Introduction 3\\n2 Pretraining 5\\n2...Machine learning models can be aligned with de...
44What are some of the primary insights gained f.... . . . . . . . 23\\n4.3 Red Teaming . . . . . ...The key insights gained from evaluating platfo...
...............
215215How are the terms 'clean', 'not clean', 'dirty...Giventhe\\nembarrassinglyparallelnatureofthetas...In the discussed dataset analysis, samples are...
216216How does the size of the model influence the a...Dataset Model Subset Type Avg. Contam. % n ¯X ...The size of the model significantly influences...
217217What impact does the model contamination have ...Dataset Model Subset Type Avg. Contam. % n ¯X ...Model contamination affects various contaminat...
218218What are the different sizes and types availab...A.7 Model Card\\nTable 52 presents a model card...Llama 2 is available in three distinct paramet...
219219Could you discuss the sustainability measures ...A.7 Model Card\\nTable 52 presents a model card...Throughout the training of Llama 2, which invo...
\n", - "

220 rows × 4 columns

\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - " \n", - " \n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "data", - "summary": "{\n \"name\": \"data\",\n \"rows\": 220,\n \"fields\": [\n {\n \"column\": \"Unnamed: 0\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 63,\n \"min\": 0,\n \"max\": 219,\n \"num_unique_values\": 220,\n \"samples\": [\n 132,\n 148,\n 93\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 220,\n \"samples\": [\n \"What type of examination did scholars perform on ChatGPT, and when was the resulting scholarly paper published?\",\n \"How do the performance capabilities of the different models compare in evaluating tasks associated with logical reasoning and reading comprehension, specifically noted in tests like LSAT and SAT?\",\n \"What steps are recommended for users to ensure the responsible use of AI models like Llama 2 in projects or commercial applications?\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"context\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 110,\n \"samples\": [\n \"Dialogue Turn Baseline + GAtt\\n2 100% 100%\\n4 10% 100%\\n6 0% 100%\\n20 0% 100%\\nTable30: GAttresults. Llama 2-Chat withGAttisabletorefertoattributes100%ofthetime,forupto20\\nturns from our human evaluation. We limited the evaluated attributes to public figures and hobbies.\\nTheattentionnowspansbeyond20turns. Wetestedthemodelabilitytorememberthesystemarguments\\ntroughahumanevaluation. Thearguments(e.g. hobbies,persona)aredefinedduringthefirstmessage,and\\nthen from turn 2 to 20. We explicitly asked the model to refer to them (e.g. \\u201cWhat is your favorite hobby?\\u201d,\\n\\u201cWhatisyourname?\\u201d),tomeasurethemulti-turnmemoryabilityof Llama 2-Chat . Wereporttheresults\\ninTable30. 
EquippedwithGAtt, Llama 2-Chat maintains100%accuracy,alwaysreferringtothedefined\\nattribute,andso,upto20turns(wedidnotextendthehumanevaluationmore,andalltheexampleshad\\nlessthan4048tokensintotalovertheturns). Asacomparison, Llama 2-Chat withoutGAttcannotanymore\\nrefer to the attributes after only few turns: from 100% at turn t+1, to 10% at turn t+3 and then 0%.\\nGAttZero-shotGeneralisation. Wetriedatinferencetimetosetconstrainnotpresentinthetrainingof\\nGAtt. For instance, \\u201canswer in one sentence only\\u201d, for which the model remained consistent, as illustrated in\\nFigure 28.\\nWe applied first GAtt to Llama 1 , which was pretrained with a context length of 2048 tokens and then\\nfine-tuned with 4096 max length. We tested if GAtt works beyond 2048 tokens, and the model arguably\\nmanaged to understand attributes beyond this window. This promising result indicates that GAtt could be\\nadapted as an efficient technique for long context attention.\\nA.3.6 How Far Can Model-Based Evaluation Go?\\nTo measure the robustness of our reward model, we collected a test set of prompts for both helpfulness and\\nsafety,andaskedannotatorstojudgequalityoftheanswersbasedona7pointLikert-scale(thehigherthe\\nbetter)usingtriplereviews. AsillustratedinFigure29(inAppendix),weobservethatourrewardmodels\\noverallarewellcalibratedwithhumanpreference. Notethatthisenablesustousetherewardasapoint-wise\\nmetric, despite being trained with a Pairwise Ranking Loss.\\n0.0% 2.0% 4.0% 6.0% 8.0%\\nDensity0.00.20.40.60.81.0Reward Model ScoreNo Margin\\n0.0% 2.0% 4.0% 6.0% 8.0%\\nDensity0.00.20.40.60.81.0\\nMargin Small\\n0.0% 2.0% 4.0% 6.0% 8.0%\\nDensity0.00.20.40.60.81.0\\nMargin Large\\nFigure 27: Reward model score distribution shift caused by incorporating preference rating based margin\\ninrankingloss. 
Withthemarginterm, weobserveabinary splitpatterninrewarddistribution, especially\\nwith a larger margin.\\n54\",\n \"Model Size CodeCommonsense\\nReasoningWorld\\nKnowledgeReading\\nComprehensionMath MMLU BBH AGI Eval\\nMPT7B 20.5 57.4 41.0 57.5 4.9 26.8 31.0 23.5\\n30B 28.9 64.9 50.0 64.7 9.1 46.9 38.0 33.8\\nFalcon7B 5.6 56.1 42.8 36.0 4.6 26.2 28.0 21.2\\n40B 15.2 69.2 56.7 65.7 12.6 55.4 37.1 37.0\\nLlama 17B 14.1 60.8 46.2 58.5 6.95 35.1 30.3 23.9\\n13B 18.9 66.1 52.6 62.3 10.9 46.9 37.0 33.9\\n33B 26.0 70.0 58.4 67.6 21.4 57.8 39.8 41.7\\n65B 30.7 70.7 60.5 68.6 30.8 63.4 43.5 47.6\\nLlama 27B 16.8 63.9 48.9 61.3 14.6 45.3 32.6 29.3\\n13B 24.5 66.9 55.4 65.8 28.7 54.8 39.4 39.1\\n34B 27.8 69.9 58.7 68.0 24.2 62.6 44.1 43.4\\n70B37.5 71.9 63.6 69.4 35.2 68.9 51.2 54.2\\nTable3: Overallperformanceongroupedacademicbenchmarkscomparedtoopen-sourcebasemodels.\\n\\u2022Popular Aggregated Benchmarks . We report the overall results for MMLU (5 shot) (Hendrycks\\net al., 2020), Big Bench Hard (BBH) (3 shot) (Suzgun et al., 2022), and AGI Eval (3\\u20135 shot) (Zhong\\net al., 2023). For AGI Eval, we only evaluate on the English tasks and report the average.\\nAs shown in Table 3, Llama 2 models outperform Llama 1 models. In particular, Llama 2 70B improves the\\nresultsonMMLUandBBHby \\u22485and\\u22488points,respectively,comparedto Llama 1 65B.Llama 2 7Band30B\\nmodelsoutperformMPTmodelsofthecorrespondingsizeonallcategoriesbesidescodebenchmarks. Forthe\\nFalcon models, Llama 2 7B and 34B outperform Falcon 7B and 40B models on all categories of benchmarks.\\nAdditionally, Llama 2 70B model outperforms all open-source models.\\nIn addition to open-source models, we also compare Llama 2 70B results to closed-source models. As shown\\nin Table 4, Llama 2 70B is close to GPT-3.5 (OpenAI, 2023) on MMLU and GSM8K, but there is a significant\\ngaponcodingbenchmarks. Llama 2 70BresultsareonparorbetterthanPaLM(540B)(Chowdheryetal.,\\n2022)onalmostallbenchmarks. 
Thereisstillalargegapinperformancebetween Llama 2 70BandGPT-4\\nand PaLM-2-L.\\nWe also analysed the potential data contamination and share the details in Section A.6.\",\n \"Figure 1: Helpfulness human evaluation results for Llama\\n2-Chatcomparedtootheropen-sourceandclosed-source\\nmodels. Human raters compared model generations on ~4k\\npromptsconsistingofbothsingleandmulti-turnprompts.\\nThe95%confidenceintervalsforthisevaluationarebetween\\n1%and2%. MoredetailsinSection3.4.2. Whilereviewing\\nthese results, it is important to note that human evaluations\\ncanbenoisyduetolimitationsofthepromptset,subjectivity\\nof the review guidelines, subjectivity of individual raters,\\nand the inherent difficulty of comparing generations.\\nFigure 2: Win-rate % for helpfulness and\\nsafety between commercial-licensed base-\\nlines and Llama 2-Chat , according to GPT-\\n4. Tocomplementthehumanevaluation,we\\nused a more capable model, not subject to\\nourownguidance. Greenareaindicatesour\\nmodelisbetteraccordingtoGPT-4. Toremove\\nties, we used win/ (win+loss). The orders in\\nwhichthemodelresponsesarepresentedto\\nGPT-4arerandomlyswappedtoalleviatebias.\\n1 Introduction\\nLarge Language Models (LLMs) have shown great promise as highly capable AI assistants that excel in\\ncomplex reasoning tasks requiring expert knowledge across a wide range of fields, including in specialized\\ndomains such as programming and creative writing. They enable interaction with humans through intuitive\\nchat interfaces, which has led to rapid and widespread adoption among the general public.\\nThecapabilitiesofLLMsareremarkableconsideringtheseeminglystraightforwardnatureofthetraining\\nmethodology. 
Auto-regressivetransformersarepretrainedonanextensivecorpusofself-superviseddata,\\nfollowed by alignment with human preferences via techniques such as Reinforcement Learning with Human\\nFeedback(RLHF).Althoughthetrainingmethodologyissimple,highcomputationalrequirementshave\\nlimited the development of LLMs to a few players. There have been public releases of pretrained LLMs\\n(such as BLOOM (Scao et al., 2022), LLaMa-1 (Touvron et al., 2023), and Falcon (Penedo et al., 2023)) that\\nmatch the performance of closed pretrained competitors like GPT-3 (Brown et al., 2020) and Chinchilla\\n(Hoffmann et al., 2022), but none of these models are suitable substitutes for closed \\u201cproduct\\u201d LLMs, such\\nasChatGPT,BARD,andClaude. TheseclosedproductLLMsareheavilyfine-tunedtoalignwithhuman\\npreferences, which greatly enhances their usability and safety. This step can require significant costs in\\ncomputeandhumanannotation,andisoftennottransparentoreasilyreproducible,limitingprogresswithin\\nthe community to advance AI alignment research.\\nIn this work, we develop and release Llama 2, a family of pretrained and fine-tuned LLMs, Llama 2 and\\nLlama 2-Chat , at scales up to 70B parameters. On the series of helpfulness and safety benchmarks we tested,\\nLlama 2-Chat models generally perform better than existing open-source models. They also appear to\\nbe on par with some of the closed-source models, at least on the human evaluations we performed (see\\nFigures1and3). Wehavetakenmeasurestoincreasethesafetyofthesemodels,usingsafety-specificdata\\nannotation and tuning, as well as conducting red-teaming and employing iterative evaluations. Additionally,\\nthispapercontributesathoroughdescriptionofourfine-tuningmethodologyandapproachtoimproving\\nLLM safety. 
We hope that this openness will enable the community to reproduce fine-tuned LLMs and\\ncontinue to improve the safety of those models, paving the way for more responsible development of LLMs.\\nWealsosharenovelobservationswemadeduringthedevelopmentof Llama 2 andLlama 2-Chat ,suchas\\nthe emergence of tool usage and temporal organization of knowledge.\\n3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"answer\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 220,\n \"samples\": [\n \"Scholars performed a diagnostic analysis to investigate the AI ethics associated with ChatGPT. Their findings were compiled into a research paper that became accessible as a preprint on arXiv in January 2023.\",\n \"The MPT 30B model demonstrates considerable proficiency in logical reasoning and reading comprehension tasks, scoring highly on LSAT-LR, LSAT-RC, and SAT-en tests compared to its peers, such as Falcon 40B and Llama 17B. This is indicative of its advanced analytical and comprehension abilities. Conversely, while Falcon 40B shows strengths in LSAT-LR with a score second only to MPT 30B, it trails in SAT-en performance. This variability underscores the diverse capabilities of models based on their structural design and training paradigms.\",\n \"Users intending to deploy models like Llama 2 are advised to strictly adhere to guidelines laid out in the Responsible Use Guide. This includes employing enhanced safety measures at both the input and output stages of model interaction, as well as carefully tuning the model according to specific use-case requirements to prevent any potential misuse. Additionally, users must comply with the terms set in the Acceptable Use Policy, ensuring their applications do not contravene applicable laws, regulations, and ethical standards. 
Leveraging provided code examples can further assist developers in replicating the necessary safety protocols and maintaining ethical integrity in their applications.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" - } - }, - "metadata": {}, - "execution_count": 4 - } - ], - "source": [ - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DJ1eG8XvPblc" - }, - "source": [ - "## Ingestion\n", - "Let us now ingest the contexts in LanceDB. The steps will be:\n", - "\n", - "- Create a schema (Pydantic or Pyarrow)\n", - "- Select an embedding model from LanceDB Embedding API (to allow automatic vectorization of data)\n", - "- Ingest the contexts\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 336, - "referenced_widgets": [ - "7d93a81fcd5f4f9c8952396a9f72be02", - "ae8bc663ba0e44ddb830a5b50b2e92f8", - "de0d2e4cb7b346a4ac0b55b095caff98", - "52441756f18a4a52a2a6c839c4ff892c", - "d143d1522f564b78a24e92bd0290f4b5", - "a4decda69da348dfa0f0ec38c5ceb9d6", - "1b95c82d481b4159bf7be3aefa4c0258", - "7e49893b47174c138237e9a29584c0d0", - "e36ba6906dd74973a48dba81ebb1f799", - "f3e98d664e2441ad9198ee0ee947b27e", - "fcf4a6a5fd3a45908a7727c4abefde44", - "dec5401f1de14ba690c3e829fe4fe0ae", - "19f5d1e903ea4f4faf62c179e7669234", - "74ad6e3b67554d33ac6422de1e3e475a", - "59915c4b816e420b942e6b0996c279f1", - "f5e61067348c4b01b3c0ba09e5a52f87", - "4a5fdf23f24c4f21896705734bc1e031", - "3c419d422bd34cec867538615193558a", - "4a6a4f7be2e44ade93e46702f037ebc1", - "c460e5a56c624f43baaed1fff6aa72e2", - "8a4e12a8f5da4f9990498c562a94116c", - "c2253beb48cd4ce7a2eb3f1ce130f520", - "1a1832edb11e4363af5a0c55ba013e91", - "5e0cd1901ad444d28336c69d75b84e91", - "1797c51500d0425496e390ebcf9729ac", - "4993c83bb4eb429797bb9928a7c86547", - "8f2f21ce257f457e962624bba5c3ca71", - "c64af78fb7424710ac50c20293719123", - "e2189d31c33f4d02983681d814d7ec28", - 
"2c6fcec076ae49fb9def49c54169e0e3", - "40782ef0cbfc4b24839015796c303869", - "674ddce0b8cc4fad93c863469fb7496c", - "2115bc215a574a4c90accc8e643ccc5c", - "4a3f41c780d940eda128cc1efe82cb46", - "b18a507ffc8e4b0ba825fdecac8980e1", - "796ba03995264bddad91fcc999e0f073", - "083b7d262f8945cc9b8fe928dbf9cee1", - "aeddd98812264d939c86b026d63682f0", - "ac96917d1e2341a5acf2b0236344b57d", - "20b2149ba0984c23ad726cc72f21ae6d", - "640859301f01490eb5041bd73667bdd0", - "be70a0355abf4bed9c8b887917721879", - "5bbbad4117bd4949adf34d58bb29d312", - "e640e77458964a1ba16654655bbe4ee1", - "31f10c25646949868814da02b21c8de2", - "eac57eb6d624437b9c04cddcdf1f53f4", - "0fb15815e4724685904aada95b00b1b5", - "cb45a898fb1745ff94bce928c64bfab5", - "bdacf681835145dda8867513e301f403", - "9b49cbad496d4e9ab5af69668e842ba6", - "6d1a94ae94e548058d34a5df5dbd563d", - "455abd9dfd614a53b4fa35f55542a9e9", - "878ca6976a414ba2a43e86dbf75ce45c", - "6133892fd2da4778bc0cc08667cb2673", - "4f30757e8c7f4b4b925d249a5369ed51", - "7c8689c8604c4626b6092d33d80ff6cd", - "f21f2f75e7ea42d5a6979ee722e80fe2", - "019aa571fc8f4f40afd65f728327e0b4", - "9b2206d9cd6d415bb4ce0d806aaaf473", - "50f3506a5bbc4a529d20b3f85fa78260", - "a03c9bd4a5e74a6182bbe774411899a0", - "2da0ec344438406ca44fa75e6523867b", - "4164c24fbe004abbbb7312da0682c8fc", - "db17d3f6979d4ba2b8184603f226e910", - "19d1e2c150f44fa08941117a42a2505b", - "d23e52576ac74f3a9962ff68f786343b" - ] - }, - "id": "B_g5pIkBQ66h", - "outputId": "ff31e6b0-745c-4e90-9da7-9e8c7b3c9b6f" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", - "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", - "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", - "You will be able to reuse this secret in all of your notebooks.\n", 
- "Please note that authentication is recommended but still optional to access public models or datasets.\n", - " warnings.warn(\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "tokenizer_config.json: 0%| | 0.00/366 [00:00 float:\n", - " eval_results = []\n", - " for idx in tqdm.tqdm(range(len(ds))):\n", - " query = ds[\"query\"][idx]\n", - " reference_context = ds[\"context\"][idx]\n", - " if not reference_context:\n", - " print(\"reference_context is None for query: {idx}. \\\n", - " Skipping this query. Please check your dataset.\")\n", - " continue\n", - " try:\n", - " rs = search_table(table, reranker, query_type, query, top_k)\n", - " except Exception as e:\n", - " print(f'Error with query: {idx} {e}')\n", - " eval_results.append({\n", - " 'is_hit': False,\n", - " 'retrieved': [],\n", - " 'expected': reference_context,\n", - " 'query': query,\n", - " })\n", - " continue\n", - " retrieved_texts = rs['text'].tolist()[:top_k]\n", - " expected_text = reference_context[0] if isinstance(reference_context, list) else reference_context\n", - " is_hit = False\n", - "\n", - " # HACK: to handle new line characters added my llamaindex doc reader\n", - " if expected_text in retrieved_texts or expected_text+'\\n' in retrieved_texts:\n", - " is_hit = True\n", - " eval_result = {\n", - " 'is_hit': is_hit,\n", - " 'retrieved': retrieved_texts,\n", - " 'expected': expected_text,\n", - " 'query': query,\n", - " }\n", - " eval_results.append(eval_result)\n", - "\n", - " result = pd.DataFrame(eval_results)\n", - " hit_rate = result['is_hit'].mean()\n", - " return hit_rate" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "iZzAVl2kJ5mV", - "outputId": "0f4d6e5b-3096-4f58-fc36-c7909b475cfc" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "100%|██████████| 220/220 [00:10<00:00, 21.62it/s]\n", - "100%|██████████| 220/220 
[00:00<00:00, 358.03it/s]" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - " Vector Search Hit Rate: 0.6409090909090909\n", - "FTS Search Hit Rate: 0.5954545454545455\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "\n" - ] - } - ], - "source": [ - "tbl.create_fts_index(\"text\", replace=True)\n", - "hit_rate_vector = hit_rate(data, tbl, \"vector\")\n", - "hit_rate_fts = hit_rate(data, tbl, \"fts\")\n", - "print(f\"\\n Vector Search Hit Rate: {hit_rate_vector}\")\n", - "print(f\"FTS Search Hit Rate: {hit_rate_fts}\")" - ] - }, - { - "cell_type": "markdown", - "source": [ - "2. Reranked vector search\n" - ], - "metadata": { - "id": "-1B5OPDuI8NE" - } - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "ngbS5kvnI6N_" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Efmb9Gi2s9lD" - }, - "source": [ - "## Hybrid Search\n", - " \n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ydLNeAr4acYj", - "outputId": "0e455b2f-a10c-4ad2-ce36-52a90829dd10" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "100%|██████████| 220/220 [00:10<00:00, 20.60it/s]" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - " Hybrid Search with LinearCombinationReranker Hit Rate: 0.6454545454545455\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "\n" - ] - } - ], - "source": [ - "from lancedb.rerankers import LinearCombinationReranker # LanceDB hybrid search uses LinearCombinationReranker by default\n", - "\n", - "reranker = LinearCombinationReranker(weight=0.7)\n", - "hit_rate_hybrid = hit_rate(data, tbl, \"hybrid\", reranker=reranker)\n", - "\n", - "print(f\"\\n Hybrid Search with LinearCombinationReranker Hit Rate: {hit_rate_hybrid}\")" - ] - }, - { - "cell_type": 
"markdown", - "metadata": { - "id": "Wswq157ptjTZ" - }, - "source": [ - "## Trying out different rerankers\n", - "\n", - "### 1. Cross Encoder Reranker\n", - "\n", - " \n", - "\n", - "Bi-Encoders produce for a given sentence a sentence embedding. We pass to a BERT independently the sentences A and B, which result in the sentence embeddings u and v. These sentence embedding can then be compared using cosine similarity.\n", - "\n", - "In contrast, for a Cross-Encoder, we pass both sentences simultaneously to the Transformer network. It produces then an output value between 0 and 1 indicating the similarity of the input sentence pair:\n", - "\n", - "A Cross-Encoder does not produce a sentence embedding. Also, we are not able to pass individual sentences to a Cross-Encoder." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 248, - "referenced_widgets": [ - "d718ecc1163942e4a5dd4cea73a90e8e", - "b33b19b3d1984052a7f189045a8cf881", - "ec4f2cb69b034876a42404016cd56336", - "299a94f5d724418cbfc3ff570f6fc51e", - "09d3de90349c4402a763ca9ee05872f1", - "f074dee7a025499ab26a16b811967b0d", - "5dc2560eb8d441c5b3c19f4dbb082402", - "4d055c5e91a14c789d53c41027c13f79", - "45a86acf89d74e88b05b236064abfe9e", - "d618cebde90545fe9d99255511dd842e", - "8ab0e516471a4b82b205227692a9c08c", - "77ed41da01bd4032aef1ad5d471e49b8", - "fd45203f991d420cbca1de00404fb92d", - "270e8fc6c4ed4583ad78e53a6048af39", - "4e8e3b7d32a542cc8b8c60bddf03b2fd", - "a72c96893f65478f8a618c6bed76a5a6", - "d6f67e57cbf64403844ac492fe33a37c", - "e71a9e5429a24743bb2f6672a675a0ea", - "8909f3170d344efdb39f5d95cc388606", - "54658b4b7ab543209f4de19a0b7c7477", - "a11e3a6b07114d56bb7ea4ff54f2dfba", - "036c0495ef404234a99a0d0945bfb137", - "481da1832f694ef6ac05e4d3efd67ac2", - "3ddd20f3287d4c53b9a8aa9dc376cf3b", - "2d23114ea17646069d8d228775f503a2", - "1583bd5518554acdb4201da0262dce80", - "cc1ca1cd82864701b376cb77f62bb189", - 
"6c95c1d838794b81a0ca58d97fc4d4bd", - "e7d30ffd3a5d4029bc94371e89d39df2", - "063e9d8117864292a1f4f7db6bf39fc5", - "4098367692a34e59a5f7875546187471", - "9fe27f7c08fd4e4690a0ed2a289176ee", - "e48ed4e228804a56af1b995b1533eda7", - "bc0b3747df404a2ea47be94b90b1bd96", - "17d0c553a19d4b55a03a95b68d24734f", - "c11f882bb2e04059b92006609471be1c", - "bfff2a4f749c46aaa2c1a0a131f13ca4", - "72cb183fdc3c444d85a08aa378e48a78", - "7fab79549d3a4bc2a2a38e761c85e3bd", - "48237a9c806440e0b100f52445502db7", - "0f48274c81ee44bb807a7d75ee0762fd", - "fc242de73006403886268a2ca9913375", - "387599dd3ff9491a8527a3a90e612c82", - "0f1a6b08ce6747c383c080ccd8c7783a", - "b6448167f80b40508070d3aa8cbc2ea0", - "24f579f32a3f48559ecc9b38f39f77cd", - "6e58fc22c84c48ba9d38770bf6665ee2", - "be452568af594df5884cbebbb23b1a47", - "6db85f74ef4c4edb8667d373bd3f96a4", - "be30e00481f24011a444511c896336d3", - "2d14ed52a93d4299b5de69c89e74ce13", - "7e196a41516f4550aa0b94d042d59756", - "4ba2b40b6b7945dfba573c9f80465155", - "7e761767401c4d9b97c06237b1e3b6eb", - "e5765e9cf70240c4ac11bdce2d1eaac8" - ] - }, - "id": "dd0jh4gNtm41", - "outputId": "aa734304-533a-4061-f6e4-17ee167b1933" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "\r 0%| | 0/220 [00:00\n", - "\n", - " \n", - " 2. **Social media personalised timeline**\n", - " \n", - "\n", - " 3. **Recommend blogs, videos, etc. 
via push notifications**\n", - "\n", - " \"YouTube now gives notifications for \"recommended\", non-subscribed channels\" - https://www.reddit.com/r/assholedesign/comments/807zpe/youtube_now_gives_notifications_for_recommended/\n", - " \n", - " \n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PJ0qSdCgCGi4" - }, - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/docs/src/notebooks/langchain_example.ipynb b/docs/src/notebooks/langchain_example.ipynb deleted file mode 100644 index 10a48b4f..00000000 --- a/docs/src/notebooks/langchain_example.ipynb +++ /dev/null @@ -1,566 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "683953b3", - "metadata": {}, - "source": [ - "# LanceDB\n", - "\n", - ">[LanceDB](https://lancedb.com/) is an open-source database for vector-search built with persistent storage, which greatly simplifies retrevial, filtering and management of embeddings. Fully open source.\n", - "\n", - "This notebook shows how to use functionality related to the `LanceDB` vector database based on the Lance data format." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1051ba9", - "metadata": {}, - "outputs": [], - "source": [ - "! pip install tantivy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88ac92c0", - "metadata": {}, - "outputs": [], - "source": [ - "! pip install -U langchain-openai langchain-community" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a1c84d6-a10f-428c-95cd-46d3a1702e07", - "metadata": {}, - "outputs": [], - "source": [ - "! 
pip install lancedb" - ] - }, - { - "cell_type": "markdown", - "id": "99134dd1-b91e-486f-8d90-534248e43b9d", - "metadata": {}, - "source": [ - "We want to use OpenAIEmbeddings so we have to get the OpenAI API Key. " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "a0361f5c-e6f4-45f4-b829-11680cf03cec", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import getpass\n", - "import os\n", - "\n", - "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d114ed78", - "metadata": {}, - "outputs": [], - "source": [ - "! rm -rf /tmp/lancedb" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a3c3999a", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import TextLoader\n", - "from langchain_community.vectorstores import LanceDB\n", - "from langchain_openai import OpenAIEmbeddings\n", - "from langchain_text_splitters import CharacterTextSplitter\n", - "\n", - "loader = TextLoader(\"../../how_to/state_of_the_union.txt\")\n", - "documents = loader.load()\n", - "\n", - "documents = CharacterTextSplitter().split_documents(documents)\n", - "embeddings = OpenAIEmbeddings()" - ] - }, - { - "cell_type": "markdown", - "id": "e9517bb0", - "metadata": {}, - "source": [ - "##### For LanceDB cloud, you can invoke the vector store as follows :\n", - "\n", - "\n", - "```python\n", - "db_url = \"db://lang_test\" # url of db you created\n", - "api_key = \"xxxxx\" # your API key\n", - "region=\"us-east-1-dev\" # your selected region\n", - "\n", - "vector_store = LanceDB(\n", - " uri=db_url,\n", - " api_key=api_key,\n", - " region=region,\n", - " embedding=embeddings,\n", - " table_name='langchain_test'\n", - " )\n", - "```\n", - "\n", - "You can also add `region`, `api_key`, `uri` to `from_documents()` classmethod\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "6e104aee", - 
"metadata": {}, - "outputs": [], - "source": [ - "from lancedb.rerankers import LinearCombinationReranker\n", - "\n", - "reranker = LinearCombinationReranker(weight=0.3)\n", - "\n", - "docsearch = LanceDB.from_documents(documents, embeddings, reranker=reranker)\n", - "query = \"What did the president say about Ketanji Brown Jackson\"" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "259c7988", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "relevance score - 0.7066475030191711\n", - "text- They were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. \n", - "\n", - "Officer Mora was 27 years old. \n", - "\n", - "Officer Rivera was 22. \n", - "\n", - "Both Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. \n", - "\n", - "I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. \n", - "\n", - "I’ve worked on these issues a long time. \n", - "\n", - "I know what works: Investing in crime prevention and community police officers who’ll walk the beat, who’ll know the neighborhood, and who can restore trust and safety. \n", - "\n", - "So let’s not abandon our streets. Or choose between safety and equal justice. \n", - "\n", - "Let’s come together to protect our communities, restore trust, and hold law enforcement accountable. \n", - "\n", - "That’s why the Justice Department required body cameras, banned chokeholds, and restricted no-knock warrants for its officers. 
\n", - "\n", - "That’s why the American Rescue \n" - ] - } - ], - "source": [ - "docs = docsearch.similarity_search_with_relevance_scores(query)\n", - "print(\"relevance score - \", docs[0][1])\n", - "print(\"text- \", docs[0][0].page_content[:1000])" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "9fa29dae", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "distance - 0.30000001192092896\n", - "text- My administration is providing assistance with job training and housing, and now helping lower-income veterans get VA care debt-free. \n", - "\n", - "Our troops in Iraq and Afghanistan faced many dangers. \n", - "\n", - "One was stationed at bases and breathing in toxic smoke from “burn pits” that incinerated wastes of war—medical and hazard material, jet fuel, and more. \n", - "\n", - "When they came home, many of the world’s fittest and best trained warriors were never the same. \n", - "\n", - "Headaches. Numbness. Dizziness. \n", - "\n", - "A cancer that would put them in a flag-draped coffin. \n", - "\n", - "I know. \n", - "\n", - "One of those soldiers was my son Major Beau Biden. \n", - "\n", - "We don’t know for sure if a burn pit was the cause of his brain cancer, or the diseases of so many of our troops. \n", - "\n", - "But I’m committed to finding out everything we can. \n", - "\n", - "Committed to military families like Danielle Robinson from Ohio. \n", - "\n", - "The widow of Sergeant First Class Heath Robinson. \n", - "\n", - "He was born a soldier. Army National Guard. Combat medic in Kosovo and Iraq. 
\n", - "\n", - "Stationed near Baghdad, just ya\n" - ] - } - ], - "source": [ - "docs = docsearch.similarity_search_with_score(query=\"Headaches\", query_type=\"hybrid\")\n", - "print(\"distance - \", docs[0][1])\n", - "print(\"text- \", docs[0][0].page_content[:1000])" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "e70ad201", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "reranker : \n" - ] - } - ], - "source": [ - "print(\"reranker : \", docsearch._reranker)" - ] - }, - { - "cell_type": "markdown", - "id": "f5e1cdfd", - "metadata": {}, - "source": [ - "Additionaly, to explore the table you can load it into a df or save it in a csv file: \n", - "```python\n", - "tbl = docsearch.get_table()\n", - "print(\"tbl:\", tbl)\n", - "pd_df = tbl.to_pandas()\n", - "# pd_df.to_csv(\"docsearch.csv\", index=False)\n", - "\n", - "# you can also create a new vector store object using an older connection object:\n", - "vector_store = LanceDB(connection=tbl, embedding=embeddings)\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "9c608226", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "metadata : {'source': '../../how_to/state_of_the_union.txt'}\n", - "\n", - "SQL filtering :\n", - "\n", - "They were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. \n", - "\n", - "Officer Mora was 27 years old. \n", - "\n", - "Officer Rivera was 22. \n", - "\n", - "Both Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. \n", - "\n", - "I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. \n", - "\n", - "I’ve worked on these issues a long time. 
\n", - "\n", - "I know what works: Investing in crime prevention and community police officers who’ll walk the beat, who’ll know the neighborhood, and who can restore trust and safety. \n", - "\n", - "So let’s not abandon our streets. Or choose between safety and equal justice. \n", - "\n", - "Let’s come together to protect our communities, restore trust, and hold law enforcement accountable. \n", - "\n", - "That’s why the Justice Department required body cameras, banned chokeholds, and restricted no-knock warrants for its officers. \n", - "\n", - "That’s why the American Rescue Plan provided $350 Billion that cities, states, and counties can use to hire more police and invest in proven strategies like community violence interruption—trusted messengers breaking the cycle of violence and trauma and giving young people hope. \n", - "\n", - "We should all agree: The answer is not to Defund the police. The answer is to FUND the police with the resources and training they need to protect our communities. \n", - "\n", - "I ask Democrats and Republicans alike: Pass my budget and keep our neighborhoods safe. \n", - "\n", - "And I will keep doing everything in my power to crack down on gun trafficking and ghost guns you can buy online and make at home—they have no serial numbers and can’t be traced. \n", - "\n", - "And I ask Congress to pass proven measures to reduce gun violence. Pass universal background checks. Why should anyone on a terrorist list be able to purchase a weapon? \n", - "\n", - "Ban assault weapons and high-capacity magazines. \n", - "\n", - "Repeal the liability shield that makes gun manufacturers the only industry in America that can’t be sued. \n", - "\n", - "These laws don’t infringe on the Second Amendment. They save lives. \n", - "\n", - "The most fundamental right in America is the right to vote – and to have it counted. And it’s under assault. 
\n", - "\n", - "In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. \n", - "\n", - "We cannot let this happen. \n", - "\n", - "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", - "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", - "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", - "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. \n", - "\n", - "A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", - "\n", - "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n", - "\n", - "We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \n", - "\n", - "We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. 
\n", - "\n", - "We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster.\n" - ] - } - ], - "source": [ - "docs = docsearch.similarity_search(\n", - " query=query, filter={\"metadata.source\": \"../../how_to/state_of_the_union.txt\"}\n", - ")\n", - "\n", - "print(\"metadata :\", docs[0].metadata)\n", - "\n", - "# or you can directly supply SQL string filters :\n", - "\n", - "print(\"\\nSQL filtering :\\n\")\n", - "docs = docsearch.similarity_search(query=query, filter=\"text LIKE '%Officer Rivera%'\")\n", - "print(docs[0].page_content)" - ] - }, - { - "cell_type": "markdown", - "id": "9a173c94", - "metadata": {}, - "source": [ - "## Adding images " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05f669d7", - "metadata": {}, - "outputs": [], - "source": [ - "! pip install -U langchain-experimental" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ed69810", - "metadata": {}, - "outputs": [], - "source": [ - "! pip install open_clip_torch torch" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "2cacb5ee", - "metadata": {}, - "outputs": [], - "source": [ - "! 
rm -rf '/tmp/multimmodal_lance'" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "b3456e2c", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_experimental.open_clip import OpenCLIPEmbeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "3848eba2", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import requests\n", - "\n", - "# List of image URLs to download\n", - "image_urls = [\n", - " \"https://github.com/raghavdixit99/assets/assets/34462078/abf47cc4-d979-4aaa-83be-53a2115bf318\",\n", - " \"https://github.com/raghavdixit99/assets/assets/34462078/93be928e-522b-4e37-889d-d4efd54b2112\",\n", - "]\n", - "\n", - "texts = [\"bird\", \"dragon\"]\n", - "\n", - "# Directory to save images\n", - "dir_name = \"./photos/\"\n", - "\n", - "# Create directory if it doesn't exist\n", - "os.makedirs(dir_name, exist_ok=True)\n", - "\n", - "image_uris = []\n", - "# Download and save each image\n", - "for i, url in enumerate(image_urls, start=1):\n", - " response = requests.get(url)\n", - " path = os.path.join(dir_name, f\"image{i}.jpg\")\n", - " image_uris.append(path)\n", - " with open(path, \"wb\") as f:\n", - " f.write(response.content)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "3d62c2a0", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.vectorstores import LanceDB\n", - "\n", - "vec_store = LanceDB(\n", - " table_name=\"multimodal_test\",\n", - " embedding=OpenCLIPEmbeddings(),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "ebbb4881", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['b673620b-01f0-42ca-a92e-d033bb92c0a6',\n", - " '99c3a5b0-b577-417a-8177-92f4a655dbfb']" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vec_store.add_images(uris=image_uris)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, 
- "id": "3c29dea3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['f7adde5d-a4a3-402b-9e73-088b230722c3',\n", - " 'cbed59da-0aec-4bff-8820-9e59d81a2140']" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vec_store.add_texts(texts)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "8b2f25ce", - "metadata": {}, - "outputs": [], - "source": [ - "img_embed = vec_store._embedding.embed_query(\"bird\")" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "87a24079", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Document(page_content='bird', metadata={'id': 'f7adde5d-a4a3-402b-9e73-088b230722c3'})" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vec_store.similarity_search_by_vector(img_embed)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "78557867", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LanceTable(connection=LanceDBConnection(/tmp/lancedb), name=\"multimodal_test\")" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vec_store._table" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/src/notebooks/multi_lingual_example.ipynb b/docs/src/notebooks/multi_lingual_example.ipynb deleted file mode 100644 index da4d59e8..00000000 --- a/docs/src/notebooks/multi_lingual_example.ipynb +++ /dev/null @@ -1,604 +0,0 @@ -{ - "cells": [ - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "# Example - Multi-lingual semantic search" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Lancedb Embeddings API: Multi-lingual semantic search\n", - "In this example, we'll build a simple LanceDB table containing embeddings for different languages that can be used for universal semantic search.\n", - "* The **Dataset** used will be wikipedia dataset in English and French\n", - "* The **Model** used will be cohere's multi-lingual model\n", - "\n", - "In this example, we'll explore LanceDB's Embeddings API that allows you to create tables that automatically vectorize data once you define the config at the time of table creation. Let's dive right in!\n", - "\n", - "To learn more about LanceDB, visit [our docs](https://lancedb.github.io/lancedb/)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" - ] - } - ], - "source": [ - "!pip install -qU datasets cohere openai lancedb\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create datasets\n", - "For accessing the datasets, we'll use datasets library in streaming mode. We'll use english and french versions and embed them together. 
For semantic search the order should be irrelevant" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/ayush/vectordb-recipes/env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "from datasets import load_dataset\n", - "\n", - "en = dataset = load_dataset(\"wikipedia\", \"20220301.en\", streaming=True,)\n", - "fr = load_dataset(\"wikipedia\", \"20220301.fr\", streaming=True)\n", - "\n", - "datasets = {\"english\": iter(en['train']), \"french\": iter(fr['train'])}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a look at the dataset format" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '12',\n", - " 'url': 'https://en.wikipedia.org/wiki/Anarchism',\n", - " 'title': 'Anarchism',\n", - " 'text': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\\n\\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. 
Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement flourished in most parts of the world and had a significant role in workers\\' struggles for emancipation. Various anarchist schools of thought formed during this period. Anarchists have taken part in several revolutions, most notably in the Paris Commune, the Russian Civil War and the Spanish Civil War, whose end marked the end of the classical era of anarchism. In the last decades of the 20th and into the 21st century, the anarchist movement has been resurgent once more.\\n\\nAnarchism employs a diversity of tactics in order to meet its ideal ends which can be broadly separated into revolutionary and evolutionary tactics; there is significant overlap between the two, which are merely descriptive. Revolutionary tactics aim to bring down authority and state, having taken a violent turn in the past, while evolutionary tactics aim to prefigure what an anarchist society would be like. Anarchist thought, criticism, and praxis have played a part in diverse areas of human society. Criticism of anarchism include claims that it is internally inconsistent, violent, or utopian.\\n\\nEtymology, terminology, and definition \\n\\nThe etymological origin of anarchism is from the Ancient Greek anarkhia, meaning \"without a ruler\", composed of the prefix an- (\"without\") and the word arkhos (\"leader\" or \"ruler\"). The suffix -ism denotes the ideological current that favours anarchy. Anarchism appears in English from 1642 as anarchisme and anarchy from 1539; early English usages emphasised a sense of disorder. Various factions within the French Revolution labelled their opponents as anarchists, although few such accused shared many views with later anarchists. 
Many revolutionaries of the 19th century such as William Godwin (1756–1836) and Wilhelm Weitling (1808–1871) would contribute to the anarchist doctrines of the next generation but did not use anarchist or anarchism in describing themselves or their beliefs.\\n\\nThe first political philosopher to call himself an anarchist () was Pierre-Joseph Proudhon (1809–1865), marking the formal birth of anarchism in the mid-19th century. Since the 1890s and beginning in France, libertarianism has often been used as a synonym for anarchism and its use as a synonym is still common outside the United States. Some usages of libertarianism refer to individualistic free-market philosophy only, and free-market anarchism in particular is termed libertarian anarchism.\\n\\nWhile the term libertarian has been largely synonymous with anarchism, its meaning has more recently diluted with wider adoption from ideologically disparate groups, including both the New Left and libertarian Marxists, who do not associate themselves with authoritarian socialists or a vanguard party, and extreme cultural liberals, who are primarily concerned with civil liberties. Additionally, some anarchists use libertarian socialist to avoid anarchism\\'s negative connotations and emphasise its connections with socialism. Anarchism is broadly used to describe the anti-authoritarian wing of the socialist movement. Anarchism is contrasted to socialist forms which are state-oriented or from above. Scholars of anarchism generally highlight anarchism\\'s socialist credentials and criticise attempts at creating dichotomies between the two. 
Some scholars describe anarchism as having many influences from liberalism, and being both liberals and socialists but more so, while most scholars reject anarcho-capitalism as a misunderstanding of anarchist principles.\\n\\nWhile opposition to the state is central to anarchist thought, defining anarchism is not an easy task for scholars, as there is a lot of discussion among scholars and anarchists on the matter, and various currents perceive anarchism slightly differently. Major definitional elements include the will for a non-coercive society, the rejection of the state apparatus, the belief that human nature allows humans to exist in or progress toward such a non-coercive society, and a suggestion on how to act to pursue the ideal of anarchy.\\n\\nHistory\\n\\nPre-modern era \\n\\nBefore the establishment of towns and cities, an established authority did not exist. It was after the creation of institutions of authority that anarchistic ideas espoused as a reaction. The most notable precursors to anarchism in the ancient world were in China and Greece. In China, philosophical anarchism (the discussion on the legitimacy of the state) was delineated by Taoist philosophers Zhuang Zhou and Laozi. Alongside Stoicism, Taoism has been said to have had \"significant anticipations\" of anarchism.\\n \\nAnarchic attitudes were also articulated by tragedians and philosophers in Greece. Aeschylus and Sophocles used the myth of Antigone to illustrate the conflict between rules set by the state and personal autonomy. Socrates questioned Athenian authorities constantly and insisted on the right of individual freedom of conscience. Cynics dismissed human law (nomos) and associated authorities while trying to live according to nature (physis). Stoics were supportive of a society based on unofficial and friendly relations among its citizens without the presence of a state.\\n\\nIn medieval Europe, there was no anarchistic activity except some ascetic religious movements. 
These, and other Muslim movements, later gave birth to religious anarchism. In the Sasanian Empire, Mazdak called for an egalitarian society and the abolition of monarchy, only to be soon executed by Emperor Kavad I.\\n\\nIn Basra, religious sects preached against the state. In Europe, various sects developed anti-state and libertarian tendencies. Renewed interest in antiquity during the Renaissance and in private judgment during the Reformation restored elements of anti-authoritarian secularism, particularly in France. Enlightenment challenges to intellectual authority (secular and religious) and the revolutions of the 1790s and 1848 all spurred the ideological development of what became the era of classical anarchism.\\n\\nModern era \\nDuring the French Revolution, partisan groups such as the Enragés and the saw a turning point in the fermentation of anti-state and federalist sentiments. The first anarchist currents developed throughout the 18th century as William Godwin espoused philosophical anarchism in England, morally delegitimising the state, Max Stirner\\'s thinking paved the way to individualism and Pierre-Joseph Proudhon\\'s theory of mutualism found fertile soil in France. By the late 1870s, various anarchist schools of thought had become well-defined and a wave of then unprecedented globalisation occurred from 1880 to 1914. This era of classical anarchism lasted until the end of the Spanish Civil War and is considered the golden age of anarchism.\\n\\nDrawing from mutualism, Mikhail Bakunin founded collectivist anarchism and entered the International Workingmen\\'s Association, a class worker union later known as the First International that formed in 1864 to unite diverse revolutionary currents. The International became a significant political force, with Karl Marx being a leading figure and a member of its General Council. 
Bakunin\\'s faction (the Jura Federation) and Proudhon\\'s followers (the mutualists) opposed state socialism, advocating political abstentionism and small property holdings. After bitter disputes, the Bakuninists were expelled from the International by the Marxists at the 1872 Hague Congress. Anarchists were treated similarly in the Second International, being ultimately expelled in 1896. Bakunin famously predicted that if revolutionaries gained power by Marx\\'s terms, they would end up the new tyrants of workers. In response to their expulsion from the First International, anarchists formed the St. Imier International. Under the influence of Peter Kropotkin, a Russian philosopher and scientist, anarcho-communism overlapped with collectivism. Anarcho-communists, who drew inspiration from the 1871 Paris Commune, advocated for free federation and for the distribution of goods according to one\\'s needs.\\n\\nAt the turn of the century, anarchism had spread all over the world. It was a notable feature of the international syndicalism movement. In China, small groups of students imported the humanistic pro-science version of anarcho-communism. Tokyo was a hotspot for rebellious youth from countries of the far east, travelling to the Japanese capital to study. In Latin America, Argentina was a stronghold for anarcho-syndicalism, where it became the most prominent left-wing ideology. During this time, a minority of anarchists adopted tactics of revolutionary political violence. This strategy became known as propaganda of the deed. The dismemberment of the French socialist movement into many groups and the execution and exile of many Communards to penal colonies following the suppression of the Paris Commune favoured individualist political expression and acts. 
Even though many anarchists distanced themselves from these terrorist acts, infamy came upon the movement and attempts were made to exclude them from American immigration, including the Immigration Act of 1903, also called the Anarchist Exclusion Act. Illegalism was another strategy which some anarchists adopted during this period.\\n\\nDespite concerns, anarchists enthusiastically participated in the Russian Revolution in opposition to the White movement; however, they met harsh suppression after the Bolshevik government was stabilised. Several anarchists from Petrograd and Moscow fled to Ukraine, notably leading to the Kronstadt rebellion and Nestor Makhno\\'s struggle in the Free Territory. With the anarchists being crushed in Russia, two new antithetical currents emerged, namely platformism and synthesis anarchism. The former sought to create a coherent group that would push for revolution while the latter were against anything that would resemble a political party. Seeing the victories of the Bolsheviks in the October Revolution and the resulting Russian Civil War, many workers and activists turned to communist parties which grew at the expense of anarchism and other socialist movements. In France and the United States, members of major syndicalist movements such as the General Confederation of Labour and the Industrial Workers of the World left their organisations and joined the Communist International.\\n\\nIn the Spanish Civil War of 1936, anarchists and syndicalists (CNT and FAI) once again allied themselves with various currents of leftists. A long tradition of Spanish anarchism led to anarchists playing a pivotal role in the war. In response to the army rebellion, an anarchist-inspired movement of peasants and workers, supported by armed militias, took control of Barcelona and of large areas of rural Spain, where they collectivised the land. 
The Soviet Union provided some limited assistance at the beginning of the war, but the result was a bitter fight among communists and anarchists at a series of events named May Days as Joseph Stalin tried to seize control of the Republicans.\\n\\nPost-war era \\n\\nAt the end of World War II, the anarchist movement was severely weakened. The 1960s witnessed a revival of anarchism, likely caused by a perceived failure of Marxism–Leninism and tensions built by the Cold War. During this time, anarchism found a presence in other movements critical towards both capitalism and the state such as the anti-nuclear, environmental, and peace movements, the counterculture of the 1960s, and the New Left. It also saw a transition from its previous revolutionary nature to provocative anti-capitalist reformism. Anarchism became associated with punk subculture as exemplified by bands such as Crass and the Sex Pistols. The established feminist tendencies of anarcha-feminism returned with vigour during the second wave of feminism. Black anarchism began to take form at this time and influenced anarchism\\'s move from a Eurocentric demographic. This coincided with its failure to gain traction in Northern Europe and its unprecedented height in Latin America.\\n\\nAround the turn of the 21st century, anarchism grew in popularity and influence within anti-capitalist, anti-war and anti-globalisation movements. Anarchists became known for their involvement in protests against the World Trade Organization (WTO), the Group of Eight and the World Economic Forum. During the protests, ad hoc leaderless anonymous cadres known as black blocs engaged in rioting, property destruction and violent confrontations with the police. Other organisational tactics pioneered in this time include affinity groups, security culture and the use of decentralised technologies such as the Internet. A significant event of this period was the confrontations at the 1999 Seattle WTO conference. 
Anarchist ideas have been influential in the development of the Zapatistas in Mexico and the Democratic Federation of Northern Syria, more commonly known as Rojava, a de facto autonomous region in northern Syria.\\n\\nThought \\n\\nAnarchist schools of thought have been generally grouped into two main historical traditions, social anarchism and individualist anarchism, owing to their different origins, values and evolution. The individualist current emphasises negative liberty in opposing restraints upon the free individual, while the social current emphasises positive liberty in aiming to achieve the free potential of society through equality and social ownership. In a chronological sense, anarchism can be segmented by the classical currents of the late 19th century and the post-classical currents (anarcha-feminism, green anarchism, and post-anarchism) developed thereafter.\\n\\nBeyond the specific factions of anarchist movements which constitute political anarchism lies philosophical anarchism which holds that the state lacks moral legitimacy, without necessarily accepting the imperative of revolution to eliminate it. A component especially of individualist anarchism, philosophical anarchism may tolerate the existence of a minimal state but claims that citizens have no moral obligation to obey government when it conflicts with individual autonomy. Anarchism pays significant attention to moral arguments since ethics have a central role in anarchist philosophy. Anarchism\\'s emphasis on anti-capitalism, egalitarianism, and for the extension of community and individuality sets it apart from anarcho-capitalism and other types of economic libertarianism.\\n\\nAnarchism is usually placed on the far-left of the political spectrum. 
Much of its economics and legal philosophy reflect anti-authoritarian, anti-statist, libertarian, and radical interpretations of left-wing and socialist politics such as collectivism, communism, individualism, mutualism, and syndicalism, among other libertarian socialist economic theories. As anarchism does not offer a fixed body of doctrine from a single particular worldview, many anarchist types and traditions exist and varieties of anarchy diverge widely. One reaction against sectarianism within the anarchist milieu was anarchism without adjectives, a call for toleration and unity among anarchists first adopted by Fernando Tarrida del Mármol in 1889 in response to the bitter debates of anarchist theory at the time. Belief in political nihilism has been espoused by anarchists. Despite separation, the various anarchist schools of thought are not seen as distinct entities but rather as tendencies that intermingle and are connected through a set of uniform principles such as individual and local autonomy, mutual aid, network organisation, communal democracy, justified authority and decentralisation.\\n\\nClassical \\n\\nInceptive currents among classical anarchist currents were mutualism and individualism. They were followed by the major currents of social anarchism (collectivist, communist and syndicalist). They differ on organisational and economic aspects of their ideal society.\\n\\nMutualism is an 18th-century economic theory that was developed into anarchist theory by Pierre-Joseph Proudhon. Its aims include reciprocity, free association, voluntary contract, federation and monetary reform of both credit and currency that would be regulated by a bank of the people. Mutualism has been retrospectively characterised as ideologically situated between individualist and collectivist forms of anarchism. In What Is Property? 
(1840), Proudhon first characterised his goal as a \"third form of society, the synthesis of communism and property.\" Collectivist anarchism is a revolutionary socialist form of anarchism commonly associated with Mikhail Bakunin. Collectivist anarchists advocate collective ownership of the means of production which is theorised to be achieved through violent revolution and that workers be paid according to time worked, rather than goods being distributed according to need as in communism. Collectivist anarchism arose alongside Marxism but rejected the dictatorship of the proletariat despite the stated Marxist goal of a collectivist stateless society.\\n\\nAnarcho-communism is a theory of anarchism that advocates a communist society with common ownership of the means of production, direct democracy and a horizontal network of voluntary associations, workers\\' councils and worker cooperatives, with production and consumption based on the guiding principle \"From each according to his ability, to each according to his need.\" Anarcho-communism developed from radical socialist currents after the French Revolution but was first formulated as such in the Italian section of the First International. It was later expanded upon in the theoretical work of Peter Kropotkin, whose specific style would go onto become the dominating view of anarchists by the late 19th century. Anarcho-syndicalism is a branch of anarchism that views labour syndicates as a potential force for revolutionary social change, replacing capitalism and the state with a new society democratically self-managed by workers. The basic principles of anarcho-syndicalism are direct action, workers\\' solidarity and workers\\' self-management.\\n\\nIndividualist anarchism is a set of several traditions of thought within the anarchist movement that emphasise the individual and their will over any kinds of external determinants. 
Early influences on individualist forms of anarchism include William Godwin, Max Stirner, and Henry David Thoreau. Through many countries, individualist anarchism attracted a small yet diverse following of Bohemian artists and intellectuals as well as young anarchist outlaws in what became known as illegalism and individual reclamation.\\n\\nPost-classical and contemporary \\n\\nAnarchist principles undergird contemporary radical social movements of the left. Interest in the anarchist movement developed alongside momentum in the anti-globalisation movement, whose leading activist networks were anarchist in orientation. As the movement shaped 21st century radicalism, wider embrace of anarchist principles signaled a revival of interest. Anarchism has continued to generate many philosophies and movements, at times eclectic, drawing upon various sources and combining disparate concepts to create new philosophical approaches. The anti-capitalist tradition of classical anarchism has remained prominent within contemporary currents.\\n\\nContemporary news coverage which emphasizes black bloc demonstrations has reinforced anarchism\\'s historical association with chaos and violence. Its publicity has also led more scholars in fields such as anthropology and history to engage with the anarchist movement, although contemporary anarchism favours actions over academic theory. Various anarchist groups, tendencies, and schools of thought exist today, making it difficult to describe the contemporary anarchist movement. While theorists and activists have established \"relatively stable constellations of anarchist principles\", there is no consensus on which principles are core and commentators describe multiple anarchisms, rather than a singular anarchism, in which common principles are shared between schools of anarchism while each group prioritizes those principles differently. 
Gender equality can be a common principle, although it ranks as a higher priority to anarcha-feminists than anarcho-communists.\\n\\nAnarchists are generally committed against coercive authority in all forms, namely \"all centralized and hierarchical forms of government (e.g., monarchy, representative democracy, state socialism, etc.), economic class systems (e.g., capitalism, Bolshevism, feudalism, slavery, etc.), autocratic religions (e.g., fundamentalist Islam, Roman Catholicism, etc.), patriarchy, heterosexism, white supremacy, and imperialism.\" Anarchist schools disagree on the methods by which these forms should be opposed. The principle of equal liberty is closer to anarchist political ethics in that it transcends both the liberal and socialist traditions. This entails that liberty and equality cannot be implemented within the state, resulting in the questioning of all forms of domination and hierarchy.\\n\\nTactics \\nAnarchists\\' tactics take various forms but in general serve two major goals, namely to first oppose the Establishment and secondly to promote anarchist ethics and reflect an anarchist vision of society, illustrating the unity of means and ends. A broad categorisation can be made between aims to destroy oppressive states and institutions by revolutionary means on one hand and aims to change society through evolutionary means on the other. Evolutionary tactics embrace nonviolence, reject violence and take a gradual approach to anarchist aims, although there is significant overlap between the two.\\n\\nAnarchist tactics have shifted during the course of the last century. Anarchists during the early 20th century focused more on strikes and militancy while contemporary anarchists use a broader array of approaches.\\n\\nClassical era tactics \\n\\nDuring the classical era, anarchists had a militant tendency. Not only did they confront state armed forces, as in Spain and Ukraine, but some of them also employed terrorism as propaganda of the deed. 
Assassination attempts were carried out against heads of state, some of which were successful. Anarchists also took part in revolutions. Many anarchists, especially the Galleanists, believed that these attempts would be the impetus for a revolution against capitalism and the state. Many of these attacks were done by individual assailants and the majority took place in the late 1870s, the early 1880s and the 1890s, with some still occurring in the early 1900s. Their decrease in prevalence was the result of further judicial power and targeting and cataloging by state institutions.\\n\\nAnarchist perspectives towards violence have always been controversial. Anarcho-pacifists advocate for non-violence means to achieve their stateless, nonviolent ends. Other anarchist groups advocate direct action, a tactic which can include acts of sabotage or terrorism. This attitude was quite prominent a century ago when seeing the state as a tyrant and some anarchists believing that they had every right to oppose its oppression by any means possible. Emma Goldman and Errico Malatesta, who were proponents of limited use of violence, stated that violence is merely a reaction to state violence as a necessary evil.\\n\\nAnarchists took an active role in strike actions, although they tended to be antipathetic to formal syndicalism, seeing it as reformist. They saw it as a part of the movement which sought to overthrow the state and capitalism. Anarchists also reinforced their propaganda within the arts, some of whom practiced naturism and nudism. Those anarchists also built communities which were based on friendship and were involved in the news media.\\n\\nRevolutionary tactics \\n\\nIn the current era, Italian anarchist Alfredo Bonanno, a proponent of insurrectionary anarchism, has reinstated the debate on violence by rejecting the nonviolence tactic adopted since the late 19th century by Kropotkin and other prominent anarchists afterwards. 
Both Bonanno and the French group The Invisible Committee advocate for small, informal affiliation groups, where each member is responsible for their own actions but works together to bring down oppression utilizing sabotage and other violent means against state, capitalism, and other enemies. Members of The Invisible Committee were arrested in 2008 on various charges, terrorism included.\\n\\nOverall, contemporary anarchists are much less violent and militant than their ideological ancestors. They mostly engage in confronting the police during demonstrations and riots, especially in countries such as Canada, Greece, and Mexico. Militant black bloc protest groups are known for clashing with the police; however, anarchists not only clash with state operators, they also engage in the struggle against fascists and racists, taking anti-fascist action and mobilizing to prevent hate rallies from happening.\\n\\nEvolutionary tactics \\nAnarchists commonly employ direct action. This can take the form of disrupting and protesting against unjust hierarchy, or the form of self-managing their lives through the creation of counter-institutions such as communes and non-hierarchical collectives. Decision-making is often handled in an anti-authoritarian way, with everyone having equal say in each decision, an approach known as horizontalism. Contemporary-era anarchists have been engaging with various grassroots movements that are more or less based on horizontalism, although not explicitly anarchist, respecting personal autonomy and participating in mass activism such as strikes and demonstrations. In contrast with the big-A anarchism of the classical era, the newly coined term small-a anarchism signals their tendency not to base their thoughts and actions on classical-era anarchism or to refer to classical anarchists such as Peter Kropotkin and Pierre-Joseph Proudhon to justify their opinions. 
Those anarchists would rather base their thought and praxis on their own experience which they will later theorize.\\n\\nThe decision-making process of small anarchist affinity groups plays a significant tactical role. Anarchists have employed various methods in order to build a rough consensus among members of their group without the need of a leader or a leading group. One way is for an individual from the group to play the role of facilitator to help achieve a consensus without taking part in the discussion themselves or promoting a specific point. Minorities usually accept rough consensus, except when they feel the proposal contradicts anarchist ethics, goals and values. Anarchists usually form small groups (5–20 individuals) to enhance autonomy and friendships among their members. These kinds of groups more often than not interconnect with each other, forming larger networks. Anarchists still support and participate in strikes, especially wildcat strikes as these are leaderless strikes not organised centrally by a syndicate.\\n\\nAs in the past, newspapers and journals are used, and anarchists have gone online in the World Wide Web to spread their message. Anarchists have found it easier to create websites because of distributional and other difficulties, hosting electronic libraries and other portals. Anarchists were also involved in developing various software that are available for free. The way these hacktivists work to develop and distribute resembles the anarchist ideals, especially when it comes to preserving users\\' privacy from state surveillance.\\n\\nAnarchists organize themselves to squat and reclaim public spaces. During important events such as protests and when spaces are being occupied, they are often called Temporary Autonomous Zones (TAZ), spaces where art, poetry, and surrealism are blended to display the anarchist ideal. 
As seen by anarchists, squatting is a way to regain urban space from the capitalist market, serving pragmatical needs and also being an exemplary direct action. Acquiring space enables anarchists to experiment with their ideas and build social bonds. Adding up these tactics while having in mind that not all anarchists share the same attitudes towards them, along with various forms of protesting at highly symbolic events, make up a carnivalesque atmosphere that is part of contemporary anarchist vividity.\\n\\nKey issues \\n\\nAs anarchism is a philosophy that embodies many diverse attitudes, tendencies, and schools of thought; disagreement over questions of values, ideology, and tactics is common. Its diversity has led to widely different uses of identical terms among different anarchist traditions which has created a number of definitional concerns in anarchist theory. The compatibility of capitalism, nationalism, and religion with anarchism is widely disputed, and anarchism enjoys complex relationships with ideologies such as communism, collectivism, Marxism, and trade unionism. Anarchists may be motivated by humanism, divine authority, enlightened self-interest, veganism, or any number of alternative ethical doctrines. Phenomena such as civilisation, technology (e.g. within anarcho-primitivism), and the democratic process may be sharply criticised within some anarchist tendencies and simultaneously lauded in others.\\n\\nGender, sexuality, and free love \\n\\nAs gender and sexuality carry along them dynamics of hierarchy, many anarchists address, analyse, and oppose the suppression of one\\'s autonomy imposed by gender roles.\\n\\nSexuality was not often discussed by classical anarchists but the few that did felt that an anarchist society would lead to sexuality naturally developing. Sexual violence was a concern for anarchists such as Benjamin Tucker, who opposed age of consent laws, believing they would benefit predatory men. 
A historical current that arose and flourished during 1890 and 1920 within anarchism was free love. In contemporary anarchism, this current survives as a tendency to support polyamory and queer anarchism. Free love advocates were against marriage, which they saw as a way of men imposing authority over women, largely because marriage law greatly favoured the power of men. The notion of free love was much broader and included a critique of the established order that limited women\\'s sexual freedom and pleasure. Those free love movements contributed to the establishment of communal houses, where large groups of travelers, anarchists and other activists slept in beds together. Free love had roots both in Europe and the United States; however, some anarchists struggled with the jealousy that arose from free love. Anarchist feminists were advocates of free love, against marriage, and pro-choice (utilising a contemporary term), and had a similar agenda. Anarchist and non-anarchist feminists differed on suffrage but were supportive of one another.\\n\\nDuring the second half of the 20th century, anarchism intermingled with the second wave of feminism, radicalising some currents of the feminist movement and being influenced as well. By the latest decades of the 20th century, anarchists and feminists were advocating for the rights and autonomy of women, gays, queers and other marginalised groups, with some feminist thinkers suggesting a fusion of the two currents. With the third wave of feminism, sexual identity and compulsory heterosexuality became a subject of study for anarchists, yielding a post-structuralist critique of sexual normality. Some anarchists distanced themselves from this line of thinking, suggesting that it leaned towards an individualism that was dropping the cause of social liberation.\\n\\nAnarchism and education \\n\\nThe interest of anarchists in education stretches back to the first emergence of classical anarchism. 
Anarchists consider proper education, one which sets the foundations of the future autonomy of the individual and the society, to be an act of mutual aid. Anarchist writers such as William Godwin (Political Justice) and Max Stirner (\"The False Principle of Our Education\") attacked both state education and private education as another means by which the ruling class replicate their privileges.\\n\\nIn 1901, Catalan anarchist and free thinker Francisco Ferrer established the Escuela Moderna in Barcelona as an opposition to the established education system which was dictated largely by the Catholic Church. Ferrer\\'s approach was secular, rejecting both state and church involvement in the educational process whilst giving pupils large amounts of autonomy in planning their work and attendance. Ferrer aimed to educate the working class and explicitly sought to foster class consciousness among students. The school closed after constant harassment by the state and Ferrer was later arrested. Nonetheless, his ideas formed the inspiration for a series of modern schools around the world. Christian anarchist Leo Tolstoy, who published the essay Education and Culture, also established a similar school with its founding principle being that \"for education to be effective it had to be free.\" In a similar token, A. S. Neill founded what became the Summerhill School in 1921, also declaring being free from coercion.\\n\\nAnarchist education is based largely on the idea that a child\\'s right to develop freely and without manipulation ought to be respected and that rationality would lead children to morally good conclusions; however, there has been little consensus among anarchist figures as to what constitutes manipulation. 
Ferrer believed that moral indoctrination was necessary and explicitly taught pupils that equality, liberty and social justice were not possible under capitalism, along with other critiques of government and nationalism.\\n\\nLate 20th century and contemporary anarchist writers (Paul Goodman, Herbert Read, and Colin Ward) intensified and expanded the anarchist critique of state education, largely focusing on the need for a system that focuses on children\\'s creativity rather than on their ability to attain a career or participate in consumerism as part of a consumer society. Contemporary anarchists such as Ward claim that state education serves to perpetuate socioeconomic inequality.\\n\\nWhile few anarchist education institutions have survived to the modern-day, major tenets of anarchist schools, among them respect for child autonomy and relying on reasoning rather than indoctrination as a teaching method, have spread among mainstream educational institutions. Judith Suissa names three schools as explicitly anarchists schools, namely the Free Skool Santa Cruz in the United States which is part of a wider American-Canadian network of schools, the Self-Managed Learning College in Brighton, England, and the Paideia School in Spain.\\n\\nAnarchism and the state \\n\\nObjection to the state and its institutions is a sine qua non of anarchism. Anarchists consider the state as a tool of domination and believe it to be illegitimate regardless of its political tendencies. Instead of people being able to control the aspects of their life, major decisions are taken by a small elite. Authority ultimately rests solely on power, regardless of whether that power is open or transparent, as it still has the ability to coerce people. Another anarchist argument against states is that the people constituting a government, even the most altruistic among officials, will unavoidably seek to gain more power, leading to corruption. 
Anarchists consider the idea that the state is the collective will of the people to be an unachievable fiction due to the fact that the ruling class is distinct from the rest of society.\\n\\nSpecific anarchist attitudes towards the state vary. Robert Paul Wolff believed that the tension between authority and autonomy would mean the state could never be legitimate. Bakunin saw the state as meaning \"coercion, domination by means of coercion, camouflaged if possible but unceremonious and overt if need be.\" A. John Simmons and Leslie Green, who leaned toward philosophical anarchism, believed that the state could be legitimate if it is governed by consensus, although they saw this as highly unlikely. Beliefs on how to abolish the state also differ.\\n\\nAnarchism and the arts \\n\\nThe connection between anarchism and art was quite profound during the classical era of anarchism, especially among artistic currents that were developing during that era such as futurists, surrealists and others. In literature, anarchism was mostly associated with the New Apocalyptics and the neo-romanticism movement. In music, anarchism has been associated with music scenes such as punk. Anarchists such as Leo Tolstoy and Herbert Read stated that the border between the artist and the non-artist, what separates art from a daily act, is a construct produced by the alienation caused by capitalism and it prevents humans from living a joyful life.\\n\\nOther anarchists advocated for or used art as a means to achieve anarchist ends. 
In his book Breaking the Spell: A History of Anarchist Filmmakers, Videotape Guerrillas, and Digital Ninjas, Chris Robé claims that \"anarchist-inflected practices have increasingly structured movement-based video activism.\" Throughout the 20th century, many prominent anarchists (Peter Kropotkin, Emma Goldman, Gustav Landauer and Camillo Berneri) and publications such as Anarchy wrote about matters pertaining to the arts.\\n\\nThree overlapping properties made art useful to anarchists. It could depict a critique of existing society and hierarchies, serve as a prefigurative tool to reflect the anarchist ideal society and even turn into a means of direct action such as in protests. As it appeals to both emotion and reason, art could appeal to the whole human and have a powerful effect. The 19th-century neo-impressionist movement had an ecological aesthetic and offered an example of an anarchist perception of the road towards socialism. In Les chataigniers a Osny by anarchist painter Camille Pissarro, the blending of aesthetic and social harmony is prefiguring an ideal anarchistic agrarian community.\\n\\nAnalysis \\nThe most common critique of anarchism is that humans cannot self-govern and so a state is necessary for human survival. Philosopher Bertrand Russell supported this critique, stating that \"[p]eace and war, tariffs, regulations of sanitary conditions and the sale of noxious drugs, the preservation of a just system of distribution: these, among others, are functions which could hardly be performed in a community in which there was no central government.\" Another common criticism of anarchism is that it fits a world of isolation in which only the small enough entities can be self-governing; a response would be that major anarchist thinkers advocated anarchist federalism.\\n\\nPhilosophy lecturer Andrew G. 
Fiala composed a list of common arguments against anarchism which includes critiques such as that anarchism is innately related to violence and destruction, not only in the pragmatic world, such as at protests, but in the world of ethics as well. Secondly, anarchism is evaluated as unfeasible or utopian since the state cannot be defeated practically. This line of arguments most often calls for political action within the system to reform it. The third argument is that anarchism is self-contradictory. While it advocates for no-one to archiei, if accepted by the many, then anarchism would turn into the ruling political theory. In this line of criticism also comes the self-contradiction that anarchism calls for collective action whilst endorsing the autonomy of the individual, hence no collective action can be taken. Lastly, Fiala mentions a critique towards philosophical anarchism of being ineffective (all talk and thoughts) and in the meantime capitalism and bourgeois class remains strong.\\n\\nPhilosophical anarchism has met the criticism of members of academia following the release of pro-anarchist books such as A. John Simmons\\' Moral Principles and Political Obligations. Law professor William A. Edmundson authored an essay to argue against three major philosophical anarchist principles which he finds fallacious. Edmundson says that while the individual does not owe the state a duty of obedience, this does not imply that anarchism is the inevitable conclusion and the state is still morally legitimate. In The Problem of Political Authority, Michael Huemer defends philosophical anarchism, claiming that \"political authority is a moral illusion.\"\\n\\nOne of the earliest criticisms is that anarchism defies and fails to understand the biological inclination to authority. Joseph Raz states that the acceptance of authority implies the belief that following their instructions will afford more success. 
Raz believes that this argument is true in following both authorities\\' successful and mistaken instruction. Anarchists reject this criticism because challenging or disobeying authority does not entail the disappearance of its advantages by acknowledging authority such as doctors or lawyers as reliable, nor does it involve a complete surrender of independent judgment. Anarchist perception of human nature, rejection of the state, and commitment to social revolution has been criticised by academics as naive, overly simplistic, and unrealistic, respectively. Classical anarchism has been criticised for relying too heavily on the belief that the abolition of the state will lead to human cooperation prospering.\\n\\nFriedrich Engels, considered to be one of the principal founders of Marxism, criticised anarchism\\'s anti-authoritarianism as inherently counter-revolutionary because in his view a revolution is by itself authoritarian. Academic John Molyneux writes in his book Anarchism: A Marxist Criticism that \"anarchism cannot win\", believing that it lacks the ability to properly implement its ideas. The Marxist criticism of anarchism is that it has a utopian character because all individuals should have anarchist views and values. According to the Marxist view, that a social idea would follow directly from this human ideal and out of the free will of every individual formed its essence. Marxists state that this contradiction was responsible for their inability to act. 
In the anarchist vision, the conflict between liberty and equality was resolved through coexistence and intertwining.\\n\\nSee also \\n\\n Anarchism by country\\n Governance without government\\n List of anarchist political ideologies\\n List of books about anarchism\\n\\nReferences\\n\\nCitations\\n\\nNotes\\n\\nSources\\n\\nPrimary sources\\n\\nSecondary sources\\n\\nTertiary sources\\n\\nFurther reading \\n \\n Criticism of philosophical anarchism.\\n \\n A defence of philosophical anarchism, stating that \"both kinds of \\'anarchism\\' [i.e. philosophical and political anarchism] are philosophical and political claims.\" (p.\\xa0137)\\n Anarchistic popular fiction novel.\\n \\n \\n \\n An argument for philosophical anarchism.\\n\\nExternal links \\n Anarchy Archives. Anarchy Archives is an online research center on the history and theory of anarchism.\\n\\n \\nAnti-capitalism\\nAnti-fascism\\nEconomic ideologies\\nLeft-wing politics\\nLibertarian socialism\\nLibertarianism\\nPolitical culture\\nPolitical movements\\nPolitical ideologies\\nSocial theories\\nSocialism\\nFar-left politics'}" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "next(iter(en['train']))" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '3',\n", - " 'url': 'https://fr.wikipedia.org/wiki/Antoine%20Meillet',\n", - " 'title': 'Antoine Meillet',\n", - " 'text': \"Paul Jules Antoine Meillet, né le à Moulins (Allier) et mort le à Châteaumeillant (Cher), est le principal linguiste français des premières décennies du . 
Il est aussi philologue.\\n\\nBiographie \\nD'origine bourbonnaise, fils d'un notaire de Châteaumeillant (Cher), Antoine Meillet fait ses études secondaires au lycée de Moulins.\\n\\nÉtudiant à la faculté des lettres de Paris à partir de 1885 où il suit notamment les cours de Louis Havet, il assiste également à ceux de Michel Bréal au Collège de France et de Ferdinand de Saussure à l'École pratique des hautes études.\\n\\nEn 1889, il est major de l'agrégation de grammaire.\\n\\nIl assure à la suite de Saussure le cours de grammaire comparée, qu'il complète à partir de 1894 par une conférence sur les langues persanes.\\n\\nEn 1897, il soutient sa thèse pour le doctorat ès lettres (Recherches sur l'emploi du génitif-accusatif en vieux-slave). En 1905, il occupe la chaire de grammaire comparée au Collège de France, où il consacre ses cours à l'histoire et à la structure des langues indo-européennes. Il succéda au linguiste Auguste Carrière à la tête de la chaire d'arménien à l'École des langues orientales.\\n\\nSecrétaire de la Société de linguistique de Paris, il est élu à l'Académie des inscriptions et belles-lettres en 1924. Il préside également l'Institut d'Études Slaves de 1921 à sa mort.\\n\\nIl a formé toute une génération de linguistes français, parmi lesquels Émile Benveniste, Marcel Cohen, Georges Dumézil, André Martinet, Aurélien Sauvageot, Lucien Tesnière, Joseph Vendryes, ainsi que le japonisant Charles Haguenauer. Antoine Meillet devait diriger la thèse de Jean Paulhan sur la sémantique du proverbe et c'est lui qui découvrit Gustave Guillaume.\\n\\nIl a influencé aussi un certain nombre de linguistes étrangers. Il a également été le premier à identifier le phénomène de la grammaticalisation.\\n\\nSelon le linguiste allemand Walter Porzig, Meillet est un « grand précurseur ». 
Il montre, par exemple, que, dans les dialectes indo-européens, les groupes indo-européens sont le résultat historique d'une variation diatopique.\\n\\nL’acte de naissance de la sociolinguistique est signé par Antoine Meillet fondateur de la sociolinguistique qui s’est opposé au Cours de linguistique générale de Ferdinand de Saussure dès son apparition en 1916 en le critiquant sur plusieurs plans.\\n\\nÉtudes arméniennes \\n 1890 : une mission de trois mois dans le Caucase lui permet d'apprendre l'arménien moderne.\\n 1902 : il obtient la chaire d'arménien de l'École des langues orientales.\\n 1903 : nouvelle mission en Arménie russe, il publie son Esquisse d'une grammaire comparée de l'arménien classique, qui demeure une référence en linguistique arménienne et indo-européenne jusqu'à ce jour. L'un de ses étudiants, Hratchia Adjarian, devient le fondateur de la dialectologie arménienne. C'est également sous les encouragements de Meillet qu'Émile Benveniste étudie la langue arménienne.\\n 1919 : il est cofondateur de la Société des études arméniennes avec Victor Bérard, Charles Diehl, André-Ferdinand Hérold, H. Lacroix, Frédéric Macler, Gabriel Millet, Gustave Schlumberger.\\n 1920 : le , il crée la Revue des études arméniennes avec Frédéric Macler.\\n\\nÉtudes homériques \\nÀ la Sorbonne, Meillet supervise le travail de Milman Parry. Meillet offre à son étudiant l'opinion, nouvelle à cette époque, que la structure formulaïque de l'Iliade serait une conséquence directe de sa transmission orale. Ainsi, il le dirige vers l'étude de l'oralité dans son cadre natif et lui suggère d'observer les mécanismes d'une tradition orale vivante à côté du texte classique (l'Iliade) qui est censé résulter d'une telle tradition. En conséquence, Meillet présente Parry à Matija Murko, savant originaire de Slovénie qui avait longuement écrit sur la tradition héroïque épique dans les Balkans, surtout en Bosnie-Herzégovine. 
Par leurs recherches, dont les résultats sont à présent hébergés par l'université de Harvard, Parry et son élève, Albert Lord, ont profondément renouvelé les études homériques.\\n\\nPrincipaux ouvrages \\n Études sur l'étymologie et le vocabulaire du vieux slave. Paris, Bouillon, 1902-05.\\n Esquisse d'une grammaire comparée de l'arménien classique, 1903.\\n Introduction à l'étude comparative des langues indo-européennes, 1903 ( éd.), Hachette, Paris, 1912 ( éd.).\\n Les dialectes indo-européens, 1908.\\n Aperçu d'une histoire de la langue grecque, 1913.\\n Altarmenisches Elementarbuch, 1913. Heidelberg (en français : Manuel élémentaire d'Arménien classique, traduction de Gabriel Képéklian, Limoges, Lambert-Lucas, 2017 )\\n Caractères généraux des langues germaniques, 1917, rev. edn. 1949.\\n Linguistique historique et linguistique générale, 1921 (le tome II est paru en 1936 ; les deux tomes ont été réunis chez Lambert-Lucas, Limoges, 2015).\\n Les origines indo-européennes des mètres grecs, 1923.\\n Traité de grammaire comparée des langues classiques, 1924 (avec Joseph Vendryés).\\n La méthode comparative en linguistique historique, 1925, Oslo, Instituttet for Sammenlignende Kulturforskning (réimpr. Paris, Champion, 1954).\\n .\\n Dictionnaire étymologique de la langue latine, 1932 (en collab. Avec Alfred Ernout (1879-1973), éd. augmentée, par Jacques André (1910-1994), Paris : Klincksieck, 2001, \\n Meillet en Arménie, 1891, 1903, Journaux et lettres publiés par Francis Gandon, Limoges, Lambert-Lucas, 2014, .\\n\\nNotes et références\\n\\nVoir aussi\\n\\nBibliographie \\n Marc Décimo, Sciences et pataphysique, t. 2 : Comment la linguistique vint à Paris ?, De Michel Bréal à Ferdinand de Saussure, Dijon, Les Presses du réel, coll. 
Les Hétéroclites, 2014 .\\n\\nArticles connexes \\n Franz Bopp\\n Johann Kaspar Zeuss\\n\\nLiens externes \\n \\n \\n \\n\\nCommandeur de la Légion d'honneur\\nAcadémie des inscriptions et belles-lettres\\nAgrégé de grammaire\\nLinguiste français\\nPhilologue français\\nSlaviste\\nPersonnalité liée à la langue kurde\\nInstitut national des langues et civilisations orientales\\nArménologue français\\nIndo-européaniste\\nÉtudiant de l'université de Paris\\nNaissance en novembre 1866\\nNaissance à Moulins (Allier)\\nDécès en septembre 1936\\nDécès à 69 ans\\nDécès dans le Cher\\nPersonnalité inhumée à Moulins\"}" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "next(iter(fr['train']))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## LanceDB Embeddings API\n", - "Let's see how you can use the embeddings API to create an ingestion pipeline that automatically does all the vectorization for you both when ingesting new data or searching queries." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### OpenAI API Example\n", - "Let us take a look at openAI example first. LanceDB comes with OpenAI embedding function support.\n", - "* Create the instance of the available embedding function or create your own\n", - "* Create the scheme of the table, marking source end vector fields. Each embedding function can have multiple source and vector feilds\n", - "* Create a table with schema\n", - "\n", - "Doing this creates a table with where embedding function information is ingested as metadata so you can forget about all the modelling details and focus only ingesting and retrieving data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import lancedb\n", - "import getpass\n", - "from lancedb.embeddings import EmbeddingFunctionRegistry\n", - "from lancedb.pydantic import LanceModel, Vector\n", - "\n", - "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ['OPENAI_API_KEY'] = getpass.getpass(\"Enter your OpenAI API key: \")\n", - " \n", - "registry = EmbeddingFunctionRegistry().get_instance()\n", - "openai = registry.get(\"openai\").create() # uses multi-lingual model by default (768 dim)\n", - "\n", - "class Schema(LanceModel):\n", - " vector: Vector(openai.ndims()) = openai.VectorField()\n", - " text: str = openai.SourceField()\n", - " url: str\n", - " title: str\n", - " id: str\n", - " lang: str\n", - "\n", - "db = lancedb.connect(\"~/lancedb\")\n", - "tbl_openai = db.create_table(\"wikipedia-openai\", schema=Schema, mode=\"overwrite\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cohere Embedding Table\n", - "Now let's see another example using cohere embedding function which is also supported directly by LanceDB. We will follow the same steps." 
- ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import lancedb\n", - "import getpass\n", - "from lancedb.embeddings import EmbeddingFunctionRegistry\n", - "from lancedb.pydantic import LanceModel, Vector\n", - "\n", - "if \"COHERE_API_KEY\" not in os.environ:\n", - " os.environ['COHERE_API_KEY'] = getpass.getpass(\"Enter your Cohere API key: \")\n", - " \n", - "registry = EmbeddingFunctionRegistry().get_instance()\n", - "cohere = registry.get(\"cohere\").create() # uses multi-lingual model by default (768 dim)\n", - "\n", - "class Schema(LanceModel):\n", - " vector: Vector(cohere.ndims()) = cohere.VectorField()\n", - " text: str = cohere.SourceField()\n", - " url: str\n", - " title: str\n", - " id: str\n", - " lang: str\n", - "\n", - "db = lancedb.connect(\"~/lancedb\")\n", - "tbl_cohere = db.create_table(\"wikipedia-cohere\", schema=Schema, mode=\"overwrite\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Ingest data\n", - "Now, we have the table set up for ingesting the dataset. " - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 10/10 [06:10<00:00, 37.08s/it]\n" - ] - } - ], - "source": [ - "from tqdm.auto import tqdm\n", - "import time\n", - "# let's use cohere embeddings. 
Use can also set it to openai version of the table\n", - "tbl = tbl_cohere\n", - "batch_size = 1000\n", - "num_records = 10000\n", - "data = []\n", - "\n", - "for i in tqdm(range(0, num_records, batch_size)):\n", - "\n", - " for lang, dataset in datasets.items():\n", - " \n", - " batch = [next(dataset) for _ in range(batch_size)]\n", - " \n", - " texts = [x['text'] for x in batch]\n", - " ids = [f\"{x['id']}-{lang}\" for x in batch]\n", - " data.extend({\n", - " 'text': x['text'], 'title': x['title'], 'url': x['url'], 'lang': lang, 'id': f\"{lang}-{x['id']}\"\n", - " } for x in batch)\n", - "\n", - " # add in batches to avoid token limit\n", - " tbl.add(data)\n", - " data = []\n", - " time.sleep(20) # wait for 20 seconds to avoid rate limit" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Searching multi-lingual embedding space\n", - "Let us now search the table with a substring from a random batch in french" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '12',\n", - " 'url': 'https://fr.wikipedia.org/wiki/Arm%C3%A9e%20r%C3%A9publicaine%20irlandaise',\n", - " 'title': 'Armée républicaine irlandaise',\n", - " 'text': \"L'Armée républicaine irlandaise (, IRA ; ) est le nom porté, depuis le début du , par plusieurs organisations paramilitaires luttant par les armes contre la présence britannique en Irlande du Nord. Les différents groupes se référent à eux comme Óglaigh na hÉireann (« volontaires d'Irlande »).\\n\\n L' appelée aussi Old IRA, issue de l'union en 1916 entre l' (proche du Parti travailliste irlandais) et les Irish Volunteers (alors généralement proches de l'IRB), est active entre et , pendant la guerre d'indépendance irlandaise. 
Si ceux qui ont accepté le traité anglo-irlandais forment les Forces de Défense irlandaises, une partie de l'organisation, refusant cet accord, se constitue en une nouvelle Irish Republican Army, illégale.\\n L'Irish Republican Army anti-traité apparaît entre avril et du fait du refus du traité anglo-irlandais par une partie de l'Old IRA. Elle participe ainsi à la guerre civile irlandaise de à . Elle maintient son activité dans les deux Irlandes (État libre d'Irlande, indépendant, et Irlande du Nord, britannique), mais concentre son action sur les intérêts britanniques, surtout en Irlande du Nord. En 1969 l'organisation se divise, donnant naissance à lOfficial Irish Republican Army et à la Provisional Irish Republican Army, minoritaire, moins socialiste et plus activiste.\\n LOfficial Irish Republican Army, proche de l'''Official Sinn Féin, plus socialiste et moins nationaliste que la Provisional Irish Republican Army, mène des campagnes d'attentats principalement entre 1969 et 1972 durant le conflit nord-irlandais, avant de décréter un cessez-le-feu.\\n La Provisional Irish Republican Army, minoritaire après la scission de 1969 (d'où son nom de provisional, «\\xa0provisoire\\xa0») devient rapidement grâce à son militantisme la principale organisation armée républicaine du conflit nord-irlandais. Le terme de provisional est d'ailleurs abandonné vers la fin des années 1970. Elle fut active de 1969 à 1997 (date du cessez-le-feu définitif), puis déposa définitivement les armes en 2005. Refusant le processus de paix, deux organisations scissionnèrent d'avec la PIRA : la Real Irish Republican Army et la Continuity Irish Republican Army.\\n La Continuity Irish Republican Army est issue d'une scission d'avec la Provisional Irish Republican Army dès 1986. 
Opposée à l'accord du Vendredi saint de 1997, elle continue son action armée jusqu'à aujourd'hui.\\n La Real Irish Republican Army est une scission opposée au processus de paix de la Provisional Irish Republican Army, apparue en 1997 et encore active aujourd'hui.\\n LIrish Republican Liberation Army naît en 2006 d'une scission de la Continuity Irish Republican Army''.\\n\\nGénéalogie de l'Irish Republican Army\"}" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "it = iter(fr['train'])\n", - "for i in range(5):\n", - " next(it)\n", - "query = next(it)\n", - "query" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take the first line from the above text body:\n", - "```\n", - "L'Armée républicaine irlandaise (, IRA ; ) est le nom porté, depuis le début du , par plusieurs organisations paramilitaires luttant par les armes contre la présence britannique en Irlande du Nord.\n", - "```\n", - "This translates to the following in english\n", - "```\n", - "The Irish Republican Army (, IRA; ) is the name worn, since the beginning of the 19th century, by several paramilitary organizations fighting with arms against the British presence in Northern Ireland.\n", - "```\n", - "\n", - "Let us now see what at the results that are semantically closer to this in our dataset." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import getpass\n", - "import lancedb\n", - "\n", - "if \"COHERE_API_KEY\" not in os.environ:\n", - " os.environ['COHERE_API_KEY'] = getpass.getpass(\"Enter your Cohere API key: \")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can now load the table even in a different session and anything ingest or search will be automatically vectorized. Let us now run the query." 
- ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "469 ms ± 39.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%%timeit\n", - "\n", - "db = lancedb.connect(\"~/lancedb\")\n", - "tbl = db.open_table(\"wikipedia-cohere\") # We just open the existing\n", - "rs = tbl.search(query[\"text\"]).limit(3).to_list()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " **TEXT id-french-12** \n", - " L'Armée républicaine irlandaise (, IRA ; ) est le nom porté, depuis le début du , par plusieurs organisations paramilitaires luttant par les armes contre la présence britannique en Irlande du Nord. Les différents groupes se référent à eux comme Óglaigh na hÉireann (« volontaires d'Irlande »).\n", - "\n", - " L' appelée aussi Old IRA, issue de l'union en 1916 entre l' (proche du Parti travailliste irlandais) et les Irish Volunteers (alors généralement proches de l'IRB), est active entre et , pendant la guerre d'indépendance irlandaise. Si ceux qui ont accepté le traité anglo-irlandais forment les Forces de Défense irlandaises, une partie de l'organisation, refusant cet accord, se constitue en une nouvelle Irish Republican Army, illégale.\n", - " L'Irish Republican Army anti-traité apparaît entre avril et du fait du refus du traité anglo-irlandais par une partie de l'Old IRA. Elle participe ainsi à la guerre civile irlandaise de à . Elle maintient son activité dans les deux Irlandes (État libre d'Irlande, indépendant, et Irlande du Nord, britannique), mais concentre son action sur les intérêts britanniques, surtout en Irlande du Nord. 
En 1969 l'organisation se divise, donnant naissance à lOfficial Irish Republican Army et à la Provisional Irish Republican Army, minoritaire, moins socialiste et plus activiste.\n", - " LOfficial Irish Republican Army, proche de l'''Official Sinn Féin, plus socialiste et moins nationaliste que la Provisional Irish Republican Army, mène des campagnes d'attentats principalement entre 1969 et 1972 durant le conflit nord-irlandais, avant de décréter un cessez-le-feu.\n", - " La Provisional Irish Republican Army, minoritaire après la scission de 1969 (d'où son nom de provisional, « provisoire ») devient rapidement grâce à son militantisme la principale organisation armée républicaine du conflit nord-irlandais. Le terme de provisional est d'ailleurs abandonné vers la fin des années 1970. Elle fut active de 1969 à 1997 (date du cessez-le-feu définitif), puis déposa définitivement les armes en 2005. Refusant le processus de paix, deux organisations scissionnèrent d'avec la PIRA : la Real Irish Republican Army et la Continuity Irish Republican Army.\n", - " La Continuity Irish Republican Army est issue d'une scission d'avec la Provisional Irish Republican Army dès 1986. Opposée à l'accord du Vendredi saint de 1997, elle continue son action armée jusqu'à aujourd'hui.\n", - " La Real Irish Republican Army est une scission opposée au processus de paix de la Provisional Irish Republican Army, apparue en 1997 et encore active aujourd'hui.\n", - " LIrish Republican Liberation Army naît en 2006 d'une scission de la Continuity Irish Republican Army''.\n", - "\n", - "Généalogie de l'Irish Republican Army \n", - "\n", - " **TEXT id-english-14732** \n", - " The Irish Republican Army (IRA; ) was an Irish republican revolutionary paramilitary organisation. 
The ancestor of many groups also known as the Irish Republican Army, and distinguished from them as the \"Old IRA\", it was descended from the Irish Volunteers, an organisation established on 25 November 1913 that staged the Easter Rising in April 1916. In 1919, the Irish Republic that had been proclaimed during the Easter Rising was formally established by an elected assembly (Dáil Éireann), and the Irish Volunteers were recognised by Dáil Éireann as its legitimate army. Thereafter, the IRA waged a guerrilla campaign against the British occupation of Ireland in the 1919–1921 Irish War of Independence.\n", - "\n", - "Following the signing in 1921 of the Anglo-Irish Treaty, which ended the War of Independence, a split occurred within the IRA. Members who supported the treaty formed the nucleus of the Irish National Army. However, the majority of the IRA was opposed to the treaty. The anti-treaty IRA fought a civil war against the Free State Army in 1922–23, with the intention of creating a fully independent all-Ireland republic. Having lost the civil war, this group remained in existence, with the intention of overthrowing the governments of both the Irish Free State and Northern Ireland and achieving the Irish Republic proclaimed in 1916.\n", - "\n", - "Origins\n", - "\n", - "The Irish Volunteers, founded in 1913, staged the Easter Rising, which aimed at ending British rule in Ireland, in 1916. Following the suppression of the Rising, thousands of Volunteers were imprisoned or interned, leading to the break-up of the organisation. It was reorganised in 1917 following the release of first the internees and then the prisoners. 
At the army convention held in Dublin in October 1917, Éamon de Valera was elected president, Michael Collins Director for Organisation and Cathal Brugha Chairman of the Resident Executive, which in effect made him Chief of Staff.\n", - "\n", - "Following the success of Sinn Féin in the general election of 1918 and the setting up of the First Dáil (the legislature of the Irish Republic), Volunteers commenced military action against the Royal Irish Constabulary (RIC), the paramilitary police force in Ireland, and subsequently against the British Army. It began with the Soloheadbeg Ambush, when members of the Third Tipperary Brigade led by Séumas Robinson, Seán Treacy, Dan Breen and Seán Hogan, seized a quantity of gelignite, killing two RIC constables in the process.\n", - "\n", - "The Dáil leadership worried that the Volunteers would not accept its authority, given that, under their own constitution, they were bound to obey their own executive and no other body. In August 1919, Brugha proposed to the Dáil that the Volunteers be asked to swear allegiance to the Dáil, but one commentator states that another year passed before the movement took an oath of allegiance to the Irish Republic and its government in \"August 1920\". In sharp contrast, a contemporary in the struggle for Irish independence notes that by late 1919, the term \"Irish Republican Army (IRA)\" was replacing \"Volunteers\" in everyday usage. This change is attributed to the Volunteers, having accepted the authority of the Dáil, being referred to as the \"army of the Irish Republic\", popularly known as the \"Irish Republican Army\".\n", - "\n", - "A power struggle continued between Brugha and Collins, both cabinet ministers, over who had the greater influence. Brugha was nominally the superior as Minister for Defence, but Collins's power base came from his position as Director of Organisation of the IRA and from his membership on the Supreme Council of the Irish Republican Brotherhood (IRB). 
De Valera resented Collins's clear power and influence, which he saw as coming more from the secretive IRB than from his position as a Teachta Dála (TD) and minister in the Aireacht. Brugha and de Valera both urged the IRA to undertake larger, more conventional military actions for the propaganda effect but were ignored by Collins and Mulcahy. Brugha at one stage proposed the assassination of the entire British cabinet. This was also discounted due to its presumed negative effect on British public opinion. Moreover, many members of the Dáil, notably Arthur Griffith, did not approve of IRA violence and would have preferred a campaign of passive resistance to the British rule. The Dáil belatedly accepted responsibility for IRA actions in April 1921, just three months before the end of the Irish War of Independence.\n", - "\n", - "In practice, the IRA was commanded by Collins, with Richard Mulcahy as second in command. These men were able to issue orders and directives to IRA guerrilla units around the country and at times to send arms and organisers to specific areas. However, because of the localised and irregular character of the war, they were only able to exert limited control over local IRA commanders such as Tom Barry, Liam Lynch in Cork and Seán Mac Eoin in Longford.\n", - "\n", - "The IRA claimed a total strength of 70,000, but only about 3,000 were actively engaged in fighting against the Crown. The IRA distrusted those Irishmen who had fought in the British Army during the First World War as potential informers, but there were a number of exceptions such as Emmet Dalton, Tom Barry and Martin Doyle. The IRA divided its members into three classes, namely \"unreliable\", \"reliable\" and \"active\". The \"unreliable\" members were those who were nominally IRA members but did not do very much for the struggle, \"reliable\" members played a supporting role in the war while occasionally fighting and the \"active\" men those who were engaged in full-time fighting. 
Of the IRA brigades only about one to two-thirds were considered to be \"reliable\" while those considered \"active\" were even smaller. A disproportionate number of the \"active\" IRA men were teachers, medical students, shoemakers and bootmakers; those engaged in building trades like painters, carpenters and bricklayers; draper's assistants and creamery workers. The Canadian historian Peter Hart wrote \"...the guerrillas were disproportionately skilled, trained and urban\". Farmers and fishermen tended to be underrepresented in the IRA. Those Irishmen engaged in white-collar trades or working as skilled labourers were much more likely to be involved in cultural nationalist groups like the Gaelic League than farmers or fishermen, and thus to have a stronger sense of Irish nationalism. Furthermore, the authority of the Crown tended to be stronger in towns and cities than in the countryside. Thus, those engaged in Irish nationalist activities in urban areas were much more likely to come into conflict with the Crown, leading to a greater chance of radicalisation. Finally, the British tactic of blowing up the homes of IRA members had the effect of discouraging many farmers from joining the struggle as the destruction of the family farm could easily reduce a farmer and his family to destitution. Of the \"active\" IRA members, three-quarters were in their late teens or early 20s and only 5% of the \"active\" men were in the age range of 40 or older. The \"active\" members were overwhelmingly single men with only 4% being married or engaged in a relationship. The life of an \"active\" IRA man with its stress of living on the run and constantly being in hiding tended to attract single men who could adjust to this lifestyle far more easily than a man in a relationship. 
Furthermore, the IRA preferred to recruit single men as it was found that singles could devote themselves more wholeheartedly to the struggle.\n", - "\n", - "Women were active in the republican movement, but almost no women fought with the IRA whose \"active\" members were almost entirely male. The IRA was not a sectarian group and went out of its way to proclaim it was open to all Irishmen, but its membership was largely Catholic with virtually no Protestants serving as \"active\" IRA men. Hart wrote that in his study of the IRA membership that he found only three Protestants serving as \"active\" IRA men between 1919 and 1921. Of the 917 IRA men convicted by British courts under the Defence of the Realm Act in 1919, only one was a Protestant. The majority of those serving in the IRA were practising Catholics, but there was a large minority of \"pagans\" as atheists or non-practising Catholics were known in Ireland. The majority of the IRA men serving in metropolitan Britain were permanent residents with very few sent over from Ireland. The majority of the IRA men operating in Britain were Irish-born, but there a substantial minority who were British-born, something that made them especially insistent on asserting their Irish identity.\n", - "\n", - "Irish War of Independence\n", - "\n", - "IRA campaign and organisation\n", - "\n", - "The IRA fought a guerrilla war against the Crown forces in Ireland from 1919 to July 1921. The most intense period of the war was from November 1920 onwards. The IRA campaign can broadly be split into three phases. The first, in 1919, involved the re-organisation of the Irish Volunteers as a guerrilla army and only sporadic attacks. Organisers such as Ernie O'Malley were sent around the country to set up viable guerrilla units. On paper, there were 100,000 or so Volunteers enrolled after the conscription crisis of 1918. However, only about 15,000 of these participated in the guerrilla war. 
In 1919, Collins, the IRA's Director of Intelligence, organised the \"Squad\"—an assassination unit based in Dublin which killed police involved in intelligence work (the Irish playwright Brendan Behan's father Stephen Behan was a member of the Squad). Typical of Collins's sardonic sense of humour, the Squad was often referred to as his \"Twelve Apostles\". In addition, there were some arms raids on RIC barracks. By the end of 1919, four Dublin Metropolitan Police and 11 RIC men had been killed. The RIC abandoned most of their smaller rural barracks in late 1919. Around 400 of these were burned in a co-ordinated IRA operation around the country in April 1920.\n", - "\n", - "The second phase of the IRA campaign, roughly from January to July 1920, involved attacks on the fortified police barracks located in the towns. Between January and June 1920, 16 of these were destroyed and 29 badly damaged. Several events of late 1920 greatly escalated the conflict. Firstly, the British declared martial law in parts of the country—allowing for internment and executions of IRA men. Secondly they deployed paramilitary forces, the Black and Tans and Auxiliary Division, and more British Army personnel into the country. Thus, the third phase of the war (roughly August 1920 – July 1921) involved the IRA taking on a greatly expanded British force, moving away from attacking well-defended barracks and instead using ambush tactics. To this end the IRA was re-organised into \"flying columns\"—permanent guerrilla units, usually about 20 strong, although sometimes larger. In rural areas, the flying columns usually had bases in remote mountainous areas.\n", - "\n", - "The most high-profile violence of the war took place in Dublin in November 1920 and is still known as Bloody Sunday. In the early hours of the morning, Collins' \"Squad\" killed fourteen British spies. In reprisal, that afternoon, British forces opened fire on a football crowd at Croke Park, killing 14 civilians. 
Towards the end of the day, two prominent Republicans and a friend of theirs were arrested and killed by Crown Forces.\n", - "\n", - "While most areas of the country saw some violence in 1919–1921, the brunt of the war was fought in Dublin and the southern province of Munster. In Munster, the IRA carried out a significant number of successful actions against British troops, for instance, the ambushing and killing of 16 of 18 Auxiliaries by Tom Barry's column at Kilmicheal in West Cork in November 1920, or Liam Lynch's men killing 13 British soldiers near Millstreet early in the next year. At the Crossbarry Ambush in March 1921, 100 or so of Barry's men fought a sizeable engagement with a British column of 1,200, escaping from the British encircling manoeuvre. In Dublin, the \"Squad\" and elements of the IRA Dublin Brigade were amalgamated into the \"Active Service Unit\", under Oscar Traynor, which tried to carry out at least three attacks on British troops a day. Usually, these consisted of shooting or grenade attacks on British patrols. Outside Dublin and Munster, there were only isolated areas of intense activity. For instance, the County Longford IRA under Seán Mac Eoin carried out a number of well-planned ambushes and successfully defended the village of Ballinalee against Black and Tan reprisals in a three-hour gun battle. In County Mayo, large-scale guerrilla action did not break out until spring 1921, when two British forces were ambushed at Carrowkennedy and Tourmakeady. Elsewhere, fighting was more sporadic and less intense.\n", - "\n", - "In Belfast, the war had a character all of its own. The city had a Protestant and unionist majority and IRA actions were responded to with reprisals against the Catholic population, including killings (such as the McMahon killings) and the burning of many homes – as on Belfast's Bloody Sunday. 
The IRA in Belfast and the North generally, although involved in protecting the Catholic community from loyalists and state forces, undertook a retaliatory arson campaign against factories and commercial premises. The violence in Belfast alone, which continued until October 1922 (long after the truce in the rest of the country), claimed the lives of between 400 and 500 people.\n", - "\n", - "In April 1921, the IRA was again reorganised, in line with the Dáil's endorsement of its actions, along the lines of a regular army. Divisions were created based on region, with commanders being given responsibility, in theory, for large geographical areas. In practice, this had little effect on the localised nature of the guerrilla warfare.\n", - "\n", - "In May 1921, the IRA in Dublin attacked and burned the Custom House. The action was a serious setback as five members were killed and eighty captured.\n", - "\n", - "By the end of the war in July 1921, the IRA was hard-pressed by the deployment of more British troops into the most active areas and a chronic shortage of arms and ammunition. It has been estimated that the IRA had only about 3,000 rifles (mostly captured from the British) during the war, with a larger number of shotguns and pistols. An ambitious plan to buy arms from Italy in 1921 collapsed when the money did not reach the arms dealers. Towards the end of the war, some Thompson submachine guns were imported from the United States; however 450 of these were intercepted by the American authorities and the remainder only reached Ireland shortly before the Truce.\n", - "\n", - "By June 1921, Collins' assessment was that the IRA was within weeks, possibly even days, of collapse. It had few weapons or ammunition left. Moreover, almost 5,000 IRA men had been imprisoned or interned and over 500 killed. Collins and Mulcahy estimated that the number of effective guerrilla fighters was down to 2,000–3,000. 
However, in the summer of 1921, the war was abruptly ended.\n", - "\n", - "The British recruited hundreds of World War I veterans into the RIC and sent them to Ireland. Because there was initially a shortage of RIC uniforms, the veterans at first wore a combination of dark green RIC uniforms and khaki British Army uniforms, which inspired the nickname \"Black and Tans\". The brutality of the Black and Tans is now well-known, although the greatest violence attributed to the Crown's forces was often that of the Auxiliary Division of the Constabulary. One of the strongest critics of the Black and Tans was King George V who in May 1921 told Lady Margery Greenwood that \"he hated the idea of the Black and Tans.\"\n", - "\n", - "The IRA was also involved in the destruction of many stately homes in Munster. The Church of Ireland Gazette recorded numerous instances of Unionists and Loyalists being shot, burnt or forced from their homes during the early 1920s. In County Cork between 1920 and 1923 the IRA shot over 200 civilians of whom over 70 (or 36%) were Protestants: five times the percentage of Protestants in the civilian population. This was due to the historical inclination of Protestants towards loyalty to the United Kingdom. A convention of Irish Protestant Churches in Dublin in May 1922 signed a resolution placing \"on record\" that \"hostility to Protestants by reason of their religion has been almost, if not wholly, unknown in the twenty-six counties in which Protestants are in the minority.\"\n", - "\n", - "Many historic buildings in Ireland were destroyed during the war, most famously the Custom House in Dublin, which was disastrously attacked on de Valera's insistence, to the horror of the more militarily experienced Collins. 
As he feared, the destruction proved a pyrrhic victory for the Republic, with so many IRA men killed or captured that the IRA in Dublin suffered a severe blow.\n", - "\n", - "This was also a period of social upheaval in Ireland, with frequent strikes as well as other manifestations of class conflict. In this regard, the IRA acted to a large degree as an agent of social control and stability, driven by the need to preserve cross-class unity in the national struggle, and on occasion being used to break strikes.\n", - "\n", - "Assessments of the effectiveness of the IRA's campaign vary. They were never in a position to engage in conventional warfare. The political, military and financial costs of remaining in Ireland were higher than the British government was prepared to pay and this in a sense forced them into negotiations with the Irish political leaders. According to historian Michael Hopkinson, the guerrilla warfare \"was often courageous and effective\". Historian David Fitzpatrick observes, \"The guerrilla fighters...were vastly outnumbered by the forces of the Crown... The success of the Irish Volunteers in surviving so long is therefore noteworthy.\"\n", - "\n", - "Truce and treaty\n", - "\n", - "David Lloyd George, the British Prime Minister, at the time, found himself under increasing pressure (both internationally and from within the British Isles) to try to salvage something from the situation. This was a complete reversal on his earlier position. He had consistently referred to the IRA as a \"murder gang\" up until then. An unexpected olive branch came from King George V, who, in a speech in Belfast called for reconciliation on all sides, changed the mood and enabled the British and Irish Republican governments to agree to a truce. The Truce was agreed on 11 July 1921. On 8 July, de Valera met General Nevil Macready, the British commander in chief in Ireland and agreed terms. 
The IRA was to retain its arms and the British Army was to remain in barracks for the duration of peace negotiations. Many IRA officers interpreted the truce only as a temporary break in fighting. They continued to recruit and train volunteers, with the result that the IRA had increased its number to over 72,000 men by early 1922.\n", - "\n", - "Negotiations on an Anglo-Irish Treaty took place in late 1921 in London. The Irish delegation was led by Arthur Griffith and Michael Collins.\n", - "\n", - "The most contentious areas of the Treaty for the IRA were abolition of the Irish Republic declared in 1919, the status of the Irish Free State as a dominion in the British Commonwealth and the British retention of the so-called Treaty Ports on Ireland's south coast. These issues were the cause of a split in the IRA and ultimately, the Irish Civil War.\n", - "\n", - "Under the Government of Ireland Act 1920, Ireland was partitioned, creating Northern Ireland and Southern Ireland. Under the terms of the Anglo-Irish agreement of 6 December 1921, which ended the war (1919–21), Northern Ireland was given the option of withdrawing from the new state, the Irish Free State, and remaining part of the United Kingdom. The Northern Ireland parliament chose to do that. An Irish Boundary Commission was then set up to review the border.\n", - "\n", - "Irish leaders expected that it would so reduce Northern Ireland's size, by transferring nationalist areas to the Irish Free State, as to make it economically unviable. Partition was not by itself the key breaking point between pro- and anti-Treaty campaigners; both sides expected the Boundary Commission to greatly reduce Northern Ireland. Moreover, Michael Collins was planning a clandestine guerrilla campaign against the Northern state using the IRA. In early 1922, he sent IRA units to the border areas and sent arms to northern units. 
It was only afterwards, when partition was confirmed, that a united Ireland became the preserve of anti-Treaty Republicans.\n", - "\n", - "IRA and the Anglo-Irish Treaty\n", - "\n", - "The IRA leadership was deeply divided over the decision by the Dáil to ratify the Treaty. Despite the fact that Michael Collins – the de facto leader of the IRA – had negotiated the Treaty, many IRA officers were against it. Of the General Headquarters (GHQ) staff, nine members were in favour of the Treaty while four opposed it. The majority of the IRA rank-and-file were against the Treaty; in January–June 1922, their discontent developed into open defiance of the elected civilian Provisional government of Ireland.\n", - "\n", - "Both sides agreed that the IRA's allegiance was to the (elected) Dáil of the Irish Republic, but the anti-Treaty side argued that the decision of the Dáil to accept the Treaty (and set aside the Irish Republic) meant that the IRA no longer owed that body its allegiance. They called for the IRA to withdraw from the authority of the Dáil and to entrust the IRA Executive with control over the army. On 16 January, the first IRA division – the 2nd Southern Division led by Ernie O'Malley – repudiated the authority of the GHQ. A month later, on 18 February, Liam Forde, O/C of the IRA Mid-Limerick Brigade, issued a proclamation stating that: \"We no longer recognise the authority of the present head of the army, and renew our allegiance to the existing Irish Republic\". This was the first unit of the IRA to break with the pro-Treaty government.\n", - "\n", - "On 22 March, Rory O'Connor held what was to become an infamous press conference and declared that the IRA would no longer obey the Dáil as (he said) it had violated its Oath to uphold the Irish Republic. He went on to say that \"we repudiate the Dáil ... 
We will set up an Executive which will issue orders to the IRA all over the country.\" In reply to the question on whether this meant they intended to create a military dictatorship, O'Connor said: \"You can take it that way if you like.\"\n", - "\n", - "On 28 March, the (anti-Treaty) IRA Executive issued statement stating that Minister of Defence (Richard Mulcahy) and the Chief-of-Staff (Eoin O'Duffy) no longer exercised any control over the IRA. In addition, it ordered an end to the recruitment to the new military and police forces of the Provisional Government. Furthermore, it instructed all IRA units to reaffirm their allegiance to the Irish Republic on 2 April.\n", - "The stage was set for civil war over the Treaty.\n", - "\n", - "Civil War\n", - "\n", - "The pro-treaty IRA soon became the nucleus of the new (regular) Irish National Army created by Collins and Richard Mulcahy. British pressure, and tensions between the pro- and anti-Treaty factions of the IRA, led to a bloody civil war, ending in the defeat of the anti-Treaty faction. On 24 May 1923, Frank Aiken, the (anti-treaty) IRA Chief-of-Staff, called a cease-fire. Many left political activity altogether, but a minority continued to insist that the new Irish Free State, created by the \"illegitimate\" Treaty, was an illegitimate state. They asserted that their \"IRA Army Executive\" was the real government of a still-existing Irish Republic. The IRA of the Civil War and subsequent organisations that have used the name claim lineage from that group, which is covered in full at Irish Republican Army (1922–1969).\n", - "\n", - "For information on later organisations using the name Irish Republican Army, see the table below. 
For a genealogy of organisations using the name IRA after 1922, see List of organisations known as the Irish Republican Army.\n", - "\n", - "See also\n", - "List of films featuring the Irish Republican Army\n", - "\n", - "References\n", - "\n", - "Bibliography\n", - "\n", - "Further reading\n", - "\n", - "External links\n", - "\n", - "Bureau of Military History, 1913-1921 at militaryarchives.ie\n", - "Irish Volunteers History, 1913-1922 at IVCO\n", - "\n", - " \n", - "Institutions of the Irish Republic (1919–1922)\n", - "Guerrilla organizations\n", - "Irish republican militant groups\n", - "National liberation armies\n", - "Anti-imperialism in Europe \n", - "\n", - " **TEXT id-english-5859** \n", - " The Continuity Irish Republican Army (Continuity IRA or CIRA), styling itself as the Irish Republican Army (), is an Irish republican paramilitary group that aims to bring about a united Ireland. It claims to be a direct continuation of the original Irish Republican Army and the national army of the Irish Republic that was proclaimed in 1916. It emerged from a split in the Provisional IRA in 1986 but did not become active until the Provisional IRA ceasefire of 1994. It is an illegal organisation in the Republic of Ireland and is designated a terrorist organisation in the United Kingdom, New Zealand and the United States. It has links with the political party Republican Sinn Féin (RSF).\n", - "\n", - "Since 1994, the CIRA has waged a campaign in Northern Ireland against the British Army and the Police Service of Northern Ireland (PSNI), formerly the Royal Ulster Constabulary. This is part of a wider campaign against the British security forces by dissident republican paramilitaries. It has targeted the security forces in gun attacks and bombings, as well as with grenades, mortars and rockets. 
The CIRA has also carried out bombings with the goal of causing economic harm and/or disruption, as well as many punishment attacks on alleged criminals.\n", - "\n", - "To date, it has been responsible for the death of one PSNI officer. The CIRA is smaller and less active than the Real IRA, and there have been a number of splits within the organisation since the mid-2000s.\n", - "\n", - "Origins\n", - "The Continuity IRA has its origins in a split in the Provisional IRA. In September 1986, the Provisional IRA held a General Army Convention (GAC), the organisation's supreme decision-making body. It was the first GAC in 16 years. The meeting, which like all such meetings was secret, was convened to discuss among other resolutions, the articles of the Provisional IRA constitution which dealt with abstentionism, specifically its opposition to the taking of seats in Dáil Éireann (the parliament of the Republic of Ireland). The GAC passed motions (by the necessary two-thirds majority) allowing members of the Provisional IRA to discuss and debate the taking of parliamentary seats, and the removal of the ban on members of the organisation from supporting any successful republican candidate who took their seat in Dáil Éireann.\n", - "\n", - "The Provisional IRA convention delegates opposed to the change in the constitution claimed that the convention was gerrymandered \"by the creation of new IRA organisational structures for the convention, including the combinations of Sligo-Roscommon-Longford and Wicklow-Wexford-Waterford.\" The only IRA body that supported this viewpoint was the outgoing IRA Executive. Those members of the outgoing Executive who opposed the change comprised a quorum. They met, dismissed those in favour of the change, and set up a new Executive. They contacted Tom Maguire, who was a commander in the old IRA and had supported the Provisionals against the Official IRA (see Irish republican legitimatism), and asked him for support. 
Maguire had also been contacted by supporters of Gerry Adams, then president of Sinn Féin, and a supporter of the change in the Provisional IRA constitution.\n", - "\n", - "Maguire rejected Adams' supporters, supported the IRA Executive members opposed to the change, and named the new organisers the Continuity Army Council. In a 1986 statement, he rejected \"the legitimacy of an Army Council styling itself the Council of the Irish Republican Army which lends support to any person or organisation styling itself as Sinn Féin and prepared to enter the partition parliament of Leinster House.\" In 1987, Maguire described the \"Continuity Executive\" as the \"lawful Executive of the Irish Republican Army.\"\n", - "\n", - "Campaign\n", - "\n", - "Initially, the Continuity IRA did not reveal its existence, either in the form of press statements or paramilitary activity. Although the Garda Síochána had suspicions that the organisation existed, they were unsure of its name, labelling it the \"Irish National Republican Army\". On 21 January 1994, on the 75th anniversary of the First Dáil Éireann, Continuity IRA volunteers offered a \"final salute\" to Tom Maguire by firing over his grave, and a public statement and a photo were published in Saoirse Irish Freedom. In February 1994 it was reported that in previous months Gardaí had found arms dumps along the Cooley Peninsula in County Louth that did not belong to the Provisional IRA, and forensics tests determined had been used for firing practice recently.\n", - "\n", - "It was only after the Provisional IRA declared a ceasefire in 1994 that the Continuity IRA became active, announcing its intention to continue the campaign against British rule. The CIRA continues to oppose the Good Friday Agreement and, unlike the Provisional IRA (and the Real IRA in 1998), the CIRA has not announced a ceasefire or agreed to participate in weapons decommissioning—nor is there any evidence that it will. 
In the 18th Independent Monitoring Commission's report, the RIRA, the CIRA and the Irish National Liberation Army (INLA) were deemed a potential future threat. The CIRA was labelled \"active, dangerous and committed and... capable of a greater level of violent and other crime\". Like the RIRA and RIRA splinter group Óglaigh na hÉireann, it too sought funds for expansion. It is also known to have worked with the INLA.\n", - "\n", - "The CIRA has been involved in a number of bombing and shooting incidents. Targets of the CIRA have included the British military, the Northern Ireland police (both the Royal Ulster Constabulary and its successor the Police Service of Northern Ireland). Since the Good Friday Agreement in 1998 the CIRA, along with other paramilitaries opposing the ceasefire, have been involved with a countless number of punishment shootings and beatings. By 2005 the CIRA was believed to be an established presence on the island of Great Britain with the capability of launching attacks. A bomb defused in Dublin in December 2005 was believed to have been the work of the CIRA. In February 2006, the Independent Monitoring Commission (IMC) blamed the CIRA for planting four bombs in Northern Ireland during the final quarter of 2005, as well as several hoax bomb warnings. The IMC also blamed the CIRA for the killings of two former CIRA members in Belfast, who had stolen CIRA weapons and established a rival organisation.\n", - "\n", - "The CIRA continued to be active in both planning and undertaking attacks on the PSNI. The IMC said they tried to lure police into ambushes, while they have also taken to stoning and using petrol bombs. In addition, other assaults, robbery, tiger kidnapping, extortion, fuel laundering and smuggling were undertaken by the group. The CIRA also actively took part in recruiting and training members, including disgruntled former Provisional IRA members. 
As a result of this continued activity the IMC said the group remained \"a very serious threat\".\n", - "\n", - "On 10 March 2009 the CIRA claimed responsibility for the fatal shooting of a PSNI officer in Craigavon, County Armagh—the first police fatality in Northern Ireland since 1998. The officer was fatally shot by a sniper as he and a colleague investigated \"suspicious activity\" at a house nearby when a window was smashed by youths causing the occupant to phone the police. The PSNI officers responded to the emergency call, giving a CIRA sniper the chance to shoot and kill officer Stephen Carroll. Carroll was killed two days after the Real IRA's 2009 Massereene Barracks shooting at Massereene Barracks in Antrim. In a press interview with Republican Sinn Féin some days later, regarded by some to be the political wing of the Continuity IRA, Richard Walsh described the attacks as \"acts of war\".\n", - "\n", - "In 2013, the Continuity IRA's 'South Down Brigade' threatened a Traveller family in Newry and published a statement in the local newspaper. There were negotiations with community representatives and the CIRA announced the threat was lifted. It was believed the threat was issued after a Traveller feud which resulted in a pipe bomb attack in Bessbrook, near Newry. The Continuity IRA is believed to be strongest in the County Fermanagh – North County Armagh area (Craigavon, Armagh and Lurgan). It is believed to be behind a number of attacks such as pipe bombings, rocket attacks, gun attacks, and the PSNI claimed it orchestrated riots a number of times to lure police officers into areas such as Kilwilkie in Lurgan and Drumbeg in Craigavon in order to attack them. It also claimed the group orchestrated a riot during a security alert in Lurgan. 
The alert turned out to be a hoax.\n", - "\n", - "On Easter 2016, the Continuity IRA marched in paramilitary uniforms through North Lurgan, Co Armagh, without any hindrance from the PSNI who monitored the parade from a police helicopter.\n", - "\n", - "In July and August 2019 the CIRA carried out attempted bomb attacks on the PSNI in Craigavon, County Armagh and Wattlebridge, County Fermanagh.\n", - "\n", - "On 5 February 2020, a bomb planted by the CIRA was found by the PSNI in a lorry in Lurgan. The CIRA believed the lorry was going to be put on a North Channel ferry to Scotland in January 2020.\n", - "\n", - "Claim to legitimacy\n", - " Similar to the claim put forward by the Provisional IRA after its split from the Official IRA in 1969, the Continuity IRA claims to be the legitimate continuation of the original Irish Republican Army or Óglaigh na hÉireann. This argument is based on the view that the surviving anti-Treaty members of the Second Dáil delegated their \"authority\" to the IRA Army Council in 1938. As further justification for this claim, Tom Maguire, one of those anti-Treaty members of the Second Dáil, issued a statement in favour of the Continuity IRA, just as he had done in 1969 in favour of the Provisionals. J. Bowyer Bell, in his The Irish Troubles, describes Maguire's opinion in 1986: \"abstentionism was a basic tenet of republicanism, a moral issue of principle. Abstentionism gave the movement legitimacy, the right to wage war, to speak for a Republic all but established in the hearts of the people\". Maguire's stature was such that a delegation from Gerry Adams sought his support in 1986, but was rejected.Robert W. White, Ruairí Ó Brádaigh, The Life and Politics of an Irish Revolutionary, 2006, p. 
310.\n", - "\n", - "Relationship to other organisations\n", - "These changes within the IRA were accompanied by changes on the political side and at the 1986 Sinn Féin Ard Fheis (party conference), which followed the IRA Convention, the party's policy of abstentionism, which forbade Sinn Féin elected representatives from taking seats in the Oireachtas, the parliament of the Republic, was dropped. On 2 November, the 628 delegates present cast their votes, the result being 429 to 161. The traditionalists, having lost at both conventions, walked out of the Mansion House, met that evening at the West County Hotel, and reformed as Republican Sinn Féin (RSF).\n", - "\n", - "According to a report in the Cork Examiner, the Continuity IRA's first chief of staff was Dáithí Ó Conaill, who also served as the first chairman of RSF from 1986 to 1987. The Continuity IRA and RSF perceive themselves as forming a \"true\" Republican Movement.\n", - "\n", - "Structure and status\n", - "The leadership of the Continuity IRA is believed to be based in the provinces of Munster and Ulster. It was alleged that its chief of staff was a Limerick man and that a number of other key members were from that county, until their expulsion. Dáithí Ó Conaill was the first chief of staff until 1991. In 2004 the United States (US) government believed the Continuity IRA consisted of fewer than fifty hardcore activists. In 2005, Irish Minister for Justice, Equality and Law Reform Michael McDowell told Dáil Éireann that the organisation had a maximum of 150 members.\n", - "\n", - "The CIRA is an illegal organisation under UK (section 11(1) of the Terrorism Act 2000) and ROI law due to the use of 'IRA' in the group's name, in a situation analogous to that of the Real Irish Republican Army (RIRA). Membership of the organisation is punishable by a sentence of up to ten years imprisonment under UK law. 
On 31 May 2001 Dermot Gannon became the first person to be convicted of membership of the CIRA solely on the word of a Garda Síochána chief superintendent. On 13 July 2004, the US government designated the CIRA as a 'Foreign Terrorist Organization'. This made it illegal for Americans to provide material support to the CIRA, requires US financial institutions to block the group's assets and denies alleged CIRA members visas into the US.\n", - "\n", - "External aid and arsenal\n", - "The US government suspects the Continuity IRA of having received funds and arms from supporters in the United States. Security sources in Ireland have expressed the suspicion that, in co-operation with the RIRA, the Continuity IRA may have acquired arms and materiel from the Balkans. They also suspect that the Continuity IRA arsenal contains some weapons that were taken from Provisional IRA arms dumps, including a few dozen rifles, machine guns, and pistols; a small amount of the explosive Semtex; and a few dozen detonators.\n", - "\n", - "Internal tension and splits\n", - "In 2005, several members of the CIRA, who were serving prison sentences in Portlaoise Prison for paramilitary activity, left the organisation. Some transferred to the INLA landing of the prison, but the majority of those who left are now independent and on E4 landing. The remaining CIRA prisoners have moved to D Wing. Supporters of the Continuity IRA leadership claim that this resulted from an internal disagreement, which although brought to a conclusion, was followed by some people leaving the organisation anyway. Supporters of the disaffected members established the Concerned Group for Republican Prisoners. 
Most of those who had left went back to the CIRA, or dissociated themselves from the CGRP, which is now defunct.\n", - "\n", - "In February 2006, the Independent Monitoring Commission claimed in a report on paramilitary activity that two groups, styling themselves as \"Óglaigh na hÉireann\" and \"Saoirse na hÉireann\", had been formed after a split in the Continuity IRA either in early 2006 or late 2005. The Óglaigh na hÉireann group was responsible for a number of pipe bomb attacks on the PSNI, bomb hoaxes, and robberies, the IMC also claimed the organisation was responsible for the killing of Andrew Burns on 12 February 2008 and was seeking to recruit former members of the RIRA. The Saoirse na hÉireann (SNH) group was composed of \"disaffected and largely young republicans\" and was responsible for a number of bomb hoaxes, two of which took place in September 2006. It was thought to have operated largely in republican areas of Belfast . The groups had apparently ceased operations by early 2009.\n", - "\n", - "In 2007, the Continuity IRA was responsible for shooting dead two of its members who had left and attempted to create their own organisation. Upon leaving the CIRA, they had allegedly taken a number of guns with them. The Continuity IRA is believed by Gardaí to have been involved in a number of gangland killings in Dublin and Limerick.\n", - "\n", - "In July 2010, members of a \"militant Northern-based faction within the CIRA\" led by a well-known member from south Londonderry claimed to have overthrown the leadership of the organisation. They also claimed that an Army Convention representing \"95 per cent of volunteers\" had unanimously elected a new 12-member Army Executive, which in turn appointed a new seven-member Army Council. The moves came as a result of dissatisfication with the southern-based leadership and the apparent winding-down of military operations. 
A senior source from RSF said: \"We would see them [the purported new leadership] as just another splinter group that has broken away.\" This organisation is referred to as the Real CIRA.\n", - "\n", - "In June 2011 CIRA member Liam Kenny was murdered, allegedly by drug dealers, at his home in Clondalkin, West Dublin. On 28 November 2011 an innocent man was mistakenly shot dead in retaliation for the murder of Liam Kenny. Limerick Real IRA volunteer Rose Lynch pleaded guilty to this murder at the Special Criminal Court and was sentenced to life imprisonment.\n", - "\n", - "In July 2012 the CIRA announced it had a new leadership after expelling members who had been working against the organisation.\n", - "\n", - "In April 2014 a former leading member of the Belfast Continuity IRA who had been expelled from the organisation, Tommy Crossan, was shot dead.\n", - "\n", - "In popular culture\n", - "The CIRA are depicted in RTÉ's TV series crime drama Love/Hate''.\n", - "\n", - "Notes\n", - "\n", - "References\n", - "\n", - " \n", - "Irish republican militant groups\n", - "Organised crime groups in Ireland\n", - "1986 establishments in Ireland \n", - "\n" - ] - } - ], - "source": [ - "for r in rs:\n", - " print(f\" **TEXT id-{r['id']}** \\n {r['text']} \\n\")\n", - "#" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see in the above result, the closest match is the text itself that we used to search. The second closest match in an English text with a similar semantic meaning referring to IRA. 
This is what a multi-lingual embedding model can do.\n", - "\n", - "Find more examples on [VectorDB-recipes](https://github.com/lancedb/vectordb-recipes) repo" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/src/notebooks/multi_modal_video_RAG.ipynb b/docs/src/notebooks/multi_modal_video_RAG.ipynb deleted file mode 100644 index b9ef08bf..00000000 --- a/docs/src/notebooks/multi_modal_video_RAG.ipynb +++ /dev/null @@ -1,569 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Open\n", - "\n", - "![example](https://github.com/raghavdixit99/assets/assets/34462078/361596dc-e839-4957-9345-eaa43a49b4b0)\n", - "\n", - "# Multimodal RAG for video processing using LlamaIndex, OpenAI GPT4V \n", - "\n", - "In this notebook, we showcase a Multimodal RAG architecture designed for video processing. We utilize OpenAI GPT4V MultiModal LLM class that employs [CLIP](https://github.com/openai/CLIP) to generate multimodal embeddings. Furthermore, we use [LanceDBVectorStore](https://docs.llamaindex.ai/en/latest/examples/vector_stores/LanceDBIndexDemo.html#) for efficient vector storage.\n", - "\n", - "\n", - "Steps:\n", - "1. Download video from YouTube, process and store it.\n", - "\n", - "2. Build Multi-Modal index and vector store for both texts and images.\n", - "\n", - "3. Retrieve relevant images and context, use both to augment the prompt.\n", - "\n", - "4. Using GPT4V for reasoning the correlations between the input query and augmented data and generating final response." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install llama-index-vector-stores-lancedb\n", - "%pip install llama-index-multi-modal-llms-openai" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install llama-index-multi-modal-llms-openai\n", - "%pip install llama-index-vector-stores-lancedb\n", - "%pip install llama-index-embeddings-clip" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install llama_index ftfy regex tqdm\n", - "%pip install -U openai-whisper\n", - "%pip install git+https://github.com/openai/CLIP.git\n", - "%pip install torch torchvision\n", - "%pip install matplotlib scikit-image\n", - "%pip install lancedb\n", - "%pip install moviepy\n", - "%pip install pytube\n", - "%pip install pydub\n", - "%pip install SpeechRecognition\n", - "%pip install ffmpeg-python\n", - "%pip install soundfile" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from moviepy.editor import VideoFileClip\n", - "from pathlib import Path\n", - "import speech_recognition as sr\n", - "from pytube import YouTube\n", - "from pprint import pprint" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "OPENAI_API_TOKEN = \"\"\n", - "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_TOKEN" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Set configuration for input below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "video_url = \"https://www.youtube.com/watch?v=d_qvLDhkg00\"\n", - "output_video_path = \"./video_data/\"\n", - "output_folder = \"./mixed_data/\"\n", - "output_audio_path = \"./mixed_data/output_audio.wav\"\n", - "\n", 
- "filepath = output_video_path + \"input_vid.mp4\"\n", - "Path(output_folder).mkdir(parents=True, exist_ok=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Download and process videos into appropriate format for generating/storing embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "import matplotlib.pyplot as plt\n", - "import os\n", - "\n", - "\n", - "def plot_images(image_paths):\n", - " images_shown = 0\n", - " plt.figure(figsize=(16, 9))\n", - " for img_path in image_paths:\n", - " if os.path.isfile(img_path):\n", - " image = Image.open(img_path)\n", - "\n", - " plt.subplot(2, 3, images_shown + 1)\n", - " plt.imshow(image)\n", - " plt.xticks([])\n", - " plt.yticks([])\n", - "\n", - " images_shown += 1\n", - " if images_shown >= 7:\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def download_video(url, output_path):\n", - " \"\"\"\n", - " Download a video from a given url and save it to the output path.\n", - "\n", - " Parameters:\n", - " url (str): The url of the video to download.\n", - " output_path (str): The path to save the video to.\n", - "\n", - " Returns:\n", - " dict: A dictionary containing the metadata of the video.\n", - " \"\"\"\n", - " yt = YouTube(url)\n", - " metadata = {\"Author\": yt.author, \"Title\": yt.title, \"Views\": yt.views}\n", - " yt.streams.get_highest_resolution().download(\n", - " output_path=output_path, filename=\"input_vid.mp4\"\n", - " )\n", - " return metadata\n", - "\n", - "\n", - "def video_to_images(video_path, output_folder):\n", - " \"\"\"\n", - " Convert a video to a sequence of images and save them to the output folder.\n", - "\n", - " Parameters:\n", - " video_path (str): The path to the video file.\n", - " output_folder (str): The path to the folder to save the images to.\n", - "\n", - " 
\"\"\"\n", - " clip = VideoFileClip(video_path)\n", - " clip.write_images_sequence(\n", - " os.path.join(output_folder, \"frame%04d.png\"), fps=0.2\n", - " )\n", - "\n", - "\n", - "def video_to_audio(video_path, output_audio_path):\n", - " \"\"\"\n", - " Convert a video to audio and save it to the output path.\n", - "\n", - " Parameters:\n", - " video_path (str): The path to the video file.\n", - " output_audio_path (str): The path to save the audio to.\n", - "\n", - " \"\"\"\n", - " clip = VideoFileClip(video_path)\n", - " audio = clip.audio\n", - " audio.write_audiofile(output_audio_path)\n", - "\n", - "\n", - "def audio_to_text(audio_path):\n", - " \"\"\"\n", - " Convert audio to text using the SpeechRecognition library.\n", - "\n", - " Parameters:\n", - " audio_path (str): The path to the audio file.\n", - "\n", - " Returns:\n", - " test (str): The text recognized from the audio.\n", - "\n", - " \"\"\"\n", - " recognizer = sr.Recognizer()\n", - " audio = sr.AudioFile(audio_path)\n", - "\n", - " with audio as source:\n", - " # Record the audio data\n", - " audio_data = recognizer.record(source)\n", - "\n", - " try:\n", - " # Recognize the speech\n", - " text = recognizer.recognize_whisper(audio_data)\n", - " except sr.UnknownValueError:\n", - " print(\"Speech recognition could not understand the audio.\")\n", - " except sr.RequestError as e:\n", - " print(f\"Could not request results from service; {e}\")\n", - "\n", - " return text" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " metadata_vid = download_video(video_url, output_video_path)\n", - " video_to_images(filepath, output_folder)\n", - " video_to_audio(filepath, output_audio_path)\n", - " text_data = audio_to_text(output_audio_path)\n", - "\n", - " with open(output_folder + \"output_text.txt\", \"w\") as file:\n", - " file.write(text_data)\n", - " print(\"Text data saved to file\")\n", - " file.close()\n", - " 
os.remove(output_audio_path)\n", - " print(\"Audio file removed\")\n", - "\n", - "except Exception as e:\n", - " raise e" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Create the multi-modal index " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index.core.indices import MultiModalVectorStoreIndex\n", - "from llama_index.core import SimpleDirectoryReader, StorageContext\n", - "\n", - "from llama_index.core import SimpleDirectoryReader, StorageContext\n", - "from llama_index.vector_stores.lancedb import LanceDBVectorStore\n", - "\n", - "\n", - "from llama_index.core import SimpleDirectoryReader\n", - "\n", - "text_store = LanceDBVectorStore(uri=\"lancedb\", table_name=\"text_collection\")\n", - "image_store = LanceDBVectorStore(uri=\"lancedb\", table_name=\"image_collection\")\n", - "storage_context = StorageContext.from_defaults(\n", - " vector_store=text_store, image_store=image_store\n", - ")\n", - "\n", - "# Create the MultiModal index\n", - "documents = SimpleDirectoryReader(output_folder).load_data()\n", - "\n", - "index = MultiModalVectorStoreIndex.from_documents(\n", - " documents,\n", - " storage_context=storage_context,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Use index as retriever to fetch top k (5 in this example) results from the multimodal vector index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "retriever_engine = index.as_retriever(\n", - " similarity_top_k=5, image_similarity_top_k=5\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Set the RAG prompt template " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "metadata_str = json.dumps(metadata_vid)\n", - "\n", - "qa_tmpl_str = (\n", - " 
\"Given the provided information, including relevant images and retrieved context from the video, \\\n", - " accurately and precisely answer the query without any additional prior knowledge.\\n\"\n", - " \"Please ensure honesty and responsibility, refraining from any racist or sexist remarks.\\n\"\n", - " \"---------------------\\n\"\n", - " \"Context: {context_str}\\n\"\n", - " \"Metadata for video: {metadata_str} \\n\"\n", - " \"---------------------\\n\"\n", - " \"Query: {query_str}\\n\"\n", - " \"Answer: \"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Retrieve most similar text/image embeddings baseed on user query from the DB" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index.core.response.notebook_utils import display_source_node\n", - "from llama_index.core.schema import ImageNode\n", - "\n", - "\n", - "def retrieve(retriever_engine, query_str):\n", - " retrieval_results = retriever_engine.retrieve(query_str)\n", - "\n", - " retrieved_image = []\n", - " retrieved_text = []\n", - " for res_node in retrieval_results:\n", - " if isinstance(res_node.node, ImageNode):\n", - " retrieved_image.append(res_node.node.metadata[\"file_path\"])\n", - " else:\n", - " display_source_node(res_node, source_length=200)\n", - " retrieved_text.append(res_node.text)\n", - "\n", - " return retrieved_image, retrieved_text" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Add query now, fetch relevant details including images and augment the prompt template " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "**Node ID:** bda08ef1-137c-4d69-9bcc-b7005a41a13c
**Similarity:** 0.7431071996688843
**Text:** The basic function underlying a normal distribution, aka a Gaussian, is e to the negative x squared. But you might wonder why this function? Of all the expressions we could dream up that give you s...
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Node ID:** 7d6d0f32-ce16-461b-be54-883241252e50
**Similarity:** 0.7335695028305054
**Text:** This step is actually pretty technical, it goes a little beyond what I want to talk about here. Often use these objects called moment generating functions, that gives you a very abstract argument t...
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Node ID:** 519fb788-3927-4842-ad5c-88be61deaf65
**Similarity:** 0.7069740295410156
**Text:** The essence of what we want to compute is what the convolution between two copies of this function looks like. If you remember, in the last video, we had two different ways to visualize convolution...
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Node ID:** f265c3fb-3c9f-4f36-aa2a-fb15efff9783
**Similarity:** 0.706935465335846
**Text:** This is the important point. All of the stuff that's involving s is now entirely separate from the integrated variable. This remaining integral is a little bit tricky. I did a whole video on it. It...
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "query_str = \"Using examples from video, explain all things covered in the video regarding the gaussian function\"\n", - "\n", - "img, txt = retrieve(retriever_engine=retriever_engine, query_str=query_str)\n", - "image_documents = SimpleDirectoryReader(\n", - " input_dir=output_folder, input_files=img\n", - ").load_data()\n", - "context_str = \"\".join(txt)\n", - "plot_images(img)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Generate final response using GPT4V" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "('The video by 3Blue1Brown, titled \"A pretty reason why Gaussian + Gaussian = '\n", - " 'Gaussian,\" covers several aspects of the Gaussian function, also known as '\n", - " \"the normal distribution. Here's a summary of the key points discussed in the \"\n", - " 'video:\\n'\n", - " '\\n'\n", - " '1. **Central Limit Theorem**: The video begins by discussing the central '\n", - " 'limit theorem, which states that the sum of multiple copies of a random '\n", - " 'variable tends to look like a normal distribution. As the number of '\n", - " 'variables increases, the approximation to a normal distribution becomes '\n", - " 'better.\\n'\n", - " '\\n'\n", - " '2. **Convolution of Random Variables**: The process of adding two random '\n", - " 'variables is mathematically represented by a convolution of their respective '\n", - " 'distributions. The video explains the concept of convolution and how it is '\n", - " 'used to find the distribution of the sum of two random variables.\\n'\n", - " '\\n'\n", - " '3. **Gaussian Function**: The Gaussian function is more complex than just '\n", - " '\\\\( e^{-x^2} \\\\). 
The full formula includes a scaling factor to ensure the '\n", - " 'area under the curve is 1 (making it a valid probability distribution), a '\n", - " 'standard deviation parameter \\\\( \\\\sigma \\\\) to describe the spread, and a '\n", - " 'mean parameter \\\\( \\\\mu \\\\) to shift the center. However, the video focuses '\n", - " 'on centered distributions with \\\\( \\\\mu = 0 \\\\).\\n'\n", - " '\\n'\n", - " '4. **Visualizing Convolution**: The video presents a visual method to '\n", - " 'understand the convolution of two Gaussian functions using diagonal slices '\n", - " 'on the xy-plane. This method involves looking at the probability density of '\n", - " 'landing on a point (x, y) as \\\\( f(x) \\\\times g(y) \\\\), where f and g are '\n", - " 'the two distributions being convolved.\\n'\n", - " '\\n'\n", - " '5. **Rotational Symmetry**: A key property of the Gaussian function is its '\n", - " 'rotational symmetry, which is unique to bell curves. This symmetry is '\n", - " 'exploited in the video to simplify the calculation of the convolution. By '\n", - " 'rotating the graph 45 degrees, the computation becomes easier because the '\n", - " 'integral only involves one variable.\\n'\n", - " '\\n'\n", - " '6. **Result of Convolution**: The video demonstrates that the convolution of '\n", - " 'two Gaussian functions is another Gaussian function. This is a special '\n", - " 'property because convolutions typically result in a different kind of '\n", - " 'function. The standard deviation of the resulting Gaussian is \\\\( \\\\sqrt{2} '\n", - " '\\\\times \\\\sigma \\\\) if the original Gaussians had the same standard '\n", - " 'deviation.\\n'\n", - " '\\n'\n", - " '7. **Proof of Central Limit Theorem**: The video explains that the '\n", - " 'convolution of two Gaussians being another Gaussian is a crucial step in '\n", - " 'proving the central limit theorem. 
It shows that the Gaussian function is a '\n", - " 'fixed point in the space of distributions, and since all distributions with '\n", - " 'finite variance tend towards a single universal shape, that shape must be '\n", - " 'the Gaussian.\\n'\n", - " '\\n'\n", - " '8. **Connection to Pi**: The video also touches on the connection between '\n", - " 'the Gaussian function and the number Pi, which appears in the formula for '\n", - " 'the normal distribution.\\n'\n", - " '\\n'\n", - " 'The video aims to provide an intuitive geometric argument for why the sum of '\n", - " 'two normally distributed random variables is also normally distributed, and '\n", - " 'how this relates to the central limit theorem and the special properties of '\n", - " 'the Gaussian function.')\n" - ] - } - ], - "source": [ - "from llama_index.multi_modal_llms.openai import OpenAIMultiModal\n", - "\n", - "openai_mm_llm = OpenAIMultiModal(\n", - " model=\"gpt-4-vision-preview\", api_key=OPENAI_API_TOKEN, max_new_tokens=1500\n", - ")\n", - "\n", - "\n", - "response_1 = openai_mm_llm.complete(\n", - " prompt=qa_tmpl_str.format(\n", - " context_str=context_str, query_str=query_str, metadata_str=metadata_str\n", - " ),\n", - " image_documents=image_documents,\n", - ")\n", - "\n", - "pprint(response_1.text)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/src/notebooks/reproducibility.ipynb b/docs/src/notebooks/reproducibility.ipynb deleted file mode 100644 index 44bca99b..00000000 --- a/docs/src/notebooks/reproducibility.ipynb +++ /dev/null @@ -1,1173 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": 
"c0de1e6a-61f7-4f99-a2fd-1461902ab36a", - "metadata": {}, - "source": [ - "# Sync API\n", - "\n", - "Reproducibility is critical for AI. For code, it's easy to keep track of changes using Github or Gitlab.\n", - "For data, it's not as easy. Most of the time, we're manually writing complicated data tracking code, wrestling with an external tool, and dealing with expensive duplicate snapshot copies with low granularity.\n", - "\n", - "While working with most other vector databases, if we loaded in the wrong data (or any other such mistakes), we have to blow away the index, correct the mistake, and then completely rebuild it. It's **really difficult** to rollback to an earlier state, and any such corrective action **destroys historical data and evidence**, which may be useful down the line to debug and diagnose issues.\n", - "\n", - "To our knowledge, LanceDB is the first and only vector database that supports full reproducibility and rollbacks natively.\n", - "Taking advantage of the Lance columnar data format, LanceDB supports:\n", - "- Automatic versioning\n", - "- Instant rollback\n", - "- Appends, updates, deletions\n", - "- Schema evolution\n", - "\n", - "This makes auditing, tracking, and reproducibility a breeze!\n", - "\n", - "Let's see how this all works." - ] - }, - { - "cell_type": "markdown", - "id": "cafebbce-d324-485d-90ec-503695875f47", - "metadata": {}, - "source": [ - "## Pickle Rick!" - ] - }, - { - "cell_type": "markdown", - "id": "14605311", - "metadata": {}, - "source": [ - "Let's first prepare the data. 
We will be using a CSV file with a bunch of quotes from Rick and Morty" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "c02976c7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2024-12-17 11:54:43-- http://vectordb-recipes.s3.us-west-2.amazonaws.com/rick_and_morty_quotes.csv\n", - "Resolving vectordb-recipes.s3.us-west-2.amazonaws.com (vectordb-recipes.s3.us-west-2.amazonaws.com)... 52.92.138.34, 3.5.82.160, 52.218.236.161, ...\n", - "Connecting to vectordb-recipes.s3.us-west-2.amazonaws.com (vectordb-recipes.s3.us-west-2.amazonaws.com)|52.92.138.34|:80... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 8236 (8.0K) [text/csv]\n", - "Saving to: ‘rick_and_morty_quotes.csv.1’\n", - "\n", - "rick_and_morty_quot 100%[===================>] 8.04K --.-KB/s in 0s \n", - "\n", - "2024-12-17 11:54:43 (77.8 MB/s) - ‘rick_and_morty_quotes.csv.1’ saved [8236/8236]\n", - "\n", - "id,author,quote\n", - "1,Rick,\" Morty, you got to come on. You got to come with me.\"\n", - "2,Morty,\" Rick, what’s going on?\"\n", - "3,Rick,\" I got a surprise for you, Morty.\"\n", - "4,Morty,\" It’s the middle of the night. What are you talking about?\"\n", - "5,Rick,\" I got a surprise for you.\"\n", - "6,Morty,\" Ow! Ow! You’re tugging me too hard.\"\n", - "7,Rick,\" I got a surprise for you, Morty.\"\n", - "8,Rick,\" What do you think of this flying vehicle, Morty? I built it out of stuff I found in the garage.\"\n", - "9,Morty,\" Yeah, Rick, it’s great. 
Is this the surprise?\"\n" - ] - } - ], - "source": [ - "!wget http://vectordb-recipes.s3.us-west-2.amazonaws.com/rick_and_morty_quotes.csv\n", - "!head rick_and_morty_quotes.csv" - ] - }, - { - "cell_type": "markdown", - "id": "533c5f58", - "metadata": {}, - "source": [ - "Let's load this into a pandas dataframe.\n", - "\n", - "It's got 3 columns, a quote id, the quote string, and the first name of the author of the quote:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "ee1443e3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idauthorquote
01RickMorty, you got to come on. You got to come wi...
12MortyRick, what’s going on?
23RickI got a surprise for you, Morty.
34MortyIt’s the middle of the night. What are you ta...
45RickI got a surprise for you.
\n", - "
" - ], - "text/plain": [ - " id author quote\n", - "0 1 Rick Morty, you got to come on. You got to come wi...\n", - "1 2 Morty Rick, what’s going on?\n", - "2 3 Rick I got a surprise for you, Morty.\n", - "3 4 Morty It’s the middle of the night. What are you ta...\n", - "4 5 Rick I got a surprise for you." - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "df = pd.read_csv(\"rick_and_morty_quotes.csv\")\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "0e74818f-109e-4b09-b5f8-dd1875c512e3", - "metadata": {}, - "source": [ - "We'll start with a local LanceDB connection" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "fa27ab30", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install lancedb -q" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f57d988-56b9-4384-8a7b-000d5f91034a", - "metadata": {}, - "outputs": [], - "source": [ - "import lancedb\n", - "db = lancedb.connect(\"~/.lancedb\")" - ] - }, - { - "cell_type": "markdown", - "id": "4ba9ffac-c779-49e3-91a7-f1c00f3fda41", - "metadata": {}, - "source": [ - "Creating a LanceDB table from a pandas dataframe is straightforward using `create_table`:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "bd981f6d-b921-4b1d-b63a-6c1d59f3a51d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idauthorquote
01RickMorty, you got to come on. You got to come wi...
12MortyRick, what’s going on?
23RickI got a surprise for you, Morty.
34MortyIt’s the middle of the night. What are you ta...
45RickI got a surprise for you.
\n", - "
" - ], - "text/plain": [ - " id author quote\n", - "0 1 Rick Morty, you got to come on. You got to come wi...\n", - "1 2 Morty Rick, what’s going on?\n", - "2 3 Rick I got a surprise for you, Morty.\n", - "3 4 Morty It’s the middle of the night. What are you ta...\n", - "4 5 Rick I got a surprise for you." - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "db.drop_table(\"rick_and_morty\", ignore_missing=True)\n", - "table = db.create_table(\"rick_and_morty\", df)\n", - "table.head().to_pandas()" - ] - }, - { - "cell_type": "markdown", - "id": "38d055be-ae3e-4190-b1cf-abf14cdf8975", - "metadata": {}, - "source": [ - "## Updates" - ] - }, - { - "cell_type": "markdown", - "id": "842550fb-da81-44ea-9e98-d5dbaa6916c7", - "metadata": {}, - "source": [ - "Now, since Rick is the smartest man in the multiverse, he deserves to have his quotes attributed to his full name: Richard Daniel Sanchez.\n", - "\n", - "This can be done via `LanceTable.update`. It needs two arguments:\n", - "\n", - "1. A `where` string filter (sql syntax) to determine the rows to update\n", - "2. A dict of `values` where the keys are the column names to update and the values are the new values" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "9eac4708-a8c4-49aa-bc13-8e60c5bf34a0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idauthorquote
02MortyRick, what’s going on?
14MortyIt’s the middle of the night. What are you ta...
26MortyOw! Ow! You’re tugging me too hard.
39MortyYeah, Rick, it’s great. Is this the surprise?
411MortyWhat?! A bomb?!
............
9480Richard Daniel SanchezThere you are, Morty. Listen to me. I got an ...
9582Richard Daniel SanchezIt’s pretty obvious, Morty. I froze him. Now ...
9684Richard Daniel SanchezDo you have any concept of how much higher th...
9786Richard Daniel SanchezI’ll do it later, Morty. He’ll be fine. Let’s...
9897Richard Daniel SanchezThere she is. All right. Come on, Morty. Let’...
\n", - "

99 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " id author \\\n", - "0 2 Morty \n", - "1 4 Morty \n", - "2 6 Morty \n", - "3 9 Morty \n", - "4 11 Morty \n", - ".. .. ... \n", - "94 80 Richard Daniel Sanchez \n", - "95 82 Richard Daniel Sanchez \n", - "96 84 Richard Daniel Sanchez \n", - "97 86 Richard Daniel Sanchez \n", - "98 97 Richard Daniel Sanchez \n", - "\n", - " quote \n", - "0 Rick, what’s going on? \n", - "1 It’s the middle of the night. What are you ta... \n", - "2 Ow! Ow! You’re tugging me too hard. \n", - "3 Yeah, Rick, it’s great. Is this the surprise? \n", - "4 What?! A bomb?! \n", - ".. ... \n", - "94 There you are, Morty. Listen to me. I got an ... \n", - "95 It’s pretty obvious, Morty. I froze him. Now ... \n", - "96 Do you have any concept of how much higher th... \n", - "97 I’ll do it later, Morty. He’ll be fine. Let’s... \n", - "98 There she is. All right. Come on, Morty. Let’... \n", - "\n", - "[99 rows x 3 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.update(where=\"author='Rick'\", values={\"author\": \"Richard Daniel Sanchez\"})\n", - "table.to_pandas()" - ] - }, - { - "cell_type": "markdown", - "id": "ac6499ce-af6d-4934-9051-be5f159ce623", - "metadata": {}, - "source": [ - "## Schema evolution" - ] - }, - { - "cell_type": "markdown", - "id": "0402226b-6d0c-41c5-9257-069c4bf16825", - "metadata": {}, - "source": [ - "Ok so this is a vector database, so we need actual vectors.\n", - "We'll use sentence transformers here to avoid having to deal with API keys." 
- ] - }, - { - "cell_type": "markdown", - "id": "85db4ed9-8f80-4b56-9867-1381fa1c4c7d", - "metadata": {}, - "source": [ - "Let's create a basic model using the \"all-MiniLM-L6-v2\" model and embed the quotes:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "998f4eb5-31cd-49ae-9f7c-2ec4d6652ef6", - "metadata": {}, - "outputs": [], - "source": [ - "from sentence_transformers import SentenceTransformer\n", - "model = SentenceTransformer(\"all-MiniLM-L6-v2\", device=\"cpu\")\n", - "vectors = model.encode(df.quote.values.tolist(),\n", - " convert_to_numpy=True,\n", - " normalize_embeddings=True).tolist()" - ] - }, - { - "cell_type": "markdown", - "id": "539e2a0e-529b-439b-ba8c-a388907c4860", - "metadata": {}, - "source": [ - "We can then convert the vectors into a pyarrow Table and merge it to the LanceDB Table.\n", - "\n", - "For the merge to work successfully, we need to have an overlapping column. Here the natural choice is to use the id column:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "ccbea593-85cf-484c-989f-9836a31c7906", - "metadata": {}, - "outputs": [], - "source": [ - "from lance.vector import vec_to_table\n", - "import numpy as np\n", - "import pyarrow as pa" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "727c8230-7e41-436a-8666-60ee46e7041b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vectorid
0[-0.10369808, -0.038807657, -0.07471153, -0.05...1
1[-0.11813704, -0.0533092, 0.025554786, -0.0242...2
2[-0.09807682, -0.035231438, -0.04206024, -0.06...3
3[0.032292824, 0.038136397, 0.013615396, 0.0335...4
4[-0.050369408, -0.0043397923, 0.013419108, -0....5
\n", - "
" - ], - "text/plain": [ - " vector id\n", - "0 [-0.10369808, -0.038807657, -0.07471153, -0.05... 1\n", - "1 [-0.11813704, -0.0533092, 0.025554786, -0.0242... 2\n", - "2 [-0.09807682, -0.035231438, -0.04206024, -0.06... 3\n", - "3 [0.032292824, 0.038136397, 0.013615396, 0.0335... 4\n", - "4 [-0.050369408, -0.0043397923, 0.013419108, -0.... 5" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "embeddings = vec_to_table(vectors)\n", - "embeddings = embeddings.append_column(\"id\", pa.array(np.arange(len(table))+1))\n", - "embeddings.to_pandas().head()" - ] - }, - { - "cell_type": "markdown", - "id": "518da48d-6481-4c1e-8ba4-800d5e0542cf", - "metadata": {}, - "source": [ - "And now we'll use the `LanceTable.merge` function to add the vector column into the LanceTable:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "a4326a70-9863-47e8-8f3f-565e35d558cf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idauthorquotevector
02MortyRick, what’s going on?[-0.11813704, -0.0533092, 0.025554786, -0.0242...
14MortyIt’s the middle of the night. What are you ta...[0.032292824, 0.038136397, 0.013615396, 0.0335...
26MortyOw! Ow! You’re tugging me too hard.[-0.035019904, -0.070963725, 0.003859435, -0.0...
39MortyYeah, Rick, it’s great. Is this the surprise?[-0.12578955, -0.019364933, 0.01606114, -0.082...
411MortyWhat?! A bomb?![0.0018287548, 0.07033146, -0.023754105, 0.047...
\n", - "
" - ], - "text/plain": [ - " id author quote \\\n", - "0 2 Morty Rick, what’s going on? \n", - "1 4 Morty It’s the middle of the night. What are you ta... \n", - "2 6 Morty Ow! Ow! You’re tugging me too hard. \n", - "3 9 Morty Yeah, Rick, it’s great. Is this the surprise? \n", - "4 11 Morty What?! A bomb?! \n", - "\n", - " vector \n", - "0 [-0.11813704, -0.0533092, 0.025554786, -0.0242... \n", - "1 [0.032292824, 0.038136397, 0.013615396, 0.0335... \n", - "2 [-0.035019904, -0.070963725, 0.003859435, -0.0... \n", - "3 [-0.12578955, -0.019364933, 0.01606114, -0.082... \n", - "4 [0.0018287548, 0.07033146, -0.023754105, 0.047... " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.merge(embeddings, left_on=\"id\")\n", - "table.head().to_pandas()" - ] - }, - { - "cell_type": "markdown", - "id": "f590fec8-0ed0-4148-b940-c81abe7b421c", - "metadata": {}, - "source": [ - "If we look at the schema, we see that `all-MiniLM-L6-v2` produces 384-dimensional vectors:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "ca9596a0-b4a0-4a5e-8d9e-967cd13b1eae", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "id: int64\n", - "author: string\n", - "quote: string\n", - "vector: fixed_size_list[384]\n", - " child 0, item: float" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.schema" - ] - }, - { - "cell_type": "markdown", - "id": "f046002c-872c-4c39-ab85-e03c3b45b477", - "metadata": {}, - "source": [ - "## Rollback\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "dbfc298c-ada2-411b-925f-e53dc9d35f3c", - "metadata": {}, - "source": [ - "Suppose we used the table and found that the `all-MiniLM-L6-v2` model doesn't produce ideal results. Instead we want to try a larger model. How do we use the new embeddings without losing the change history?" 
- ] - }, - { - "cell_type": "markdown", - "id": "dfb116e4-b3b2-4b7e-bbf8-d3e63ca2aa14", - "metadata": {}, - "source": [ - "First, major operations are automatically versioned in LanceDB.\n", - "Version 1 is the table creation, with the initial insertion of data.\n", - "Versions 2 and 3 represents the update (deletion + append)\n", - "Version 4 is adding the new column." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "a411902b-43d0-4889-8e34-bc5f3c409726", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'version': 1,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 11, 57, 21, 613932),\n", - " 'metadata': {}},\n", - " {'version': 2,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 11, 57, 21, 626525),\n", - " 'metadata': {}},\n", - " {'version': 3,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 11, 57, 27, 91378),\n", - " 'metadata': {}},\n", - " {'version': 4,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 11, 58, 4, 513085),\n", - " 'metadata': {}}]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.list_versions()" - ] - }, - { - "cell_type": "markdown", - "id": "7bd5e954-ac0f-4973-81c6-ad6120412d40", - "metadata": {}, - "source": [ - "We can restore version 3, before we added the old vector column" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "ad0682cc-7599-459c-bbd8-1cd1f296c845", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idauthorquote
02MortyRick, what’s going on?
14MortyIt’s the middle of the night. What are you ta...
26MortyOw! Ow! You’re tugging me too hard.
39MortyYeah, Rick, it’s great. Is this the surprise?
411MortyWhat?! A bomb?!
\n", - "
" - ], - "text/plain": [ - " id author quote\n", - "0 2 Morty Rick, what’s going on?\n", - "1 4 Morty It’s the middle of the night. What are you ta...\n", - "2 6 Morty Ow! Ow! You’re tugging me too hard.\n", - "3 9 Morty Yeah, Rick, it’s great. Is this the surprise?\n", - "4 11 Morty What?! A bomb?!" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.restore(3)\n", - "table.head().to_pandas()" - ] - }, - { - "cell_type": "markdown", - "id": "b0a51146-40d0-4f16-9555-5ce68c2c9eee", - "metadata": {}, - "source": [ - "Notice that we now have one more, not less versions. When we restore an old version, we're not deleting the version history, we're just creating a new version where the schema and data is equivalent to the restored old version. In this way, we can keep track of all of the changes and always rollback to a previous state." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "d5bfb448-20b9-45e9-90ba-8a73abb86668", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'version': 1,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 11, 57, 21, 613932),\n", - " 'metadata': {}},\n", - " {'version': 2,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 11, 57, 21, 626525),\n", - " 'metadata': {}},\n", - " {'version': 3,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 11, 57, 27, 91378),\n", - " 'metadata': {}},\n", - " {'version': 4,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 11, 58, 4, 513085),\n", - " 'metadata': {}},\n", - " {'version': 5,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 11, 58, 27, 153807),\n", - " 'metadata': {}}]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.list_versions()" - ] - }, - { - "cell_type": "markdown", - "id": "6713cb53-8cb9-4235-9c55-337c311f0af6", - "metadata": {}, - "source": [ - "### Switching Models\n", - "\n", - "Now we'll 
switch to the `all-mpnet-base-v2` model and add the vectors to the restored dataset again. Note that this step can take a couple of minutes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1fa2950d-3002-4903-b6c3-2760ce60d079", - "metadata": {}, - "outputs": [], - "source": [ - "model = SentenceTransformer(\"all-mpnet-base-v2\", device=\"cpu\")\n", - "vectors = model.encode(df.quote.values.tolist(),\n", - " convert_to_numpy=True,\n", - " normalize_embeddings=True).tolist()\n", - "embeddings = vec_to_table(vectors)\n", - "embeddings = embeddings.append_column(\"id\", pa.array(np.arange(len(table))+1))\n", - "table.merge(embeddings, left_on=\"id\")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "694c46e0-a1c3-4869-a1eb-562f14606ad4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "id: int64\n", - "author: string\n", - "quote: string\n", - "vector: fixed_size_list[768]\n", - " child 0, item: float" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.schema" - ] - }, - { - "cell_type": "markdown", - "id": "5e4085a5-a2e7-4520-acfc-eabaae2caa7d", - "metadata": {}, - "source": [ - "## Deletion\n", - "\n", - "What if the whole show was just Rick-isms? 
\n", - "Let's delete any quote not said by Rick:" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "9d11ddf1-b352-496c-91d7-99c70cbf304b", - "metadata": {}, - "outputs": [], - "source": [ - "table.delete(\"author != 'Richard Daniel Sanchez'\")" - ] - }, - { - "cell_type": "markdown", - "id": "77d2f591-e492-423e-b995-2a18ae8cb831", - "metadata": {}, - "source": [ - "We can see that the number of rows has been reduced to 30" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "20bcce48-a5df-43c7-9ab9-7d59a83055e9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "28" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(table)" - ] - }, - { - "cell_type": "markdown", - "id": "ef8457b2-1228-4a25-824e-477a07681b48", - "metadata": {}, - "source": [ - "Ok we had our fun, let's get back to the full quote set" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "6e279635-75b0-400c-8b43-4aa069282ccd", - "metadata": {}, - "outputs": [], - "source": [ - "table.restore(6)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "6a65b627-57a2-43b2-8acc-3805591845ad", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "99" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(table)" - ] - }, - { - "cell_type": "markdown", - "id": "ae1a6ee8-8868-49de-82ab-17a0f61f3a47", - "metadata": {}, - "source": [ - "## History\n", - "\n", - "We now have 9 versions in the data. 
We can review the operations that corresponds to each version below:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "f595c9b8-91ec-48c1-9790-c40e1bd24b60", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "8" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table.version" - ] - }, - { - "cell_type": "markdown", - "id": "774f4eb0-03d4-4fda-a825-6217bf096619", - "metadata": {}, - "source": [ - "\n", - "Versions:\n", - "- 1 - Create and append\n", - "- 2 - Update (deletion)\n", - "- 3 - Update (append)\n", - "- 4 - Merge (vector column)\n", - "- 5 - Restore (4)\n", - "- 6 - Merge (new vector column)\n", - "- 7 - Deletion\n", - "- 8 - Restore" - ] - }, - { - "cell_type": "markdown", - "id": "fb0131e6-2b73-442a-b4c6-6976a9cf4c7e", - "metadata": {}, - "source": [ - "## Summary" - ] - }, - { - "cell_type": "markdown", - "id": "97a1cf79-b46b-40cd-ada0-54edef358627", - "metadata": {}, - "source": [ - "We never had to explicitly manage the versioning. And we never had to create expensive and slow snapshots. LanceDB automatically tracks the full history of operations and supports fast rollbacks. In production this is critical for debugging issues and minimizing downtime by rolling back to a previously successful state in seconds." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "doc-venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/src/notebooks/reproducibility_async.ipynb b/docs/src/notebooks/reproducibility_async.ipynb deleted file mode 100644 index fef18d6a..00000000 --- a/docs/src/notebooks/reproducibility_async.ipynb +++ /dev/null @@ -1,1096 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "c0de1e6a-61f7-4f99-a2fd-1461902ab36a", - "metadata": {}, - "source": [ - "# Async API\n", - "\n", - "We demonstrate the following functionalities suppored by LanceDB using our asynchonous APIs:\n", - "- Automatic versioning\n", - "- Instant rollback\n", - "- Appends, updates, deletions\n", - "- Schema evolution" - ] - }, - { - "cell_type": "markdown", - "id": "6d810f29", - "metadata": {}, - "source": [ - "Let's first prepare the data. We will be using a CSV file with a bunch of quotes from Rick and Morty" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "d00ed8e6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2024-12-17 15:58:31-- http://vectordb-recipes.s3.us-west-2.amazonaws.com/rick_and_morty_quotes.csv\n", - "Resolving vectordb-recipes.s3.us-west-2.amazonaws.com (vectordb-recipes.s3.us-west-2.amazonaws.com)... 3.5.84.162, 3.5.76.76, 52.92.228.138, ...\n", - "Connecting to vectordb-recipes.s3.us-west-2.amazonaws.com (vectordb-recipes.s3.us-west-2.amazonaws.com)|3.5.84.162|:80... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 8236 (8.0K) [text/csv]\n", - "Saving to: ‘rick_and_morty_quotes.csv.3’\n", - "\n", - "rick_and_morty_quot 100%[===================>] 8.04K --.-KB/s in 0s \n", - "\n", - "2024-12-17 15:58:31 (160 MB/s) - ‘rick_and_morty_quotes.csv.3’ saved [8236/8236]\n", - "\n", - "id,author,quote\n", - "1,Rick,\" Morty, you got to come on. You got to come with me.\"\n", - "2,Morty,\" Rick, what’s going on?\"\n", - "3,Rick,\" I got a surprise for you, Morty.\"\n", - "4,Morty,\" It’s the middle of the night. What are you talking about?\"\n", - "5,Rick,\" I got a surprise for you.\"\n", - "6,Morty,\" Ow! Ow! You’re tugging me too hard.\"\n", - "7,Rick,\" I got a surprise for you, Morty.\"\n", - "8,Rick,\" What do you think of this flying vehicle, Morty? I built it out of stuff I found in the garage.\"\n", - "9,Morty,\" Yeah, Rick, it’s great. Is this the surprise?\"\n" - ] - } - ], - "source": [ - "!wget http://vectordb-recipes.s3.us-west-2.amazonaws.com/rick_and_morty_quotes.csv\n", - "!head rick_and_morty_quotes.csv" - ] - }, - { - "cell_type": "markdown", - "id": "a5fcdcda-b0fe-4ac4-90b4-6b42cf2ef34d", - "metadata": {}, - "source": [ - "Let's load this into a pandas dataframe.\n", - "\n", - "It's got 3 columns, a quote id, the quote string, and the first name of the author of the quote:" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "def3ae59-77d9-43f0-ba6d-415a1503856b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idauthorquote
01RickMorty, you got to come on. You got to come wi...
12MortyRick, what’s going on?
23RickI got a surprise for you, Morty.
34MortyIt’s the middle of the night. What are you ta...
45RickI got a surprise for you.
\n", - "
" - ], - "text/plain": [ - " id author quote\n", - "0 1 Rick Morty, you got to come on. You got to come wi...\n", - "1 2 Morty Rick, what’s going on?\n", - "2 3 Rick I got a surprise for you, Morty.\n", - "3 4 Morty It’s the middle of the night. What are you ta...\n", - "4 5 Rick I got a surprise for you." - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "df = pd.read_csv(\"rick_and_morty_quotes.csv\")\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "4ba9ffac-c779-49e3-91a7-f1c00f3fda41", - "metadata": {}, - "source": [ - "Creating a LanceDB table from a pandas dataframe is straightforward using `create_table`" - ] - }, - { - "cell_type": "markdown", - "id": "392cf0ee", - "metadata": {}, - "source": [ - "We'll start with a local LanceDB connection" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "91a322dd", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install lancedb -q" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "10715e72", - "metadata": {}, - "outputs": [], - "source": [ - "import lancedb\n", - "async_db = await lancedb.connect_async(\"~/.lancedb\")" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "bd981f6d-b921-4b1d-b63a-6c1d59f3a51d", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2024-12-17T23:58:46Z WARN lance::dataset::write::insert] No existing dataset at ~/.lancedb/rick_and_morty.lance, it will be created\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idauthorquote
01RickMorty, you got to come on. You got to come wi...
12MortyRick, what’s going on?
23RickI got a surprise for you, Morty.
34MortyIt’s the middle of the night. What are you ta...
45RickI got a surprise for you.
56MortyOw! Ow! You’re tugging me too hard.
67RickI got a surprise for you, Morty.
78RickWhat do you think of this flying vehicle, Mor...
89MortyYeah, Rick, it’s great. Is this the surprise?
910RickMorty, I had to I had to I had to I had to ma...
\n", - "
" - ], - "text/plain": [ - " id author quote\n", - "0 1 Rick Morty, you got to come on. You got to come wi...\n", - "1 2 Morty Rick, what’s going on?\n", - "2 3 Rick I got a surprise for you, Morty.\n", - "3 4 Morty It’s the middle of the night. What are you ta...\n", - "4 5 Rick I got a surprise for you.\n", - "5 6 Morty Ow! Ow! You’re tugging me too hard.\n", - "6 7 Rick I got a surprise for you, Morty.\n", - "7 8 Rick What do you think of this flying vehicle, Mor...\n", - "8 9 Morty Yeah, Rick, it’s great. Is this the surprise?\n", - "9 10 Rick Morty, I had to I had to I had to I had to ma..." - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "await async_db.drop_table(\"rick_and_morty\")\n", - "async_table = await async_db.create_table(\"rick_and_morty\", df, mode=\"overwrite\")\n", - "await async_table.to_pandas()" - ] - }, - { - "cell_type": "markdown", - "id": "38d055be-ae3e-4190-b1cf-abf14cdf8975", - "metadata": {}, - "source": [ - "## Updates" - ] - }, - { - "cell_type": "markdown", - "id": "842550fb-da81-44ea-9e98-d5dbaa6916c7", - "metadata": {}, - "source": [ - "Now, since Rick is the smartest man in the multiverse, he deserves to have his quotes attributed to his full name: Richard Daniel Sanchez.\n", - "\n", - "This can be done via `LanceTable.update`. It needs two arguments:\n", - "\n", - "1. A `where` string filter (sql syntax) to determine the rows to update\n", - "2. A dict of `updates` where the keys are the column names to update and the values are the new values" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "9eac4708-a8c4-49aa-bc13-8e60c5bf34a0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idauthorquote
01RickMorty, you got to come on. You got to come wi...
13RickI got a surprise for you, Morty.
25RickI got a surprise for you.
37RickI got a surprise for you, Morty.
48RickWhat do you think of this flying vehicle, Mor...
510RickMorty, I had to I had to I had to I had to ma...
612RickWe’re gonna drop it down there just get a who...
714RickCome on, Morty. Just take it easy, Morty. It’...
816RickWhen I drop the bomb you know, I want you to ...
918RickAnd Jessica’s gonna be Eve,…
\n", - "
" - ], - "text/plain": [ - " id author quote\n", - "0 1 Rick Morty, you got to come on. You got to come wi...\n", - "1 3 Rick I got a surprise for you, Morty.\n", - "2 5 Rick I got a surprise for you.\n", - "3 7 Rick I got a surprise for you, Morty.\n", - "4 8 Rick What do you think of this flying vehicle, Mor...\n", - "5 10 Rick Morty, I had to I had to I had to I had to ma...\n", - "6 12 Rick We’re gonna drop it down there just get a who...\n", - "7 14 Rick Come on, Morty. Just take it easy, Morty. It’...\n", - "8 16 Rick When I drop the bomb you know, I want you to ...\n", - "9 18 Rick And Jessica’s gonna be Eve,…" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "await async_table.update(where=\"author='Morty'\", updates={\"author\": \"Richard Daniel Sanchez\"})\n", - "await async_table.to_pandas()" - ] - }, - { - "cell_type": "markdown", - "id": "ac6499ce-af6d-4934-9051-be5f159ce623", - "metadata": {}, - "source": [ - "## Schema evolution" - ] - }, - { - "cell_type": "markdown", - "id": "022f1334", - "metadata": {}, - "source": [ - "Let's add a `new_id` column to the table, where each value is the original `id` plus 1." - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "a4326a70-9863-47e8-8f3f-565e35d558cf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idauthorquotenew_id
01RickMorty, you got to come on. You got to come wi...2
13RickI got a surprise for you, Morty.4
25RickI got a surprise for you.6
37RickI got a surprise for you, Morty.8
48RickWhat do you think of this flying vehicle, Mor...9
510RickMorty, I had to I had to I had to I had to ma...11
612RickWe’re gonna drop it down there just get a who...13
714RickCome on, Morty. Just take it easy, Morty. It’...15
816RickWhen I drop the bomb you know, I want you to ...17
918RickAnd Jessica’s gonna be Eve,…19
\n", - "
" - ], - "text/plain": [ - " id author quote new_id\n", - "0 1 Rick Morty, you got to come on. You got to come wi... 2\n", - "1 3 Rick I got a surprise for you, Morty. 4\n", - "2 5 Rick I got a surprise for you. 6\n", - "3 7 Rick I got a surprise for you, Morty. 8\n", - "4 8 Rick What do you think of this flying vehicle, Mor... 9\n", - "5 10 Rick Morty, I had to I had to I had to I had to ma... 11\n", - "6 12 Rick We’re gonna drop it down there just get a who... 13\n", - "7 14 Rick Come on, Morty. Just take it easy, Morty. It’... 15\n", - "8 16 Rick When I drop the bomb you know, I want you to ... 17\n", - "9 18 Rick And Jessica’s gonna be Eve,… 19" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "await async_table.add_columns({\"new_id\": \"id + 1\"})\n", - "await async_table.to_pandas()" - ] - }, - { - "cell_type": "markdown", - "id": "f590fec8-0ed0-4148-b940-c81abe7b421c", - "metadata": {}, - "source": [ - "If we look at the schema, we see that a new int64 column was added" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "ca9596a0-b4a0-4a5e-8d9e-967cd13b1eae", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "id: int64\n", - "author: string\n", - "quote: string\n", - "new_id: int64" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "await async_table.schema()" - ] - }, - { - "cell_type": "markdown", - "id": "f046002c-872c-4c39-ab85-e03c3b45b477", - "metadata": {}, - "source": [ - "## Rollback\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "dbfc298c-ada2-411b-925f-e53dc9d35f3c", - "metadata": {}, - "source": [ - "Suppose we used the table and found that the new column should be a different value. How do we use another new column without losing the change history?" 
- ] - }, - { - "cell_type": "markdown", - "id": "dfb116e4-b3b2-4b7e-bbf8-d3e63ca2aa14", - "metadata": {}, - "source": [ - "First, major operations are automatically versioned in LanceDB.\n", - "Version 1 is the table creation, with the initial insertion of data.\n", - "Versions 2 and 3 represents the update (deletion + append)\n", - "Version 4 is adding the new column." - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "a411902b-43d0-4889-8e34-bc5f3c409726", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'version': 1,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 15, 58, 46, 983259),\n", - " 'metadata': {}},\n", - " {'version': 2,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 15, 59, 0, 291948),\n", - " 'metadata': {}},\n", - " {'version': 3,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 15, 59, 8, 381165),\n", - " 'metadata': {}}]" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "await async_table.checkout_latest()\n", - "await async_table.list_versions()" - ] - }, - { - "cell_type": "markdown", - "id": "7bd5e954-ac0f-4973-81c6-ad6120412d40", - "metadata": {}, - "source": [ - "We can restore version 3, before we added the `new_id` vector column" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "ad0682cc-7599-459c-bbd8-1cd1f296c845", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idauthorquote
01RickMorty, you got to come on. You got to come wi...
13RickI got a surprise for you, Morty.
25RickI got a surprise for you.
37RickI got a surprise for you, Morty.
48RickWhat do you think of this flying vehicle, Mor...
510RickMorty, I had to I had to I had to I had to ma...
612RickWe’re gonna drop it down there just get a who...
714RickCome on, Morty. Just take it easy, Morty. It’...
816RickWhen I drop the bomb you know, I want you to ...
918RickAnd Jessica’s gonna be Eve,…
\n", - "
" - ], - "text/plain": [ - " id author quote\n", - "0 1 Rick Morty, you got to come on. You got to come wi...\n", - "1 3 Rick I got a surprise for you, Morty.\n", - "2 5 Rick I got a surprise for you.\n", - "3 7 Rick I got a surprise for you, Morty.\n", - "4 8 Rick What do you think of this flying vehicle, Mor...\n", - "5 10 Rick Morty, I had to I had to I had to I had to ma...\n", - "6 12 Rick We’re gonna drop it down there just get a who...\n", - "7 14 Rick Come on, Morty. Just take it easy, Morty. It’...\n", - "8 16 Rick When I drop the bomb you know, I want you to ...\n", - "9 18 Rick And Jessica’s gonna be Eve,…" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "await async_table.checkout(2)\n", - "await async_table.restore()\n", - "await async_table.to_pandas()" - ] - }, - { - "cell_type": "markdown", - "id": "b0a51146-40d0-4f16-9555-5ce68c2c9eee", - "metadata": {}, - "source": [ - "Notice that we now have one more, not less versions. When we restore an old version, we're not deleting the version history, we're just creating a new version where the schema and data is equivalent to the restored old version. In this way, we can keep track of all of the changes and always rollback to a previous state." 
- ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "d5bfb448-20b9-45e9-90ba-8a73abb86668", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'version': 1,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 15, 58, 46, 983259),\n", - " 'metadata': {}},\n", - " {'version': 2,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 15, 59, 0, 291948),\n", - " 'metadata': {}},\n", - " {'version': 3,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 15, 59, 8, 381165),\n", - " 'metadata': {}},\n", - " {'version': 4,\n", - " 'timestamp': datetime.datetime(2024, 12, 17, 15, 59, 22, 800694),\n", - " 'metadata': {}}]" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "await async_table.list_versions()" - ] - }, - { - "cell_type": "markdown", - "id": "6713cb53-8cb9-4235-9c55-337c311f0af6", - "metadata": {}, - "source": [ - "### Add another new column\n", - "\n", - "Now we'll change the value of the `new_id` column and add it to the restored dataset again" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "cdabeb56", - "metadata": {}, - "outputs": [], - "source": [ - "await async_table.add_columns({\"new_id\": \"id + 10\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "694c46e0-a1c3-4869-a1eb-562f14606ad4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "id: int64\n", - "author: string\n", - "quote: string\n", - "new_id: int64" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "await async_table.schema()" - ] - }, - { - "cell_type": "markdown", - "id": "5e4085a5-a2e7-4520-acfc-eabaae2caa7d", - "metadata": {}, - "source": [ - "## Deletion\n", - "\n", - "What if the whole show was just Rick-isms? 
\n", - "Let's delete any quote not said by Rick" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "9d11ddf1-b352-496c-91d7-99c70cbf304b", - "metadata": {}, - "outputs": [], - "source": [ - "await async_table.delete(\"author != 'Richard Daniel Sanchez'\")" - ] - }, - { - "cell_type": "markdown", - "id": "77d2f591-e492-423e-b995-2a18ae8cb831", - "metadata": {}, - "source": [ - "We can see that the number of rows has been reduced to 30" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "20bcce48-a5df-43c7-9ab9-7d59a83055e9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "34" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "await async_table.count_rows()" - ] - }, - { - "cell_type": "markdown", - "id": "ef8457b2-1228-4a25-824e-477a07681b48", - "metadata": {}, - "source": [ - "Ok we had our fun, let's get back to the full quote set" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "6e279635-75b0-400c-8b43-4aa069282ccd", - "metadata": {}, - "outputs": [], - "source": [ - "await async_table.checkout(5)\n", - "await async_table.restore()" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "6a65b627-57a2-43b2-8acc-3805591845ad", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "99" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "await async_table.count_rows()" - ] - }, - { - "cell_type": "markdown", - "id": "ae1a6ee8-8868-49de-82ab-17a0f61f3a47", - "metadata": {}, - "source": [ - "## History\n", - "\n", - "We now have 9 versions in the data. 
We can review the operations that corresponds to each version below:" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "f595c9b8-91ec-48c1-9790-c40e1bd24b60", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "6" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "await async_table.version()" - ] - }, - { - "cell_type": "markdown", - "id": "774f4eb0-03d4-4fda-a825-6217bf096619", - "metadata": {}, - "source": [ - "\n", - "Versions:\n", - "- 1 - Create\n", - "- 2 - Update\n", - "- 3 - Add a new column\n", - "- 4 - Restore (2)\n", - "- 5 - Add a new column\n", - "- 6 - Delete\n", - "- 7 - Restore" - ] - }, - { - "cell_type": "markdown", - "id": "fb0131e6-2b73-442a-b4c6-6976a9cf4c7e", - "metadata": {}, - "source": [ - "## Summary" - ] - }, - { - "cell_type": "markdown", - "id": "97a1cf79-b46b-40cd-ada0-54edef358627", - "metadata": {}, - "source": [ - "We never had to explicitly manage the versioning. And we never had to create expensive and slow snapshots. LanceDB automatically tracks the full history of operations I created and supports fast rollbacks. In production this is critical for debugging issues and minimizing downtime by rolling back to a previously successful state in seconds." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "doc-venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/src/notebooks/tables_guide.ipynb b/docs/src/notebooks/tables_guide.ipynb deleted file mode 100644 index 18d15b4e..00000000 --- a/docs/src/notebooks/tables_guide.ipynb +++ /dev/null @@ -1,836 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "d24eb4c6-e246-44ca-ba7c-6eae7923bd4c", - "metadata": {}, - "source": [ - "## LanceDB Tables\n", - "A Table is a collection of Records in a LanceDB Database.\n", - "\n", - "![illustration](../assets/ecosystem-illustration.png)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "c1b4e34b-a49c-471d-a343-a5940bb5138a", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install lancedb -qq" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "4e5a8d07-d9a1-48c1-913a-8e0629289579", - "metadata": {}, - "outputs": [], - "source": [ - "import lancedb\n", - "db = lancedb.connect(\"./.lancedb\")" - ] - }, - { - "cell_type": "markdown", - "id": "66fb93d5-3551-406b-99b2-488442d61d06", - "metadata": {}, - "source": [ - "LanceDB allows ingesting data from various sources - `dict`, `list[dict]`, `pd.DataFrame`, `pa.Table` or a `Iterator[pa.RecordBatch]`. 
Let's take a look at some of the these.\n", - "\n", - " ### From list of tuples or dictionaries" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "5df12f66-8d99-43ad-8d0b-22189ec0a6b9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "vector: fixed_size_list[2]\n", - " child 0, item: float\n", - "lat: double\n", - "long: double\n", - "----\n", - "vector: [[[1.1,1.2],[0.2,1.8]]]\n", - "lat: [[45.5,40.1]]\n", - "long: [[-122.7,-74.1]]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import lancedb\n", - "\n", - "db = lancedb.connect(\"./.lancedb\")\n", - "\n", - "data = [{\"vector\": [1.1, 1.2], \"lat\": 45.5, \"long\": -122.7},\n", - " {\"vector\": [0.2, 1.8], \"lat\": 40.1, \"long\": -74.1}]\n", - "\n", - "db.create_table(\"my_table\", data)\n", - "\n", - "db[\"my_table\"].head()" - ] - }, - { - "cell_type": "markdown", - "id": "10ce802f-1a10-49ee-8ee3-a9bfb302d86c", - "metadata": {}, - "source": [ - "## From pandas DataFrame\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "f4d87ae9-0ccb-48eb-b31d-bb8f2370e47e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "vector: fixed_size_list[2]\n", - " child 0, item: float\n", - "lat: double\n", - "long: double\n", - "----\n", - "vector: [[[1.1,1.2],[0.2,1.8]]]\n", - "lat: [[45.5,40.1]]\n", - "long: [[-122.7,-74.1]]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "data = pd.DataFrame(\n", - " {\n", - " \"vector\": [[1.1, 1.2, 1.3, 1.4], [0.2, 1.8, 0.4, 3.6]],\n", - " \"lat\": [45.5, 40.1],\n", - " \"long\": [-122.7, -74.1],\n", - " }\n", - ")\n", - "db.create_table(\"my_table_pandas\", data)\n", - "db[\"my_table_pandas\"].head()" - ] - }, - { - "cell_type": "markdown", - "id": "4be81469-5b57-4f78-9c72-3938c0378d9d", - "metadata": {}, - 
"source": [ - "Data is converted to Arrow before being written to disk. For maximum control over how data is saved, either provide the PyArrow schema to convert to or else provide a PyArrow Table directly.\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "25f34bcf-fca0-4431-8601-eac95d1bd347", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2024-01-31T18:59:33Z WARN lance::dataset] No existing dataset at /Users/qian/Work/LanceDB/lancedb/docs/src/notebooks/.lancedb/table3.lance, it will be created\n" - ] - }, - { - "data": { - "text/plain": [ - "vector: fixed_size_list[2]\n", - " child 0, item: float\n", - "lat: float\n", - "long: float" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pyarrow as pa\n", - "\n", - "custom_schema = pa.schema([\n", - "pa.field(\"vector\", pa.list_(pa.float32(), 4)),\n", - "pa.field(\"lat\", pa.float32()),\n", - "pa.field(\"long\", pa.float32())\n", - "])\n", - "\n", - "table = db.create_table(\"table3\", data, schema=custom_schema, mode=\"overwrite\")\n", - "table.schema" - ] - }, - { - "cell_type": "markdown", - "id": "4df51925-7ca2-4005-9c72-38b3d26240c6", - "metadata": {}, - "source": [ - "### From an Arrow Table\n", - "\n", - "You can also create LanceDB tables directly from pyarrow tables. LanceDB supports float16 type." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "90a880f6-be43-4c9d-ba65-0b05197c0f6f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "vector: fixed_size_list[16]\n", - " child 0, item: halffloat\n", - "text: string" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import numpy as np\n", - "\n", - "dim = 16\n", - "total = 2\n", - "schema = pa.schema(\n", - " [\n", - " pa.field(\"vector\", pa.list_(pa.float16(), dim)),\n", - " pa.field(\"text\", pa.string())\n", - " ]\n", - ")\n", - "data = pa.Table.from_arrays(\n", - " [\n", - " pa.array([np.random.randn(dim).astype(np.float16) for _ in range(total)],\n", - " pa.list_(pa.float16(), dim)),\n", - " pa.array([\"foo\", \"bar\"])\n", - " ],\n", - " [\"vector\", \"text\"],\n", - ")\n", - "\n", - "tbl = db.create_table(\"f16_tbl\", data, schema=schema)\n", - "tbl.schema" - ] - }, - { - "cell_type": "markdown", - "id": "0f36c51c-d902-449d-8292-700e53990c32", - "metadata": {}, - "source": [ - "### From Pydantic Models\n", - "\n", - "LanceDB supports to create Apache Arrow Schema from a Pydantic BaseModel." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "d81121d7-e4b7-447c-a48c-974b6ebb464a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "movie_id: int64 not null\n", - "vector: fixed_size_list[128] not null\n", - " child 0, item: float\n", - "genres: string not null\n", - "title: string not null\n", - "imdb_id: int64 not null" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from lancedb.pydantic import Vector, LanceModel\n", - "\n", - "class Content(LanceModel):\n", - " movie_id: int\n", - " vector: Vector(128)\n", - " genres: str\n", - " title: str\n", - " imdb_id: int\n", - " \n", - " @property\n", - " def imdb_url(self) -> str:\n", - " return f\"https://www.imdb.com/title/tt{self.imdb_id}\"\n", - "\n", - "import pyarrow as pa\n", - "db = lancedb.connect(\"~/.lancedb\")\n", - "table_name = \"movielens_small\"\n", - "table = db.create_table(table_name, schema=Content)\n", - "table.schema" - ] - }, - { - "cell_type": "markdown", - "id": "860e1f77-e860-46a9-98b7-b2979092ccd6", - "metadata": {}, - "source": [ - "### Using Iterators / Writing Large Datasets\n", - "\n", - "It is recommended to use itertators to add large datasets in batches when creating your table in one go. This does not create multiple versions of your dataset unlike manually adding batches using `table.add()`\n", - "\n", - "LanceDB additionally supports pyarrow's `RecordBatch` Iterators or other generators producing supported data types.\n", - "\n", - "## Here's an example using using `RecordBatch` iterator for creating tables." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "bc247142-4e3c-41a2-b94c-8e00d2c2a508", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LanceTable(table4)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pyarrow as pa\n", - "\n", - "def make_batches():\n", - " for i in range(5):\n", - " yield pa.RecordBatch.from_arrays(\n", - " [\n", - " pa.array([[3.1, 4.1], [5.9, 26.5]],\n", - " pa.list_(pa.float32(), 2)),\n", - " pa.array([\"foo\", \"bar\"]),\n", - " pa.array([10.0, 20.0]),\n", - " ],\n", - " [\"vector\", \"item\", \"price\"],\n", - " )\n", - "\n", - "schema = pa.schema([\n", - " pa.field(\"vector\", pa.list_(pa.float32(), 2)),\n", - " pa.field(\"item\", pa.utf8()),\n", - " pa.field(\"price\", pa.float32()),\n", - "])\n", - "\n", - "db.create_table(\"table4\", make_batches(), schema=schema)" - ] - }, - { - "cell_type": "markdown", - "id": "94f7dd2b-bae4-4bdf-8534-201437c31027", - "metadata": {}, - "source": [ - "### Using pandas `DataFrame` Iterator and Pydantic Schema\n", - "\n", - "You can set the schema via pyarrow schema object or using Pydantic object" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "25ad3523-e0c9-4c28-b3df-38189c4e0e5f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "vector: fixed_size_list[2] not null\n", - " child 0, item: float\n", - "item: string not null\n", - "price: double not null" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pyarrow as pa\n", - "import pandas as pd\n", - "\n", - "class PydanticSchema(LanceModel):\n", - " vector: Vector(2)\n", - " item: str\n", - " price: float\n", - "\n", - "def make_batches():\n", - " for i in range(5):\n", - " yield pd.DataFrame(\n", - " {\n", - " \"vector\": [[3.1, 4.1], [1, 1]],\n", - " \"item\": [\"foo\", \"bar\"],\n", - " \"price\": [10.0, 20.0],\n", - " })\n", - 
"\n", - "tbl = db.create_table(\"table5\", make_batches(), schema=PydanticSchema)\n", - "tbl.schema" - ] - }, - { - "cell_type": "markdown", - "id": "4aa955e9-fcd0-4c99-b644-f218f3bb3f1a", - "metadata": {}, - "source": [ - "## Creating Empty Table\n", - "\n", - "You can create an empty table by just passing the schema and later add to it using `table.add()`" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "2814173a-eacc-4dd8-a64d-6312b44582cc", - "metadata": {}, - "outputs": [], - "source": [ - "import lancedb\n", - "from lancedb.pydantic import LanceModel, Vector\n", - "\n", - "class Model(LanceModel):\n", - " vector: Vector(2)\n", - "\n", - "tbl = db.create_table(\"table6\", schema=Model.to_arrow_schema())" - ] - }, - { - "cell_type": "markdown", - "id": "1d1b0f5c-a1d9-459f-8614-8376b6f577e1", - "metadata": {}, - "source": [ - "## Open Existing Tables\n", - "\n", - "If you forget the name of your table, you can always get a listing of all table names:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "df9e13c0-41f6-437f-9dfa-2fd71d3d9c45", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['table6', 'table4', 'table5', 'movielens_small']" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "db.table_names()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "9343f5ad-6024-42ee-ac2f-6c1471df8679", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vectoritemprice
0[3.1, 4.1]foo10.0
1[5.9, 26.5]bar20.0
2[3.1, 4.1]foo10.0
3[5.9, 26.5]bar20.0
4[3.1, 4.1]foo10.0
5[5.9, 26.5]bar20.0
6[3.1, 4.1]foo10.0
7[5.9, 26.5]bar20.0
8[3.1, 4.1]foo10.0
9[5.9, 26.5]bar20.0
\n", - "
" - ], - "text/plain": [ - " vector item price\n", - "0 [3.1, 4.1] foo 10.0\n", - "1 [5.9, 26.5] bar 20.0\n", - "2 [3.1, 4.1] foo 10.0\n", - "3 [5.9, 26.5] bar 20.0\n", - "4 [3.1, 4.1] foo 10.0\n", - "5 [5.9, 26.5] bar 20.0\n", - "6 [3.1, 4.1] foo 10.0\n", - "7 [5.9, 26.5] bar 20.0\n", - "8 [3.1, 4.1] foo 10.0\n", - "9 [5.9, 26.5] bar 20.0" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tbl = db.open_table(\"table4\")\n", - "tbl.to_pandas()" - ] - }, - { - "cell_type": "markdown", - "id": "5019246f-12e3-4f78-88a8-9f4939802c76", - "metadata": {}, - "source": [ - "## Adding to table\n", - "After a table has been created, you can always add more data to it using\n", - "\n", - "You can add any of the valid data structures accepted by LanceDB table, i.e, `dict`, `list[dict]`, `pd.DataFrame`, or a `Iterator[pa.RecordBatch]`. Here are some examples." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "8a56250f-73a1-4c26-a6ad-5c7a0ce3a9ab", - "metadata": {}, - "outputs": [], - "source": [ - "data = [\n", - " {\"vector\": [1.3, 1.4], \"item\": \"fizz\", \"price\": 100.0},\n", - " {\"vector\": [9.5, 56.2], \"item\": \"buzz\", \"price\": 200.0}\n", - "]\n", - "tbl.add(data)" - ] - }, - { - "cell_type": "markdown", - "id": "9985f6ee-67e1-45a9-b233-94e3d121ecbf", - "metadata": {}, - "source": [ - "You can also add a large dataset batch in one go using Iterator of supported data types\n", - "\n", - "### Adding via Iterator\n", - "\n", - "here, we'll use pandas DataFrame Iterator" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "030c7057-b98e-4e2f-be14-b8c1f927f83c", - "metadata": {}, - "outputs": [], - "source": [ - "def make_batches():\n", - " for i in range(5):\n", - " yield [\n", - " {\"vector\": [3.1, 4.1], \"item\": \"foo\", \"price\": 10.0},\n", - " {\"vector\": [1, 1], \"item\": \"bar\", \"price\": 20.0},\n", - " ]\n", - "tbl.add(make_batches())" - ] - }, - { - 
"cell_type": "markdown", - "id": "b8316d5d-0a23-4675-b0ee-178711db873a", - "metadata": {}, - "source": [ - "## Deleting from a Table\n", - "\n", - "Use the `delete()` method on tables to delete rows from a table. To choose which rows to delete, provide a filter that matches on the metadata columns. This can delete any number of rows that match the filter, like:\n", - "\n", - "\n", - "```python\n", - "tbl.delete('item = \"fizz\"')\n", - "```\n" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "e7a17de2-08d2-41b7-bd05-f63d1045ab1f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "22\n" - ] - }, - { - "data": { - "text/plain": [ - "12" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(len(tbl))\n", - " \n", - "tbl.delete(\"price = 20.0\")\n", - " \n", - "len(tbl)" - ] - }, - { - "cell_type": "markdown", - "id": "74ac180b-5432-4c14-b1a8-22c35ac83af8", - "metadata": {}, - "source": [ - "### Delete from a list of values" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "fe3310bd-08f4-4a22-a63b-b3127d22f9f7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " vector item price\n", - "0 [3.1, 4.1] foo 10.0\n", - "1 [3.1, 4.1] foo 10.0\n", - "2 [3.1, 4.1] foo 10.0\n", - "3 [3.1, 4.1] foo 10.0\n", - "4 [3.1, 4.1] foo 10.0\n", - "5 [1.3, 1.4] fizz 100.0\n", - "6 [9.5, 56.2] buzz 200.0\n", - "7 [3.1, 4.1] foo 10.0\n", - "8 [3.1, 4.1] foo 10.0\n", - "9 [3.1, 4.1] foo 10.0\n", - "10 [3.1, 4.1] foo 10.0\n", - "11 [3.1, 4.1] foo 10.0\n" - ] - }, - { - "ename": "OSError", - "evalue": "LanceError(IO): Error during planning: column foo does not exist, /Users/runner/work/lance/lance/rust/lance-core/src/error.rs:212:23", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - 
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[17], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m to_remove \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;28mstr\u001b[39m(v) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m to_remove)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(tbl\u001b[38;5;241m.\u001b[39mto_pandas())\n\u001b[0;32m----> 4\u001b[0m \u001b[43mtbl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mitem IN (\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mto_remove\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/Work/LanceDB/lancedb/docs/doc-venv/lib/python3.11/site-packages/lancedb/table.py:872\u001b[0m, in \u001b[0;36mLanceTable.delete\u001b[0;34m(self, where)\u001b[0m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdelete\u001b[39m(\u001b[38;5;28mself\u001b[39m, where: \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m--> 872\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwhere\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/Work/LanceDB/lancedb/docs/doc-venv/lib/python3.11/site-packages/lance/dataset.py:596\u001b[0m, in \u001b[0;36mLanceDataset.delete\u001b[0;34m(self, predicate)\u001b[0m\n\u001b[1;32m 594\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(predicate, pa\u001b[38;5;241m.\u001b[39mcompute\u001b[38;5;241m.\u001b[39mExpression):\n\u001b[1;32m 595\u001b[0m predicate \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mstr\u001b[39m(predicate)\n\u001b[0;32m--> 596\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_ds\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mOSError\u001b[0m: LanceError(IO): Error during planning: column foo does not exist, /Users/runner/work/lance/lance/rust/lance-core/src/error.rs:212:23" - ] - } - ], - "source": [ - "to_remove = [\"foo\", \"buzz\"]\n", - "to_remove = \", \".join(str(v) for v in to_remove)\n", - "print(tbl.to_pandas())\n", - "tbl.delete(f\"item IN ({to_remove})\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "87d5bc21-847f-4c81-b56e-f6dbe5d05aac", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(\n", - " {\n", - " \"vector\": [[3.1, 4.1], [1, 1]],\n", - " \"item\": [\"foo\", \"bar\"],\n", - " \"price\": [10.0, 20.0],\n", - " })\n", - "\n", - "tbl = db.create_table(\"table7\", data=df, mode=\"overwrite\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9cba4519-eb3a-4941-ab7e-873d762e750f", - "metadata": {}, - "outputs": [], - "source": [ - "to_remove = [10.0, 20.0]\n", - "to_remove = \", \".join(str(v) for v in to_remove)\n", - "\n", - "tbl.delete(f\"price IN ({to_remove})\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5bdc9801-d5ed-4871-92d0-88b27108e788", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vectoritemprice
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [vector, item, price]\n", - "Index: []" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tbl.to_pandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "752d33d4-ce1c-48e5-90d2-c85f0982182d", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/src/python/datafusion.md b/docs/src/python/datafusion.md deleted file mode 100644 index 6e8e4120..00000000 --- a/docs/src/python/datafusion.md +++ /dev/null @@ -1,53 +0,0 @@ -# Apache Datafusion - -In Python, LanceDB tables can also be queried with [Apache Datafusion](https://datafusion.apache.org/), an extensible query engine written in Rust that uses Apache Arrow as its in-memory format. This means you can write complex SQL queries to analyze your data in LanceDB. - -This integration is done via [Datafusion FFI](https://docs.rs/datafusion-ffi/latest/datafusion_ffi/), which provides a native integration between LanceDB and Datafusion. -The Datafusion FFI allows to pass down column selections and basic filters to LanceDB, reducing the amount of scanned data when executing your query. Additionally, the integration allows streaming data from LanceDB tables which allows to do aggregation larger-than-memory. - -We can demonstrate this by first installing `datafusion` and `lancedb`. 
- -```shell -pip install datafusion lancedb -``` - -We will re-use the dataset [created previously](./pandas_and_pyarrow.md): - -```python -import lancedb - -from datafusion import SessionContext -from lance import FFILanceTableProvider - -db = lancedb.connect("data/sample-lancedb") -data = [ - {"vector": [3.1, 4.1], "item": "foo", "price": 10.0}, - {"vector": [5.9, 26.5], "item": "bar", "price": 20.0} -] -lance_table = db.create_table("lance_table", data) - -ctx = SessionContext() - -ffi_lance_table = FFILanceTableProvider( - lance_table.to_lance(), with_row_id=True, with_row_addr=True -) -ctx.register_table_provider("ffi_lance_table", ffi_lance_table) -``` - -The `to_lance` method converts the LanceDB table to a `LanceDataset`, which is accessible to Datafusion through the Datafusion FFI integration layer. -To query the resulting Lance dataset in Datafusion, you first need to register the dataset with Datafusion and then just reference it by the same name in your SQL query. - -```python -ctx.table("ffi_lance_table") -ctx.sql("SELECT * FROM ffi_lance_table") -``` - -``` -┌─────────────┬─────────┬────────┬─────────────────┬─────────────────┐ -│ vector │ item │ price │ _rowid │ _rowaddr │ -│ float[] │ varchar │ double │ bigint unsigned │ bigint unsigned │ -├─────────────┼─────────┼────────┼─────────────────┼─────────────────┤ -│ [3.1, 4.1] │ foo │ 10.0 │ 0 │ 0 │ -│ [5.9, 26.5] │ bar │ 20.0 │ 1 │ 1 │ -└─────────────┴─────────┴────────┴─────────────────┴─────────────────┘ -``` diff --git a/docs/src/python/duckdb.md b/docs/src/python/duckdb.md deleted file mode 100644 index 08ef66c5..00000000 --- a/docs/src/python/duckdb.md +++ /dev/null @@ -1,61 +0,0 @@ -# DuckDB - -In Python, LanceDB tables can also be queried with [DuckDB](https://duckdb.org/), an in-process SQL OLAP database. This means you can write complex SQL queries to analyze your data in LanceDB. 
- -This integration is done via [Apache Arrow](https://duckdb.org/docs/guides/python/sql_on_arrow), which provides zero-copy data sharing between LanceDB and DuckDB. DuckDB is capable of passing down column selections and basic filters to LanceDB, reducing the amount of data that needs to be scanned to perform your query. Finally, the integration allows streaming data from LanceDB tables, allowing you to aggregate tables that won't fit into memory. All of this uses the same mechanism described in DuckDB's blog post *[DuckDB quacks Arrow](https://duckdb.org/2021/12/03/duck-arrow.html)*. - - -We can demonstrate this by first installing `duckdb` and `lancedb`. - -```shell -pip install duckdb lancedb -``` - -We will re-use the dataset [created previously](./pandas_and_pyarrow.md): - -```python -import lancedb - -db = lancedb.connect("data/sample-lancedb") -data = [ - {"vector": [3.1, 4.1], "item": "foo", "price": 10.0}, - {"vector": [5.9, 26.5], "item": "bar", "price": 20.0} -] -table = db.create_table("pd_table", data=data) -``` - -The `to_lance` method converts the LanceDB table to a `LanceDataset`, which is accessible to DuckDB through the Arrow compatibility layer. -To query the resulting Lance dataset in DuckDB, all you need to do is reference the dataset by the same name in your SQL query. - -```python -import duckdb - -arrow_table = table.to_lance() - -duckdb.query("SELECT * FROM arrow_table") -``` - -``` -┌─────────────┬─────────┬────────┐ -│ vector │ item │ price │ -│ float[] │ varchar │ double │ -├─────────────┼─────────┼────────┤ -│ [3.1, 4.1] │ foo │ 10.0 │ -│ [5.9, 26.5] │ bar │ 20.0 │ -└─────────────┴─────────┴────────┘ -``` - -You can very easily run any other DuckDB SQL queries on your data. 
- -```py -duckdb.query("SELECT mean(price) FROM arrow_table") -``` - -``` -┌─────────────┐ -│ mean(price) │ -│ double │ -├─────────────┤ -│ 15.0 │ -└─────────────┘ -``` \ No newline at end of file diff --git a/docs/src/python/pandas_and_pyarrow.md b/docs/src/python/pandas_and_pyarrow.md deleted file mode 100644 index c597becf..00000000 --- a/docs/src/python/pandas_and_pyarrow.md +++ /dev/null @@ -1,97 +0,0 @@ -# Pandas and PyArrow - -Because Lance is built on top of [Apache Arrow](https://arrow.apache.org/), -LanceDB is tightly integrated with the Python data ecosystem, including [Pandas](https://pandas.pydata.org/) -and PyArrow. The sequence of steps in a typical workflow is shown below. - -## Create dataset - -First, we need to connect to a LanceDB database. - -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_python.py:import-lancedb" - --8<-- "python/python/tests/docs/test_python.py:connect_to_lancedb" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_python.py:import-lancedb" - --8<-- "python/python/tests/docs/test_python.py:connect_to_lancedb_async" - ``` - -We can load a Pandas `DataFrame` to LanceDB directly. - -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_python.py:import-pandas" - --8<-- "python/python/tests/docs/test_python.py:create_table_pandas" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_python.py:import-pandas" - --8<-- "python/python/tests/docs/test_python.py:create_table_pandas_async" - ``` - -Similar to the [`pyarrow.write_dataset()`](https://arrow.apache.org/docs/python/generated/pyarrow.dataset.write_dataset.html) method, LanceDB's -[`db.create_table()`](python.md/#lancedb.db.DBConnection.create_table) accepts data in a variety of forms. 
- -If you have a dataset that is larger than memory, you can create a table with `Iterator[pyarrow.RecordBatch]` to lazily load the data: - -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_python.py:import-iterable" - --8<-- "python/python/tests/docs/test_python.py:import-pyarrow" - --8<-- "python/python/tests/docs/test_python.py:make_batches" - --8<-- "python/python/tests/docs/test_python.py:create_table_iterable" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_python.py:import-iterable" - --8<-- "python/python/tests/docs/test_python.py:import-pyarrow" - --8<-- "python/python/tests/docs/test_python.py:make_batches" - --8<-- "python/python/tests/docs/test_python.py:create_table_iterable_async" - ``` - -You will find detailed instructions of creating a LanceDB dataset in -[Getting Started](../basic.md#quick-start) and [API](python.md/#lancedb.db.DBConnection.create_table) -sections. - -## Vector search - -We can now perform similarity search via the LanceDB Python API. - -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_python.py:vector_search" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_python.py:vector_search_async" - ``` - -``` - vector item price _distance -0 [5.9, 26.5] bar 20.0 14257.05957 -``` - -If you have a simple filter, it's faster to provide a `where` clause to LanceDB's `search` method. -For more complex filters or aggregations, you can always resort to using the underlying `DataFrame` methods after performing a search. 
- -=== "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_python.py:vector_search_with_filter" - ``` -=== "Async API" - - ```python - --8<-- "python/python/tests/docs/test_python.py:vector_search_with_filter_async" - ``` diff --git a/docs/src/python/polars_arrow.md b/docs/src/python/polars_arrow.md deleted file mode 100644 index 51a281a3..00000000 --- a/docs/src/python/polars_arrow.md +++ /dev/null @@ -1,141 +0,0 @@ -# Polars - -LanceDB supports [Polars](https://github.com/pola-rs/polars), a blazingly fast DataFrame library for Python written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow under the hood. A deeper integration between Lance Tables and Polars DataFrames is in progress, but at the moment, you can read a Polars DataFrame into LanceDB and output the search results from a query to a Polars DataFrame. - - -## Create & Query LanceDB Table - -### From Polars DataFrame - -First, we connect to a LanceDB database. - -=== "Sync API" - - ```py - --8<-- "python/python/tests/docs/test_python.py:import-lancedb" - --8<-- "python/python/tests/docs/test_python.py:connect_to_lancedb" - ``` - -=== "Async API" - - ```py - --8<-- "python/python/tests/docs/test_python.py:import-lancedb" - --8<-- "python/python/tests/docs/test_python.py:connect_to_lancedb_async" - ``` - - -We can load a Polars `DataFrame` to LanceDB directly. - -=== "Sync API" - - ```py - --8<-- "python/python/tests/docs/test_python.py:import-polars" - --8<-- "python/python/tests/docs/test_python.py:create_table_polars" - ``` - -=== "Async API" - - ```py - --8<-- "python/python/tests/docs/test_python.py:import-polars" - --8<-- "python/python/tests/docs/test_python.py:create_table_polars_async" - ``` - -We can now perform similarity search via the LanceDB Python API. 
- -=== "Sync API" - - ```py - --8<-- "python/python/tests/docs/test_python.py:vector_search_polars" - ``` - -=== "Async API" - - ```py - --8<-- "python/python/tests/docs/test_python.py:vector_search_polars_async" - ``` - -In addition to the selected columns, LanceDB also returns a vector -and also the `_distance` column which is the distance between the query -vector and the returned vector. - -``` -shape: (1, 4) -┌───────────────┬──────┬───────┬───────────┐ -│ vector ┆ item ┆ price ┆ _distance │ -│ --- ┆ --- ┆ --- ┆ --- │ -│ array[f32, 2] ┆ str ┆ f64 ┆ f32 │ -╞═══════════════╪══════╪═══════╪═══════════╡ -│ [3.1, 4.1] ┆ foo ┆ 10.0 ┆ 0.0 │ -└───────────────┴──────┴───────┴───────────┘ - -``` - -Note that the type of the result from a table search is a Polars DataFrame. - -### From Pydantic Models - -Alternately, we can create an empty LanceDB Table using a Pydantic schema and populate it with a Polars DataFrame. - -```py ---8<-- "python/python/tests/docs/test_python.py:import-polars" ---8<-- "python/python/tests/docs/test_python.py:import-lancedb-pydantic" ---8<-- "python/python/tests/docs/test_python.py:class_Item" ---8<-- "python/python/tests/docs/test_python.py:create_table_pydantic" -``` - -The table can now be queried as usual. - -```py ---8<-- "python/python/tests/docs/test_python.py:vector_search_polars" -``` - -``` -shape: (1, 4) -┌───────────────┬──────┬───────┬───────────┐ -│ vector ┆ item ┆ price ┆ _distance │ -│ --- ┆ --- ┆ --- ┆ --- │ -│ array[f32, 2] ┆ str ┆ f64 ┆ f32 │ -╞═══════════════╪══════╪═══════╪═══════════╡ -│ [3.1, 4.1] ┆ foo ┆ 10.0 ┆ 0.02 │ -└───────────────┴──────┴───────┴───────────┘ - -``` - -This result is the same as the previous one, with a DataFrame returned. - -## Dump Table to LazyFrame - -As you iterate on your application, you'll likely need to work with the whole table's data pretty frequently. -LanceDB tables can also be converted directly into a polars LazyFrame for further processing. 
- -```python ---8<-- "python/python/tests/docs/test_python.py:dump_table_lazyform" -``` - -Unlike the search result from a query, we can see that the type of the result is a LazyFrame. - -``` - -``` - -We can now work with the LazyFrame as we would in Polars, and collect the first result. - -```python ---8<-- "python/python/tests/docs/test_python.py:print_table_lazyform" -``` - -``` -shape: (1, 3) -┌───────────────┬──────┬───────┐ -│ vector ┆ item ┆ price │ -│ --- ┆ --- ┆ --- │ -│ array[f32, 2] ┆ str ┆ f64 │ -╞═══════════════╪══════╪═══════╡ -│ [3.1, 4.1] ┆ foo ┆ 10.0 │ -└───────────────┴──────┴───────┘ -``` - -The reason it's beneficial to not convert the LanceDB Table -to a DataFrame is because the table can potentially be way larger -than memory, and Polars LazyFrames allow us to work with such -larger-than-memory datasets by not loading it into memory all at once. diff --git a/docs/src/python/pydantic.md b/docs/src/python/pydantic.md deleted file mode 100644 index bdd521ea..00000000 --- a/docs/src/python/pydantic.md +++ /dev/null @@ -1,47 +0,0 @@ -# Pydantic - -[Pydantic](https://docs.pydantic.dev/latest/) is a data validation library in Python. -LanceDB integrates with Pydantic for schema inference, data ingestion, and query result casting. -Using [LanceModel][lancedb.pydantic.LanceModel], users can seamlessly -integrate Pydantic with the rest of the LanceDB APIs. - -```python - ---8<-- "python/python/tests/docs/test_pydantic_integration.py:imports" - ---8<-- "python/python/tests/docs/test_pydantic_integration.py:base_model" - ---8<-- "python/python/tests/docs/test_pydantic_integration.py:set_url" ---8<-- "python/python/tests/docs/test_pydantic_integration.py:base_example" -``` - - -## Vector Field - -LanceDB provides a [`Vector(dim)`](python.md#lancedb.pydantic.Vector) method to define a -vector Field in a Pydantic Model. 
- -::: lancedb.pydantic.Vector - -## Type Conversion - -LanceDB automatically convert Pydantic fields to -[Apache Arrow DataType](https://arrow.apache.org/docs/python/generated/pyarrow.DataType.html#pyarrow.DataType). - -Current supported type conversions: - -| Pydantic Field Type | PyArrow Data Type | -| ------------------- | ----------------- | -| `int` | `pyarrow.int64` | -| `float` | `pyarrow.float64` | -| `bool` | `pyarrow.bool` | -| `str` | `pyarrow.utf8()` | -| `list` | `pyarrow.List` | -| `BaseModel` | `pyarrow.Struct` | -| `Vector(n)` | `pyarrow.FixedSizeList(float32, n)` | - -LanceDB supports to create Apache Arrow Schema from a -[Pydantic BaseModel][pydantic.BaseModel] -via [pydantic_to_schema()](python.md#lancedb.pydantic.pydantic_to_schema) method. - -::: lancedb.pydantic.pydantic_to_schema diff --git a/docs/src/python/python.md b/docs/src/python/python.md index 8e2c456b..a5ce089e 100644 --- a/docs/src/python/python.md +++ b/docs/src/python/python.md @@ -1,7 +1,6 @@ # Python API Reference -This section contains the API reference for the Python API. There is a -synchronous and an asynchronous API client. +This section contains the API reference for the Python API of [LanceDB](https://github.com/lancedb/lancedb). Both synchronous and asynchronous APIs are available. The general flow of using the API is: diff --git a/docs/src/python/saas-python.md b/docs/src/python/saas-python.md deleted file mode 100644 index 9e549411..00000000 --- a/docs/src/python/saas-python.md +++ /dev/null @@ -1,24 +0,0 @@ -# Python API Reference (SaaS) - -This section contains the API reference for the LanceDB Cloud Python API. 
- -## Installation - -```shell -pip install lancedb -``` - -## Connection - -::: lancedb.connect - -::: lancedb.remote.db.RemoteDBConnection - -## Table - -::: lancedb.remote.table.RemoteTable - options: - filters: - - "!cleanup_old_versions" - - "!compact_files" - - "!optimize" diff --git a/docs/src/rag/adaptive_rag.md b/docs/src/rag/adaptive_rag.md deleted file mode 100644 index f93ff980..00000000 --- a/docs/src/rag/adaptive_rag.md +++ /dev/null @@ -1,51 +0,0 @@ -**Adaptive RAG 🤹‍♂️** -==================================================================== -Adaptive RAG introduces a RAG technique that combines query analysis with self-corrective RAG. - -For Query Analysis, it uses a small classifier(LLM), to decide the query’s complexity. Query Analysis guides adjustment between different retrieval strategies: No retrieval, Single-shot RAG or Iterative RAG. - -**[Official Paper](https://arxiv.org/pdf/2403.14403)** - -
- ![agent-based-rag](https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/rag/adaptive_rag.png) -
Adaptive-RAG: Source -
-
- -**[Official Implementation](https://github.com/starsuzi/Adaptive-RAG)** - -Here’s a code snippet for query analysis: - -```python -from langchain_core.prompts import ChatPromptTemplate -from langchain_core.pydantic_v1 import BaseModel, Field -from langchain_openai import ChatOpenAI - -class RouteQuery(BaseModel): - """Route a user query to the most relevant datasource.""" - - datasource: Literal["vectorstore", "web_search"] = Field( - ..., - description="Given a user question choose to route it to web search or a vectorstore.", - ) - - -# LLM with function call -llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0) -structured_llm_router = llm.with_structured_output(RouteQuery) -``` - -The following example defines and queries a retriever: - -```python -# add documents in LanceDB -vectorstore = LanceDB.from_documents( - documents=doc_splits, - embedding=OpenAIEmbeddings(), -) -retriever = vectorstore.as_retriever() - -# query using defined retriever -question = "How adaptive RAG works" -docs = retriever.get_relevant_documents(question) -``` diff --git a/docs/src/rag/advanced_techniques/flare.md b/docs/src/rag/advanced_techniques/flare.md deleted file mode 100644 index 5f2fd24a..00000000 --- a/docs/src/rag/advanced_techniques/flare.md +++ /dev/null @@ -1,38 +0,0 @@ -**FLARE 💥** -==================================================================== -FLARE, stands for Forward-Looking Active REtrieval augmented generation is a generic retrieval-augmented generation method that actively decides when and what to retrieve using a prediction of the upcoming sentence to anticipate future content and utilize it as the query to retrieve relevant documents if it contains low-confidence tokens. - -**[Official Paper](https://arxiv.org/abs/2305.06983)** - -
- ![flare](https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/rag/flare.gif) -
FLARE: Source
-
- -[![Open In Colab](../../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/better-rag-FLAIR/main.ipynb) - -Here’s a code snippet for using FLARE with Langchain: - -```python -from langchain.vectorstores import LanceDB -from langchain.document_loaders import ArxivLoader -from langchain.chains import FlareChain -from langchain.prompts import PromptTemplate -from langchain.chains import LLMChain -from langchain.llms import OpenAI - -llm = OpenAI() - -# load dataset - -# LanceDB retriever -vector_store = LanceDB.from_documents(doc_chunks, embeddings, connection=table) -retriever = vector_store.as_retriever() - -# define flare chain -flare = FlareChain.from_llm(llm=llm,retriever=vector_store_retriever,max_generation_len=300,min_prob=0.45) - -result = flare.run(input_text) -``` - -[![Open In Colab](../../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/better-rag-FLAIR/main.ipynb) diff --git a/docs/src/rag/advanced_techniques/hyde.md b/docs/src/rag/advanced_techniques/hyde.md deleted file mode 100644 index a91008b7..00000000 --- a/docs/src/rag/advanced_techniques/hyde.md +++ /dev/null @@ -1,55 +0,0 @@ -**HyDE: Hypothetical Document Embeddings 🤹‍♂️** -==================================================================== -HyDE, stands for Hypothetical Document Embeddings is an approach used for precise zero-shot dense retrieval without relevance labels. It focuses on augmenting and improving similarity searches, often intertwined with vector stores in information retrieval. The method generates a hypothetical document for an incoming query, which is then embedded and used to look up real documents that are similar to the hypothetical document. - -**[Official Paper](https://arxiv.org/pdf/2212.10496)** - -
- ![hyde](https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/rag/hyde.png) -
HyDE: Source
-
- -[![Open In Colab](../../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Advance-RAG-with-HyDE/main.ipynb) - -Here’s a code snippet for using HyDE with Langchain: - -```python -from langchain.llms import OpenAI -from langchain.embeddings import OpenAIEmbeddings -from langchain.prompts import PromptTemplate -from langchain.chains import LLMChain, HypotheticalDocumentEmbedder -from langchain.vectorstores import LanceDB - -# set OPENAI_API_KEY as env variable before this step -# initialize LLM and embedding function -llm = OpenAI() -emebeddings = OpenAIEmbeddings() - -# HyDE embedding -embeddings = HypotheticalDocumentEmbedder(llm_chain=llm_chain,base_embeddings=embeddings) - -# load dataset - -# LanceDB retriever -retriever = LanceDB.from_documents(documents, embeddings, connection=table) - -# prompt template -prompt_template = """ -As a knowledgeable and helpful research assistant, your task is to provide informative answers based on the given context. Use your extensive knowledge base to offer clear, concise, and accurate responses to the user's inquiries. 
-if quetion is not related to documents simply say you dont know -Question: {question} - -Answer: -""" - -prompt = PromptTemplate(input_variables=["question"], template=prompt_template) - -# LLM Chain -llm_chain = LLMChain(llm=llm, prompt=prompt) - -# vector search -retriever.similarity_search(query) -llm_chain.run(query) -``` - -[![Open In Colab](../../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Advance-RAG-with-HyDE/main.ipynb) diff --git a/docs/src/rag/agentic_rag.md b/docs/src/rag/agentic_rag.md deleted file mode 100644 index 0ce3152a..00000000 --- a/docs/src/rag/agentic_rag.md +++ /dev/null @@ -1,101 +0,0 @@ -**Agentic RAG 🤖** -==================================================================== -Agentic RAG introduces an advanced framework for answering questions by using intelligent agents instead of just relying on large language models. These agents act like expert researchers, handling complex tasks such as detailed planning, multi-step reasoning, and using external tools. They navigate multiple documents, compare information, and generate accurate answers. This system is easily scalable, with each new document set managed by a sub-agent, making it a powerful tool for tackling a wide range of information needs. - -
- ![agent-based-rag](https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/rag/agentic_rag.png) -
Agent-based RAG
-
- -[![Open In Colab](../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/Agentic_RAG/main.ipynb) - -Here’s a code snippet for defining retriever using Langchain: - -```python -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_community.document_loaders import WebBaseLoader -from langchain_community.vectorstores import LanceDB -from langchain_openai import OpenAIEmbeddings - -urls = [ - "https://content.dgft.gov.in/Website/CIEP.pdf", - "https://content.dgft.gov.in/Website/GAE.pdf", - "https://content.dgft.gov.in/Website/HTE.pdf", -] - - -docs = [WebBaseLoader(url).load() for url in urls] -docs_list = [item for sublist in docs for item in sublist] - -text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( - chunk_size=100, chunk_overlap=50 -) -doc_splits = text_splitter.split_documents(docs_list) - -# add documents in LanceDB -vectorstore = LanceDB.from_documents( - documents=doc_splits, - embedding=OpenAIEmbeddings(), -) -retriever = vectorstore.as_retriever() - -``` - -Here is an agent that formulates an improved query for better retrieval results and then grades the retrieved documents: - -```python -def grade_documents(state) -> Literal["generate", "rewrite"]: - class grade(BaseModel): - binary_score: str = Field(description="Relevance score 'yes' or 'no'") - - model = ChatOpenAI(temperature=0, model="gpt-4-0125-preview", streaming=True) - llm_with_tool = model.with_structured_output(grade) - prompt = PromptTemplate( - template="""You are a grader assessing relevance of a retrieved document to a user question. \n - Here is the retrieved document: \n\n {context} \n\n - Here is the user question: {question} \n - If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. 
\n - Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.""", - input_variables=["context", "question"], - ) - chain = prompt | llm_with_tool - - messages = state["messages"] - last_message = messages[-1] - question = messages[0].content - docs = last_message.content - - scored_result = chain.invoke({"question": question, "context": docs}) - score = scored_result.binary_score - - return "generate" if score == "yes" else "rewrite" - - -def agent(state): - messages = state["messages"] - model = ChatOpenAI(temperature=0, streaming=True, model="gpt-4-turbo") - model = model.bind_tools(tools) - response = model.invoke(messages) - return {"messages": [response]} - - -def rewrite(state): - messages = state["messages"] - question = messages[0].content - msg = [ - HumanMessage( - content=f""" \n - Look at the input and try to reason about the underlying semantic intent / meaning. \n - Here is the initial question: - \n ------- \n - {question} - \n ------- \n - Formulate an improved question: """, - ) - ] - model = ChatOpenAI(temperature=0, model="gpt-4-0125-preview", streaming=True) - response = model.invoke(msg) - return {"messages": [response]} -``` - -[![Open In Colab](../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/Agentic_RAG/main.ipynb) diff --git a/docs/src/rag/corrective_rag.md b/docs/src/rag/corrective_rag.md deleted file mode 100644 index a6a53eca..00000000 --- a/docs/src/rag/corrective_rag.md +++ /dev/null @@ -1,120 +0,0 @@ -**Corrective RAG ✅** -==================================================================== - -Corrective-RAG (CRAG) is a strategy for Retrieval-Augmented Generation (RAG) that includes self-reflection and self-grading of retrieved documents. Here’s a simplified breakdown of the steps involved: - -1. **Relevance Check**: If at least one document meets the relevance threshold, the process moves forward to the generation phase. -2. 
**Knowledge Refinement**: Before generating an answer, the process refines the knowledge by dividing the document into smaller segments called "knowledge strips". -3. **Grading and Filtering**: Each "knowledge strip" is graded, and irrelevant ones are filtered out. -4. **Additional Data Source**: If all documents are below the relevance threshold, or if the system is unsure about their relevance, it will seek additional information by performing a web search to supplement the retrieved data. - -The above steps are described in the -**[Official Paper](https://arxiv.org/abs/2401.15884)** - -
- ![agent-based-rag](https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/rag/crag_paper.png) -
Corrective RAG: Source -
-
- -Corrective Retrieval-Augmented Generation (CRAG) is a method that works like a **built-in fact-checker**. - -**[Official Implementation](https://github.com/HuskyInSalt/CRAG)** - -[![Open In Colab](../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/Corrective-RAG-with_Langgraph/CRAG_with_Langgraph.ipynb) - -Here’s a code snippet for defining a table with the [Embedding API](https://lancedb.github.io/lancedb/embeddings/embedding_functions/), and retrieves the relevant documents: - -```python -import pandas as pd -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect("/tmp/db") -model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") - -class Docs(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - -table = db.create_table("docs", schema=Docs) - -# considering chunks are in list format -df = pd.DataFrame({'text':chunks}) -table.add(data=df) - -# as per document feeded -query = "How Transformers work?" 
-actual = table.search(query).limit(1).to_list()[0] -print(actual.text) -``` - -Code snippet for grading retrieved documents, filtering out irrelevant ones, and performing a web search if necessary: - -```python -def grade_documents(state): - """ - Determines whether the retrieved documents are relevant to the question - - Args: - state (dict): The current graph state - - Returns: - state (dict): Updates documents key with relevant documents - """ - - state_dict = state["keys"] - question = state_dict["question"] - documents = state_dict["documents"] - - class grade(BaseModel): - """ - Binary score for relevance check - """ - - binary_score: str = Field(description="Relevance score 'yes' or 'no'") - - model = ChatOpenAI(temperature=0, model="gpt-4-0125-preview", streaming=True) - # grading using openai - grade_tool_oai = convert_to_openai_tool(grade) - llm_with_tool = model.bind( - tools=[convert_to_openai_tool(grade_tool_oai)], - tool_choice={"type": "function", "function": {"name": "grade"}}, - ) - - parser_tool = PydanticToolsParser(tools=[grade]) - prompt = PromptTemplate( - template="""You are a grader assessing relevance of a retrieved document to a user question. \n - Here is the retrieved document: \n\n {context} \n\n - Here is the user question: {question} \n - If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. 
\n - Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.""", - input_variables=["context", "question"], - ) - - chain = prompt | llm_with_tool | parser_tool - - filtered_docs = [] - search = "No" - for d in documents: - score = chain.invoke({"question": question, "context": d.page_content}) - grade = score[0].binary_score - if grade == "yes": - filtered_docs.append(d) - else: - search = "Yes" - continue - - return { - "keys": { - "documents": filtered_docs, - "question": question, - "run_web_search": search, - } - } -``` - -Check Colab for the Implementation of CRAG with Langgraph: - -[![Open In Colab](../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/Corrective-RAG-with_Langgraph/CRAG_with_Langgraph.ipynb) diff --git a/docs/src/rag/graph_rag.md b/docs/src/rag/graph_rag.md deleted file mode 100644 index ff099c8a..00000000 --- a/docs/src/rag/graph_rag.md +++ /dev/null @@ -1,54 +0,0 @@ -**Graph RAG 📊** -==================================================================== -Graph RAG uses knowledge graphs together with large language models (LLMs) to improve how information is retrieved and generated. It overcomes the limits of traditional search methods by using knowledge graphs, which organize data as connected entities and relationships. - -One of the main benefits of Graph RAG is its ability to capture and represent complex relationships between entities, something that traditional text-based retrieval systems struggle with. By using this structured knowledge, LLMs can better grasp the context and details of a query, resulting in more accurate and insightful answers. - -**[Official Paper](https://arxiv.org/pdf/2404.16130)** - -**[Official Implementation](https://github.com/microsoft/graphrag)** - -[Microsoft Research Blog](https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/) - -!!! 
note "Default VectorDB" - - Graph RAG uses LanceDB as the default vector database for performing vector search to retrieve relevant entities. - -Working with Graph RAG is quite straightforward - -- **Installation and API KEY as env variable** - -Set `OPENAI_API_KEY` as `GRAPHRAG_API_KEY` - -```bash -pip install graphrag -export GRAPHRAG_API_KEY="sk-..." -``` - -- **Initial structure for indexing dataset** - -```bash -python3 -m graphrag.index --init --root dataset-dir -``` - -- **Index Dataset** - -```bash -python3 -m graphrag.index --root dataset-dir -``` - -- **Execute Query** - -Global Query Execution gives a broad overview of the dataset: - -```bash -python3 -m graphrag.query --root dataset-dir --method global "query-question" -``` - -Local Query Execution gives detailed and specific answers based on the context of the entities: - -```bash -python3 -m graphrag.query --root dataset-dir --method local "query-question" -``` - -[![Open In Colab](../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Graphrag/main.ipynb) diff --git a/docs/src/rag/multi_head_rag.md b/docs/src/rag/multi_head_rag.md deleted file mode 100644 index 74863420..00000000 --- a/docs/src/rag/multi_head_rag.md +++ /dev/null @@ -1,49 +0,0 @@ -**Multi-Head RAG 📃** -==================================================================== - -Multi-head RAG (MRAG) is designed to handle queries that need multiple documents with diverse content. These queries are tough because the documents’ embeddings can be far apart, making retrieval difficult. MRAG simplifies this by using the activations from a Transformer's multi-head attention layer, rather than the decoder layer, to fetch these varied documents. Different attention heads capture different aspects of the data, so using these activations helps create embeddings that better represent various data facets and improves retrieval accuracy for complex queries. 
- -**[Official Paper](https://arxiv.org/pdf/2406.05085)** - -
- ![agent-based-rag](https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/rag/mrag-paper.png) -
Multi-Head RAG: Source -
-
- -MRAG is cost-effective and energy-efficient because it avoids extra LLM queries, multiple model instances, increased storage, and additional inference passes. - -**[Official Implementation](https://github.com/spcl/MRAG)** - -Here’s a code snippet for defining different embedding spaces with the [Embedding API](https://lancedb.github.io/lancedb/embeddings/embedding_functions/): - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -# model definition using LanceDB Embedding API -model1 = get_registry().get("openai").create() -model2 = get_registry().get("ollama").create(name="llama3") -model3 = get_registry().get("ollama").create(name="mistral") - - -# define schema for creating embedding spaces with Embedding API -class Space1(LanceModel): - text: str = model1.SourceField() - vector: Vector(model1.ndims()) = model1.VectorField() - - -class Space2(LanceModel): - text: str = model2.SourceField() - vector: Vector(model2.ndims()) = model2.VectorField() - - -class Space3(LanceModel): - text: str = model3.SourceField() - vector: Vector(model3.ndims()) = model3.VectorField() -``` - -Create different tables using defined embedding spaces, then make queries to each embedding space. Use the resulting closest documents from each embedding space to generate answers. - - diff --git a/docs/src/rag/self_rag.md b/docs/src/rag/self_rag.md deleted file mode 100644 index aaa09805..00000000 --- a/docs/src/rag/self_rag.md +++ /dev/null @@ -1,96 +0,0 @@ -**Self RAG 🤳** -==================================================================== -Self-RAG is a strategy for Retrieval-Augmented Generation (RAG) to get better retrieved information, generated text, and validation, without loss of flexibility. Unlike the traditional Retrieval-Augmented Generation (RAG) method, Self-RAG retrieves information as needed, can skip retrieval if not needed, and evaluates its own output while generating text. 
It also uses a process to pick the best output based on different preferences. - -**[Official Paper](https://arxiv.org/pdf/2310.11511)** - -
- ![agent-based-rag](https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/rag/self_rag.png) -
Self RAG: Source -
-
- -**[Official Implementation](https://github.com/AkariAsai/self-rag)** - -Self-RAG starts by generating a response without retrieving extra info if it's not needed. For questions that need more details, it retrieves to get the necessary information. - -Here’s a code snippet for defining retriever using Langchain: - -```python -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_community.document_loaders import WebBaseLoader -from langchain_community.vectorstores import LanceDB -from langchain_openai import OpenAIEmbeddings - -urls = [ - "https://lilianweng.github.io/posts/2023-06-23-agent/", - "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/", - "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/", -] - - -docs = [WebBaseLoader(url).load() for url in urls] -docs_list = [item for sublist in docs for item in sublist] - -text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( - chunk_size=100, chunk_overlap=50 -) -doc_splits = text_splitter.split_documents(docs_list) - -# add documents in LanceDB -vectorstore = LanceDB.from_documents( - documents=doc_splits, - embedding=OpenAIEmbeddings(), -) -retriever = vectorstore.as_retriever() - -``` - -The following functions grade the retrieved documents and formulate an improved query for better retrieval results, if required: - -```python -def grade_documents(state) -> Literal["generate", "rewrite"]: - class grade(BaseModel): - binary_score: str = Field(description="Relevance score 'yes' or 'no'") - - model = ChatOpenAI(temperature=0, model="gpt-4-0125-preview", streaming=True) - llm_with_tool = model.with_structured_output(grade) - prompt = PromptTemplate( - template="""You are a grader assessing relevance of a retrieved document to a user question. 
\n - Here is the retrieved document: \n\n {context} \n\n - Here is the user question: {question} \n - If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n - Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.""", - input_variables=["context", "question"], - ) - chain = prompt | llm_with_tool - - messages = state["messages"] - last_message = messages[-1] - question = messages[0].content - docs = last_message.content - - scored_result = chain.invoke({"question": question, "context": docs}) - score = scored_result.binary_score - - return "generate" if score == "yes" else "rewrite" - - -def rewrite(state): - messages = state["messages"] - question = messages[0].content - msg = [ - HumanMessage( - content=f""" \n - Look at the input and try to reason about the underlying semantic intent / meaning. \n - Here is the initial question: - \n ------- \n - {question} - \n ------- \n - Formulate an improved question: """, - ) - ] - model = ChatOpenAI(temperature=0, model="gpt-4-0125-preview", streaming=True) - response = model.invoke(msg) - return {"messages": [response]} -``` diff --git a/docs/src/rag/sfr_rag.md b/docs/src/rag/sfr_rag.md deleted file mode 100644 index c894480a..00000000 --- a/docs/src/rag/sfr_rag.md +++ /dev/null @@ -1,17 +0,0 @@ -**SFR RAG 📑** -==================================================================== -Salesforce AI Research introduced SFR-RAG, a 9-billion-parameter language model trained with a significant emphasis on reliable, precise, and faithful contextual generation abilities specific to real-world RAG use cases and relevant agentic tasks. 
It targets precise factual knowledge extraction, distinction between relevant and distracting contexts, citation of appropriate sources along with answers, production of complex and multi-hop reasoning over multiple contexts, consistent format following, as well as minimization of hallucination over unanswerable queries. - -**[Official Implementation](https://github.com/SalesforceAIResearch/SFR-RAG)** - -
- ![agent-based-rag](https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/rag/salesforce_contextbench.png) -
Average Scores in ContextualBench: Source -
-
- -To reliably evaluate LLMs in contextual question-answering for RAG, Salesforce introduced [ContextualBench](https://huggingface.co/datasets/Salesforce/ContextualBench?ref=blog.salesforceairesearch.com), featuring 7 benchmarks like [HotpotQA](https://arxiv.org/abs/1809.09600?ref=blog.salesforceairesearch.com) and [2WikiHopQA](https://www.aclweb.org/anthology/2020.coling-main.580/?ref=blog.salesforceairesearch.com) with consistent setups. - -SFR-RAG outperforms GPT-4o, achieving state-of-the-art results in 3 out of 7 benchmarks, and significantly surpasses Command-R+ while using 10 times fewer parameters. It also excels at handling context, even when facts are altered or conflicting. - -[Salesforce AI Research Blog](https://blog.salesforceairesearch.com/sfr-rag/) diff --git a/docs/src/rag/vanilla_rag.md b/docs/src/rag/vanilla_rag.md deleted file mode 100644 index d2a9c464..00000000 --- a/docs/src/rag/vanilla_rag.md +++ /dev/null @@ -1,54 +0,0 @@ -**Vanilla RAG 🌱** -==================================================================== - -RAG (Retrieval-Augmented Generation) works by finding documents related to the user's question, combining them with a prompt for a large language model (LLM), and then using the LLM to create more accurate and relevant answers. - -Here’s a simple guide to building a RAG pipeline from scratch: - -1. **Data Loading**: Gather and load the documents you want to use for answering questions. - -2. **Chunking and Embedding**: Split the documents into smaller chunks and convert them into numerical vectors (embeddings) that capture their meaning. - -3. **Vector Store**: Create a LanceDB table to store and manage these vectors for quick access during retrieval. - -4. **Retrieval & Prompt Preparation**: When a question is asked, find the most relevant document chunks from the table and prepare a prompt combining these chunks with the question. - -5. 
**Answer Generation**: Send the prepared prompt to an LLM to generate a detailed and accurate answer. - -
- ![agent-based-rag](https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/rag/rag_from_scratch.png) -
Vanilla RAG -
-
- -[![Open In Colab](../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/RAG-from-Scratch/RAG_from_Scratch.ipynb) - -Here’s a code snippet for defining a table with the [Embedding API](https://lancedb.github.io/lancedb/embeddings/embedding_functions/), which simplifies the process by handling embedding extraction and querying in one step. - -```python -import pandas as pd -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect("/tmp/db") -model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") - -class Docs(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - -table = db.create_table("docs", schema=Docs) - -# considering chunks are in list format -df = pd.DataFrame({'text':chunks}) -table.add(data=df) - -query = "What is issue date of lease?" -actual = table.search(query).limit(1).to_list()[0] -print(actual.text) -``` - -Check Colab for the complete code - -[![Open In Colab](../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/RAG-from-Scratch/RAG_from_Scratch.ipynb) \ No newline at end of file diff --git a/docs/src/reranking/answerdotai.md b/docs/src/reranking/answerdotai.md deleted file mode 100644 index 114e624e..00000000 --- a/docs/src/reranking/answerdotai.md +++ /dev/null @@ -1,73 +0,0 @@ -# AnswersDotAI Rerankers - -This integration uses [AnswersDotAI's rerankers](https://github.com/AnswerDotAI/rerankers) to rerank the search results, providing a lightweight, low-dependency, unified API to use all common reranking and cross-encoder models. - -!!! 
note - Supported Query Types: Hybrid, Vector, FTS - - -```python -import numpy -import lancedb -from lancedb.embeddings import get_registry -from lancedb.pydantic import LanceModel, Vector -from lancedb.rerankers import AnswerdotaiRerankers - -embedder = get_registry().get("sentence-transformers").create() -db = lancedb.connect("~/.lancedb") - -class Schema(LanceModel): - text: str = embedder.SourceField() - vector: Vector(embedder.ndims()) = embedder.VectorField() - -data = [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] -tbl = db.create_table("test", schema=Schema, mode="overwrite") -tbl.add(data) -reranker = AnswerdotaiRerankers() - -# Run vector search with a reranker -result = tbl.search("hello").rerank(reranker=reranker).to_list() - -# Run FTS search with a reranker -result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list() - -# Run hybrid search with a reranker -tbl.create_fts_index("text", replace=True) -result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list() - -``` - -Accepted Arguments ----------------- -| Argument | Type | Default | Description | -| --- | --- | --- | --- | -| `model_type` | `str` | `"colbert"` | The type of model to use. Supported model types can be found here: https://github.com/AnswerDotAI/rerankers. | -| `model_name` | `str` | `"answerdotai/answerai-colbert-small-v1"` | The name of the reranker model to use. | -| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. | -| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. | - - - -## Supported Scores for each query type -You can specify the type of scores you want the reranker to return. 
The following are the supported scores for each query type: - -### Hybrid Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ❌ Not Supported | Results have vector(`_distance`) and FTS(`score`) along with Hybrid Search score(`_relevance_score`). | - -### Vector Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ✅ Supported | Results have vector(`_distance`) along with Hybrid Search score(`_relevance_score`). | - -### FTS Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ✅ Supported | Results have FTS(`score`) along with Hybrid Search score(`_relevance_score`). | diff --git a/docs/src/reranking/cohere.md b/docs/src/reranking/cohere.md deleted file mode 100644 index 42f7042a..00000000 --- a/docs/src/reranking/cohere.md +++ /dev/null @@ -1,78 +0,0 @@ -# Cohere Reranker - -This reranker uses the [Cohere](https://cohere.ai/) API to rerank the search results. You can use this reranker by passing `CohereReranker()` to the `rerank()` method. Note that you'll either need to set the `COHERE_API_KEY` environment variable or pass the `api_key` argument to use this reranker. - - -!!! 
note - Supported Query Types: Hybrid, Vector, FTS - -```shell -pip install cohere -``` - -```python -import numpy -import lancedb -from lancedb.embeddings import get_registry -from lancedb.pydantic import LanceModel, Vector -from lancedb.rerankers import CohereReranker - -embedder = get_registry().get("sentence-transformers").create() -db = lancedb.connect("~/.lancedb") - -class Schema(LanceModel): - text: str = embedder.SourceField() - vector: Vector(embedder.ndims()) = embedder.VectorField() - -data = [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] -tbl = db.create_table("test", schema=Schema, mode="overwrite") -tbl.add(data) -reranker = CohereReranker(api_key="key") - -# Run vector search with a reranker -result = tbl.search("hello").rerank(reranker=reranker).to_list() - -# Run FTS search with a reranker -result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list() - -# Run hybrid search with a reranker -tbl.create_fts_index("text", replace=True) -result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list() - -``` - -Accepted Arguments ----------------- -| Argument | Type | Default | Description | -| --- | --- | --- | --- | -| `model_name` | `str` | `"rerank-english-v2.0"` | The name of the reranker model to use. Available cohere models are: rerank-english-v2.0, rerank-multilingual-v2.0 | -| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. | -| `top_n` | `str` | `None` | The number of results to return. If None, will return all results. | -| `api_key` | `str` | `None` | The API key for the Cohere API. If not provided, the `COHERE_API_KEY` environment variable is used. | -| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. 
If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type | - - - -## Supported Scores for each query type -You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type: - -### Hybrid Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column | -| `all` | ❌ Not Supported | Results have vector(`_distance`) and FTS(`score`) along with Hybrid Search score(`_relevance_score`) | - -### Vector Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column | -| `all` | ✅ Supported | Results have vector(`_distance`) along with Hybrid Search score(`_relevance_score`) | - -### FTS Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column | -| `all` | ✅ Supported | Results have FTS(`score`) along with Hybrid Search score(`_relevance_score`) | diff --git a/docs/src/reranking/colbert.md b/docs/src/reranking/colbert.md deleted file mode 100644 index 1e166d73..00000000 --- a/docs/src/reranking/colbert.md +++ /dev/null @@ -1,71 +0,0 @@ -# ColBERT Reranker - -This reranker uses ColBERT model to rerank the search results. You can use this reranker by passing `ColbertReranker()` to the `rerank()` method. -!!! 
note - Supported Query Types: Hybrid, Vector, FTS - - -```python -import numpy -import lancedb -from lancedb.embeddings import get_registry -from lancedb.pydantic import LanceModel, Vector -from lancedb.rerankers import ColbertReranker - -embedder = get_registry().get("sentence-transformers").create() -db = lancedb.connect("~/.lancedb") - -class Schema(LanceModel): - text: str = embedder.SourceField() - vector: Vector(embedder.ndims()) = embedder.VectorField() - -data = [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] -tbl = db.create_table("test", schema=Schema, mode="overwrite") -tbl.add(data) -reranker = ColbertReranker() - -# Run vector search with a reranker -result = tbl.search("hello").rerank(reranker=reranker).to_list() - -# Run FTS search with a reranker -result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list() - -# Run hybrid search with a reranker -tbl.create_fts_index("text", replace=True) -result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list() - -``` - -Accepted Arguments ----------------- -| Argument | Type | Default | Description | -| --- | --- | --- | --- | -| `model_name` | `str` | `"colbert-ir/colbertv2.0"` | The name of the reranker model to use.| -| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. | -| `device` | `str` | `None` | The device to use for the cross encoder model. If None, will use "cuda" if available, otherwise "cpu". | -| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. | - - -## Supported Scores for each query type -You can specify the type of scores you want the reranker to return. 
The following are the supported scores for each query type: - -### Hybrid Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ❌ Not Supported | Results have vector(`_distance`) and FTS(`score`) along with Hybrid Search score(`_relevance_score`). | - -### Vector Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ✅ Supported | Results have vector(`_distance`) along with Hybrid Search score(`_relevance_score`). | - -### FTS Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ✅ Supported | Results have FTS(`score`) along with Hybrid Search score(`_relevance_score`). | diff --git a/docs/src/reranking/cross_encoder.md b/docs/src/reranking/cross_encoder.md deleted file mode 100644 index 6061238d..00000000 --- a/docs/src/reranking/cross_encoder.md +++ /dev/null @@ -1,70 +0,0 @@ -# Cross Encoder Reranker - -This reranker uses Cross Encoder models from sentence-transformers to rerank the search results. You can use this reranker by passing `CrossEncoderReranker()` to the `rerank()` method. -!!! 
note - Supported Query Types: Hybrid, Vector, FTS - - -```python -import numpy -import lancedb -from lancedb.embeddings import get_registry -from lancedb.pydantic import LanceModel, Vector -from lancedb.rerankers import CrossEncoderReranker - -embedder = get_registry().get("sentence-transformers").create() -db = lancedb.connect("~/.lancedb") - -class Schema(LanceModel): - text: str = embedder.SourceField() - vector: Vector(embedder.ndims()) = embedder.VectorField() - -data = [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] -tbl = db.create_table("test", schema=Schema, mode="overwrite") -tbl.add(data) -reranker = CrossEncoderReranker() - -# Run vector search with a reranker -result = tbl.search("hello").rerank(reranker=reranker).to_list() - -# Run FTS search with a reranker -result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list() - -# Run hybrid search with a reranker -tbl.create_fts_index("text", replace=True) -result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list() - -``` - -Accepted Arguments ----------------- -| Argument | Type | Default | Description | -| --- | --- | --- | --- | -| `model_name` | `str` | `"cross-encoder/ms-marco-TinyBERT-L-6"` | The name of the reranker model to use. | -| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. | -| `device` | `str` | `None` | The device to use for the cross encoder model. If None, will use "cuda" if available, otherwise "cpu". | -| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. | - -## Supported Scores for each query type -You can specify the type of scores you want the reranker to return. 
The following are the supported scores for each query type: - -### Hybrid Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ❌ Not Supported | Results have vector(`_distance`) and FTS(`score`) along with Hybrid Search score(`_relevance_score`). | - -### Vector Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ✅ Supported | Results have vector(`_distance`) along with Hybrid Search score(`_relevance_score`). | - -### FTS Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ✅ Supported | Results have FTS(`score`) along with Hybrid Search score(`_relevance_score`). | diff --git a/docs/src/reranking/custom_reranker.md b/docs/src/reranking/custom_reranker.md deleted file mode 100644 index bc5bcb50..00000000 --- a/docs/src/reranking/custom_reranker.md +++ /dev/null @@ -1,89 +0,0 @@ -## Building Custom Rerankers -You can build your own custom reranker by subclassing the `Reranker` class and implementing the `rerank_hybrid()` method. Optionally, you can also implement the `rerank_vector()` and `rerank_fts()` methods if you want to support reranking for vector and FTS search separately. - -The `Reranker` base interface comes with a `merge_results()` method that can be used to combine the results of semantic and full-text search. This is a vanilla merging algorithm that simply concatenates the results and removes the duplicates without taking the scores into consideration. It only keeps the first copy of the row encountered. This works well in cases that don't require the scores of semantic and full-text search to combine the results. 
If you want to use the scores or want to support `return_score="all"`, you'll need to implement your own merging algorithm. - -Here's an example of a custom reranker that combines the results of semantic and full-text search using a linear combination of the scores: - -```python - -from lancedb.rerankers import Reranker -import pyarrow as pa - -class MyReranker(Reranker): - def __init__(self, param1, param2, ..., return_score="relevance"): - super().__init__(return_score) - self.param1 = param1 - self.param2 = param2 - - def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table): - # Use the built-in merging function - combined_result = self.merge_results(vector_results, fts_results) - - # Do something with the combined results - # ... - - # Return the combined results - return combined_result - - def rerank_vector(self, query: str, vector_results: pa.Table): - # Do something with the vector results - # ... - - # Return the vector results - return vector_results - - def rerank_fts(self, query: str, fts_results: pa.Table): - # Do something with the FTS results - # ... - - # Return the FTS results - return fts_results - -``` - -### Example of a Custom Reranker -For the sake of simplicity, let's build a custom reranker that enhances the Cohere Reranker by accepting a filter query, and accepts other CohereReranker params as kwargs. 
- -```python - -from typing import List, Union -import pandas as pd -from lancedb.rerankers import CohereReranker - -class ModifiedCohereReranker(CohereReranker): - def __init__(self, filters: Union[str, List[str]], **kwargs): - super().__init__(**kwargs) - filters = filters if isinstance(filters, list) else [filters] - self.filters = filters - - def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table)-> pa.Table: - combined_result = super().rerank_hybrid(query, vector_results, fts_results) - df = combined_result.to_pandas() - for filter in self.filters: - df = df.query("not text.str.contains(@filter)") - - return pa.Table.from_pandas(df) - - def rerank_vector(self, query: str, vector_results: pa.Table)-> pa.Table: - vector_results = super().rerank_vector(query, vector_results) - df = vector_results.to_pandas() - for filter in self.filters: - df = df.query("not text.str.contains(@filter)") - - return pa.Table.from_pandas(df) - - def rerank_fts(self, query: str, fts_results: pa.Table)-> pa.Table: - fts_results = super().rerank_fts(query, fts_results) - df = fts_results.to_pandas() - for filter in self.filters: - df = df.query("not text.str.contains(@filter)") - - return pa.Table.from_pandas(df) - -``` - -!!! tip - The `vector_results` and `fts_results` are pyarrow tables. Learn more about pyarrow tables [here](https://arrow.apache.org/docs/python). They can be converted to other data types like pandas dataframe, pydict, pylist etc. - - For example, you can convert them to pandas dataframes using the `to_pandas()` method and perform any operations you want. After you are done, you can convert the dataframe back to a pyarrow table using the `pa.Table.from_pandas()` method and return it. diff --git a/docs/src/reranking/index.md b/docs/src/reranking/index.md deleted file mode 100644 index e4320366..00000000 --- a/docs/src/reranking/index.md +++ /dev/null @@ -1,81 +0,0 @@ -Reranking is the process of reordering a list of items based on some criteria. 
In the context of search, reranking is used to reorder the search results returned by a search engine based on some criteria. This can be useful when the initial ranking of the search results is not satisfactory or when the user has provided additional information that can be used to improve the ranking of the search results. - -LanceDB comes with some built-in rerankers. Some of the rerankers that are available in LanceDB are: - -| Reranker | Description | Supported Query Types | -| --- | --- | --- | -| `LinearCombinationReranker` | Reranks search results based on a linear combination of FTS and vector search scores | Hybrid | -| `CohereReranker` | Uses cohere Reranker API to rerank results | Vector, FTS, Hybrid | -| `CrossEncoderReranker` | Uses a cross-encoder model to rerank search results | Vector, FTS, Hybrid | -| `ColbertReranker` | Uses a colbert model to rerank search results | Vector, FTS, Hybrid | -| `OpenaiReranker`(Experimental) | Uses OpenAI's chat model to rerank search results | Vector, FTS, Hybrid | -| `VoyageAIReranker` | Uses voyageai Reranker API to rerank results | Vector, FTS, Hybrid | - - -## Using a Reranker -Using rerankers is optional for vector and FTS. However, for hybrid search, rerankers are required. 
To use a reranker, you need to create an instance of the reranker and pass it to the `rerank` method of the query builder: - -```python -import lancedb -from lancedb.embeddings import get_registry -from lancedb.pydantic import LanceModel, Vector -from lancedb.rerankers import CohereReranker - -embedder = get_registry().get("sentence-transformers").create() -db = lancedb.connect("~/.lancedb") - -class Schema(LanceModel): - text: str = embedder.SourceField() - vector: Vector(embedder.ndims()) = embedder.VectorField() - -data = [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] -tbl = db.create_table("test", data) -reranker = CohereReranker(api_key="your_api_key") - -# Run vector search with a reranker -result = tbl.search("hello").rerank(reranker).to_list() - -# Run FTS search with a reranker -result = tbl.search("hello", query_type="fts").rerank(reranker).to_list() - -# Run hybrid search with a reranker -tbl.create_fts_index("text") -result = tbl.search("hello", query_type="hybrid").rerank(reranker).to_list() -``` - -### Multi-vector reranking -Most rerankers support reranking based on multiple vectors. To rerank based on multiple vectors, you can pass a list of vectors to the `rerank` method. 
Here's an example of how to rerank based on multiple vector columns using the `CrossEncoderReranker`: - -```python -from lancedb.rerankers import CrossEncoderReranker - -reranker = CrossEncoderReranker() - -query = "hello" - -res1 = table.search(query, vector_column_name="vector").limit(3) -res2 = table.search(query, vector_column_name="text_vector").limit(3) -res3 = table.search(query, vector_column_name="meta_vector").limit(3) - -reranked = reranker.rerank_multivector([res1, res2, res3], deduplicate=True) -``` - -## Available Rerankers -LanceDB comes with the following built-in rerankers: - -- [Cohere Reranker](./cohere.md) -- [Cross Encoder Reranker](./cross_encoder.md) -- [ColBERT Reranker](./colbert.md) -- [OpenAI Reranker](./openai.md) -- [Linear Combination Reranker](./linear_combination.md) -- [Jina Reranker](./jina.md) -- [AnswerDotAI Rerankers](./answerdotai.md) -- [Reciprocal Rank Fusion Reranker](./rrf.md) -- [VoyageAI Reranker](./voyageai.md) - -## Creating Custom Rerankers - -LanceDB also allows you to create custom rerankers by extending the base `Reranker` class. The custom reranker should implement the `rerank` method that takes a list of search results and returns a reranked list of search results. This is covered in more detail in the [Creating Custom Rerankers](./custom_reranker.md) section. diff --git a/docs/src/reranking/jina.md b/docs/src/reranking/jina.md deleted file mode 100644 index b7dc8577..00000000 --- a/docs/src/reranking/jina.md +++ /dev/null @@ -1,78 +0,0 @@ -# Jina Reranker - -This reranker uses the [Jina](https://jina.ai/reranker/) API to rerank the search results. You can use this reranker by passing `JinaReranker()` to the `rerank()` method. Note that you'll either need to set the `JINA_API_KEY` environment variable or pass the `api_key` argument to use this reranker. - - -!!! 
note - Supported Query Types: Hybrid, Vector, FTS - - -```python -import os -import lancedb -from lancedb.embeddings import get_registry -from lancedb.pydantic import LanceModel, Vector -from lancedb.rerankers import JinaReranker - -os.environ['JINA_API_KEY'] = "jina_*" - - -embedder = get_registry().get("jina").create() -db = lancedb.connect("~/.lancedb") - -class Schema(LanceModel): - text: str = embedder.SourceField() - vector: Vector(embedder.ndims()) = embedder.VectorField() - -data = [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] -tbl = db.create_table("test", schema=Schema, mode="overwrite") -tbl.add(data) -reranker = JinaReranker(api_key="key") - -# Run vector search with a reranker -result = tbl.search("hello").rerank(reranker=reranker).to_list() - -# Run FTS search with a reranker -result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list() - -# Run hybrid search with a reranker -tbl.create_fts_index("text", replace=True) -result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list() - -``` - -Accepted Arguments ----------------- -| Argument | Type | Default | Description | -| --- | --- | --- | --- | -| `model_name` | `str` | `"jina-reranker-v2-base-multilingual"` | The name of the reranker model to use. You can find the list of available models in https://jina.ai/reranker. | -| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. | -| `top_n` | `str` | `None` | The number of results to return. If None, will return all results. | -| `api_key` | `str` | `None` | The API key for the Jina API. If not provided, the `JINA_API_KEY` environment variable is used. | -| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. 
| - - - -## Supported Scores for each query type -You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type: - -### Hybrid Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ❌ Not Supported | Results have vector(`_distance`) and FTS(`score`) along with Hybrid Search score(`_relevance_score`). | - -### Vector Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ✅ Supported | Results have vector(`_distance`) along with Hybrid Search score(`_relevance_score`). | - -### FTS Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ✅ Supported | Results have FTS(`score`) along with Hybrid Search score(`_relevance_score`). | diff --git a/docs/src/reranking/linear_combination.md b/docs/src/reranking/linear_combination.md deleted file mode 100644 index bb91b368..00000000 --- a/docs/src/reranking/linear_combination.md +++ /dev/null @@ -1,55 +0,0 @@ -# Linear Combination Reranker - -!!! note - This is deprecated. It is recommended to use the `RRFReranker` instead, if you want to use a score-based reranker. - -The Linear Combination Reranker combines the results of semantic and full-text search using a linear combination of the scores. The weights for the linear combination can be specified, and defaults to 0.7, i.e, 70% weight for semantic search and 30% weight for full-text search. - -!!! 
note - Supported Query Types: Hybrid - - -```python -import numpy -import lancedb -from lancedb.embeddings import get_registry -from lancedb.pydantic import LanceModel, Vector -from lancedb.rerankers import LinearCombinationReranker - -embedder = get_registry().get("sentence-transformers").create() -db = lancedb.connect("~/.lancedb") - -class Schema(LanceModel): - text: str = embedder.SourceField() - vector: Vector(embedder.ndims()) = embedder.VectorField() - -data = [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] -tbl = db.create_table("test", schema=Schema, mode="overwrite") -tbl.add(data) -reranker = LinearCombinationReranker() - -# Run hybrid search with a reranker -tbl.create_fts_index("text", replace=True) -result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list() - -``` - -Accepted Arguments ----------------- -| Argument | Type | Default | Description | -| --- | --- | --- | --- | -| `weight` | `float` | `0.7` | The weight to use for the semantic search score. The weight for the full-text search score is `1 - weight`. | -| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return all scores from the vector and FTS search along with the relevance score. | - - -## Supported Scores for each query type -You can specify the type of scores you want the reranker to return. 
The following are the supported scores for each query type: - -### Hybrid Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column | -| `all` | ✅ Supported | Results have vector(`_distance`) and FTS(`score`) along with Hybrid Search score(`_relevance_score`) | diff --git a/docs/src/reranking/openai.md b/docs/src/reranking/openai.md deleted file mode 100644 index fbdd811c..00000000 --- a/docs/src/reranking/openai.md +++ /dev/null @@ -1,73 +0,0 @@ -# OpenAI Reranker (Experimental) - -This reranker uses an OpenAI chat model to rerank the search results. You can use this reranker by passing `OpenaiReranker()` to the `rerank()` method. -!!! note - Supported Query Types: Hybrid, Vector, FTS - -!!! warning - This reranker is experimental. OpenAI doesn't have a dedicated reranking model, so we are using the chat model for reranking. - -```python -import numpy -import lancedb -from lancedb.embeddings import get_registry -from lancedb.pydantic import LanceModel, Vector -from lancedb.rerankers import OpenaiReranker - -embedder = get_registry().get("sentence-transformers").create() -db = lancedb.connect("~/.lancedb") - -class Schema(LanceModel): - text: str = embedder.SourceField() - vector: Vector(embedder.ndims()) = embedder.VectorField() - -data = [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] -tbl = db.create_table("test", schema=Schema, mode="overwrite") -tbl.add(data) -reranker = OpenaiReranker() - -# Run vector search with a reranker -result = tbl.search("hello").rerank(reranker=reranker).to_list() - -# Run FTS search with a reranker -result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list() - -# Run hybrid search with a reranker -tbl.create_fts_index("text", replace=True) -result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list() - -``` - -Accepted Arguments ----------------- -| Argument | Type | Default | Description | -| --- | 
--- | --- | --- | -| `model_name` | `str` | `"gpt-4-turbo-preview"` | The name of the reranker model to use. | -| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. | -| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. | -| `api_key` | `str` | `None` | The API key to use. If None, will use the `OPENAI_API_KEY` environment variable. | - - -## Supported Scores for each query type -You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type: - -### Hybrid Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ❌ Not Supported | Results have vector(`_distance`) and FTS(`score`) along with Hybrid Search score(`_relevance_score`). | - -### Vector Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ✅ Supported | Results have vector(`_distance`) along with Hybrid Search score(`_relevance_score`). | - -### FTS Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | -| `all` | ✅ Supported | Results have FTS(`score`) along with Hybrid Search score(`_relevance_score`). | diff --git a/docs/src/reranking/rrf.md b/docs/src/reranking/rrf.md deleted file mode 100644 index 34c4f0f2..00000000 --- a/docs/src/reranking/rrf.md +++ /dev/null @@ -1,53 +0,0 @@ -# Reciprocal Rank Fusion Reranker - -This is the default reranker used by LanceDB hybrid search. 
Reciprocal Rank Fusion (RRF) is an algorithm that evaluates the search scores by leveraging the positions/rank of the documents. The implementation follows this [paper](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf). - - -!!! note - Supported Query Types: Hybrid - - -```python -import numpy -import lancedb -from lancedb.embeddings import get_registry -from lancedb.pydantic import LanceModel, Vector -from lancedb.rerankers import RRFReranker - -embedder = get_registry().get("sentence-transformers").create() -db = lancedb.connect("~/.lancedb") - -class Schema(LanceModel): - text: str = embedder.SourceField() - vector: Vector(embedder.ndims()) = embedder.VectorField() - -data = [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] -tbl = db.create_table("test", schema=Schema, mode="overwrite") -tbl.add(data) -reranker = RRFReranker() - -# Run hybrid search with a reranker -tbl.create_fts_index("text", replace=True) -result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list() - -``` - -Accepted Arguments ----------------- -| Argument | Type | Default | Description | -| --- | --- | --- | --- | -| `K` | `int` | `60` | A constant used in the RRF formula (default is 60). Experiments indicate that k = 60 was near-optimal, but that the choice is not critical. | -| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return all scores from the vector and FTS search along with the relevance score. | - - -## Supported Scores for each query type -You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type: - -### Hybrid Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Returned rows only have the `_relevance_score` column. 
| -| `all` | ✅ Supported | Returned rows have vector(`_distance`) and FTS(`score`) along with Hybrid Search score(`_relevance_score`). | diff --git a/docs/src/reranking/voyageai.md b/docs/src/reranking/voyageai.md deleted file mode 100644 index 2091ef9e..00000000 --- a/docs/src/reranking/voyageai.md +++ /dev/null @@ -1,77 +0,0 @@ -# Voyage AI Reranker - -Voyage AI provides cutting-edge embedding and rerankers. - -This reranker uses the [VoyageAI](https://docs.voyageai.com/docs/) API to rerank the search results. You can use this reranker by passing `VoyageAIReranker()` to the `rerank()` method. Note that you'll either need to set the `VOYAGE_API_KEY` environment variable or pass the `api_key` argument to use this reranker. - - -!!! note - Supported Query Types: Hybrid, Vector, FTS - - -```python -import numpy -import lancedb -from lancedb.embeddings import get_registry -from lancedb.pydantic import LanceModel, Vector -from lancedb.rerankers import VoyageAIReranker - -embedder = get_registry().get("sentence-transformers").create() -db = lancedb.connect("~/.lancedb") - -class Schema(LanceModel): - text: str = embedder.SourceField() - vector: Vector(embedder.ndims()) = embedder.VectorField() - -data = [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] -tbl = db.create_table("test", schema=Schema, mode="overwrite") -tbl.add(data) -reranker = VoyageAIReranker(model_name="rerank-2") - -# Run vector search with a reranker -result = tbl.search("hello").rerank(reranker=reranker).to_list() - -# Run FTS search with a reranker -result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list() - -# Run hybrid search with a reranker -tbl.create_fts_index("text", replace=True) -result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list() - -``` - -Accepted Arguments ----------------- -| Argument | Type | Default | Description | -| --- | --- | --- | --- | -| `model_name` | `str` | `None` | The name of the reranker model to use. 
Available models are: rerank-2, rerank-2-lite | -| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. | -| `top_n` | `str` | `None` | The number of results to return. If None, will return all results. | -| `api_key` | `str` | `None` | The API key for the Voyage AI API. If not provided, the `VOYAGE_API_KEY` environment variable is used. | -| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type | -| `truncation` | `bool` | `None` | Whether to truncate the input to satisfy the "context length limit" on the query and the documents. | - - -## Supported Scores for each query type -You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type: - -### Hybrid Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Returns only have the `_relevance_score` column | -| `all` | ❌ Not Supported | Returns have vector(`_distance`) and FTS(`score`) along with Hybrid Search score(`_relevance_score`) | - -### Vector Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Returns only have the `_relevance_score` column | -| `all` | ✅ Supported | Returns have vector(`_distance`) along with Hybrid Search score(`_relevance_score`) | - -### FTS Search -|`return_score`| Status | Description | -| --- | --- | --- | -| `relevance` | ✅ Supported | Returns only have the `_relevance_score` column | -| `all` | ✅ Supported | Returns have FTS(`score`) along with Hybrid Search score(`_relevance_score`) | \ No newline at end of file diff --git a/docs/src/scripts/posthog.js b/docs/src/scripts/posthog.js deleted file mode 100644 index 39f61d34..00000000 --- a/docs/src/scripts/posthog.js +++ 
/dev/null @@ -1,4 +0,0 @@ -window.addEventListener("DOMContentLoaded", (event) => { - !function(t,e){var o,n,p,r;e.__SV||(window.posthog=e,e._i=[],e.init=function(i,s,a){function g(t,e){var o=e.split(".");2==o.length&&(t=t[o[0]],e=o[1]),t[e]=function(){t.push([e].concat(Array.prototype.slice.call(arguments,0)))}}(p=t.createElement("script")).type="text/javascript",p.async=!0,p.src=s.api_host+"/static/array.js",(r=t.getElementsByTagName("script")[0]).parentNode.insertBefore(p,r);var u=e;for(void 0!==a?u=e[a]=[]:a="posthog",u.people=u.people||[],u.toString=function(t){var e="posthog";return"posthog"!==a&&(e+="."+a),t||(e+=" (stub)"),e},u.people.toString=function(){return u.toString(1)+".people (stub)"},o="capture identify alias people.set people.set_once set_config register register_once unregister opt_out_capturing has_opted_out_capturing opt_in_capturing reset isFeatureEnabled onFeatureFlags getFeatureFlag getFeatureFlagPayload reloadFeatureFlags group updateEarlyAccessFeatureEnrollment getEarlyAccessFeatures getActiveMatchingSurveys getSurveys".split(" "),n=0;n - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:exhaustive_search" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:exhaustive_search_async" - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - --8<-- "nodejs/examples/search.test.ts:import" - - --8<-- "nodejs/examples/search.test.ts:search1" - ``` - - - === "vectordb (deprecated)" - - ```ts - --8<-- "docs/src/search_legacy.ts:import" - - --8<-- "docs/src/search_legacy.ts:search1" - ``` - -By default, `l2` will be used as metric type. You can specify the metric type as -`cosine` or `dot` if required. 
- -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:exhaustive_search_cosine" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:exhaustive_search_async_cosine" - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - --8<-- "nodejs/examples/search.test.ts:search2" - ``` - - === "vectordb (deprecated)" - - ```javascript - --8<-- "docs/src/search_legacy.ts:search2" - ``` - -## Approximate nearest neighbor (ANN) search - -To perform scalable vector retrieval with acceptable latencies, it's common to build a vector index. -While the exhaustive search is guaranteed to always return 100% recall, the approximate nature of -an ANN search means that using an index often involves a trade-off between recall and latency. - -See the [IVF_PQ index](./concepts/index_ivfpq.md) for a deeper description of how `IVF_PQ` -indexes work in LanceDB. - -## Binary vector - -LanceDB supports binary vectors as a data type, and has the ability to search binary vectors with hamming distance. The binary vectors are stored as uint8 arrays (every 8 bits are stored as a byte): - -!!! note - The dim of the binary vector must be a multiple of 8. A vector of dim 128 will be stored as a uint8 array of size 16. 
- -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_binary_vector.py:imports" - - --8<-- "python/python/tests/docs/test_binary_vector.py:sync_binary_vector" - ``` - - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_binary_vector.py:imports" - - --8<-- "python/python/tests/docs/test_binary_vector.py:async_binary_vector" - ``` - - === "TypeScript" - - ```ts - --8<-- "nodejs/examples/search.test.ts:import" - - --8<-- "nodejs/examples/search.test.ts:import_bin_util" - - --8<-- "nodejs/examples/search.test.ts:ingest_binary_data" - - --8<-- "nodejs/examples/search.test.ts:search_binary_data" - ``` - - -## Multivector type - -LanceDB supports multivector type, this is useful when you have multiple vectors for a single item (e.g. with ColBert and ColPali). - -You can index on a column with multivector type and search on it, the query can be single vector or multiple vectors. If the query is multiple vectors `mq`, the similarity (distance) from it to any multivector `mv` in the dataset, is defined as: - -![maxsim](assets/maxsim.png) - -where `sim` is the similarity function (e.g. cosine). - -For now, only `cosine` metric is supported for multivector search. -The vector value type can be `float16`, `float32` or `float64`. - -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_multivector.py:imports" - - --8<-- "python/python/tests/docs/test_multivector.py:sync_multivector" - ``` - - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_multivector.py:imports" - - --8<-- "python/python/tests/docs/test_multivector.py:async_multivector" - ``` - -## Search with distance range - -You can also search for vectors within a specific distance range from the query vector. This is useful when you want to find vectors that are not just the nearest neighbors, but also those that are within a certain distance. This can be done by using the `distance_range` method. 
- -=== "Python" - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_distance_range.py:imports" - - --8<-- "python/python/tests/docs/test_distance_range.py:sync_distance_range" - ``` - - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_distance_range.py:imports" - - --8<-- "python/python/tests/docs/test_distance_range.py:async_distance_range" - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - --8<-- "nodejs/examples/search.test.ts:import" - - --8<-- "nodejs/examples/search.test.ts:distance_range" - ``` - - -## Output search results - -LanceDB returns vector search results via different formats commonly used in python. -Let's create a LanceDB table with a nested schema: - -=== "Python" - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:import-datetime" - --8<-- "python/python/tests/docs/test_search.py:import-lancedb" - --8<-- "python/python/tests/docs/test_search.py:import-lancedb-pydantic" - --8<-- "python/python/tests/docs/test_search.py:import-numpy" - --8<-- "python/python/tests/docs/test_search.py:import-pydantic-base-model" - --8<-- "python/python/tests/docs/test_search.py:class-definition" - --8<-- "python/python/tests/docs/test_search.py:create_table_with_nested_schema" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:import-datetime" - --8<-- "python/python/tests/docs/test_search.py:import-lancedb" - --8<-- "python/python/tests/docs/test_search.py:import-lancedb-pydantic" - --8<-- "python/python/tests/docs/test_search.py:import-numpy" - --8<-- "python/python/tests/docs/test_search.py:import-pydantic-base-model" - --8<-- "python/python/tests/docs/test_search.py:class-definition" - --8<-- "python/python/tests/docs/test_search.py:create_table_async_with_nested_schema" - ``` - - ### As a PyArrow table - - Using `to_arrow()` we can get the results back as a pyarrow Table. 
- This result table has the same columns as the LanceDB table, with - the addition of an `_distance` column for vector search or a `score` - column for full text search. - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:search_result_as_pyarrow" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:search_result_async_as_pyarrow" - ``` - - ### As a Pandas DataFrame - - You can also get the results as a pandas dataframe. - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:search_result_as_pandas" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:search_result_async_as_pandas" - ``` - - While other formats like Arrow/Pydantic/Python dicts have a natural - way to handle nested schemas, pandas can only store nested data as a - python dict column, which makes it difficult to support nested references. - So for convenience, you can also tell LanceDB to flatten a nested schema - when creating the pandas dataframe. - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:search_result_as_pandas_flatten_true" - ``` - - If your table has a deeply nested struct, you can control how many levels - of nesting to flatten by passing in a positive integer. - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:search_result_as_pandas_flatten_1" - ``` - !!! note - `flatten` is not yet supported with our asynchronous client. - - ### As a list of Python dicts - - You can of course return results as a list of python dicts. 
- - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:search_result_as_list" - ``` - === "Async API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:search_result_async_as_list" - ``` - - ### As a list of Pydantic models - - We can add data using Pydantic models, and we can certainly - retrieve results as Pydantic models - - === "Sync API" - - ```python - --8<-- "python/python/tests/docs/test_search.py:search_result_as_pydantic" - ``` - !!! note - `to_pydantic()` is not yet supported with our asynchronous client. - - Note that in this case the extra `_distance` field is discarded since - it's not part of the LanceSchema. diff --git a/docs/src/search_legacy.ts b/docs/src/search_legacy.ts deleted file mode 100644 index 203ffc89..00000000 --- a/docs/src/search_legacy.ts +++ /dev/null @@ -1,42 +0,0 @@ -// --8<-- [start:import] -import * as lancedb from "vectordb"; -// --8<-- [end:import] -import * as fs from "fs"; - -async function setup() { - fs.rmSync("data/sample-lancedb", { recursive: true, force: true }); - const db = await lancedb.connect("data/sample-lancedb"); - - let data = []; - for (let i = 0; i < 10_000; i++) { - data.push({ - vector: Array(1536).fill(i), - id: `${i}`, - content: "", - longId: `${i}`, - }); - } - await db.createTable("my_vectors", data); -} - -async () => { - console.log("search_legacy.ts: start"); - await setup(); - - // --8<-- [start:search1] - const db = await lancedb.connect("data/sample-lancedb"); - const tbl = await db.openTable("my_vectors"); - - const results_1 = await tbl.search(Array(1536).fill(1.2)).limit(10).execute(); - // --8<-- [end:search1] - - // --8<-- [start:search2] - const results_2 = await tbl - .search(Array(1536).fill(1.2)) - .metricType(lancedb.MetricType.Cosine) - .limit(10) - .execute(); - // --8<-- [end:search2] - - console.log("search_legacy.ts: done"); -}; diff --git a/docs/src/sql.md b/docs/src/sql.md deleted file mode 100644 index 41b44491..00000000 --- 
a/docs/src/sql.md +++ /dev/null @@ -1,203 +0,0 @@ -# Filtering - -## Pre and post-filtering - -LanceDB supports filtering of query results based on metadata fields. By default, post-filtering is -performed on the top-k results returned by the vector search. However, pre-filtering is also an -option that performs the filter prior to vector search. This can be useful to narrow down -the search space of a very large dataset to reduce query latency. - -Note that both pre-filtering and post-filtering can yield false positives. For pre-filtering, if the filter is too selective, it might eliminate relevant items that the vector search would have otherwise identified as a good match. In this case, increasing `nprobes` parameter will help reduce such false positives. It is recommended to call `bypass_vector_index()` if you know that the filter is highly selective. - -Similarly, a highly selective post-filter can lead to false positives. Increasing both `nprobes` and `refine_factor` can mitigate this issue. When deciding between pre-filtering and post-filtering, pre-filtering is generally the safer choice if you're uncertain. - - - - -=== "Python" - - ```python - # Synchronous client - result = tbl.search([0.5, 0.2]).where("id = 10", prefilter=True).limit(1).to_arrow() - # Asynchronous client - result = await async_tbl.query().where("id = 10").nearest_to([0.5, 0.2]).limit(1).to_arrow() - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - --8<-- "nodejs/examples/filtering.test.ts:search" - ``` - - === "vectordb (deprecated)" - - ```ts - --8<-- "docs/src/sql_legacy.ts:search" - ``` - -!!! note - - Creating a [scalar index](guides/scalar_index.md) accelerates filtering. - -## SQL filters - -Because it's built on top of [DataFusion](https://github.com/apache/arrow-datafusion), LanceDB -embraces the utilization of standard SQL expressions as predicates for filtering operations. -SQL can be used during vector search, update, and deletion operations. 
- -LanceDB supports a growing list of SQL expressions: - -- `>`, `>=`, `<`, `<=`, `=` -- `AND`, `OR`, `NOT` -- `IS NULL`, `IS NOT NULL` -- `IS TRUE`, `IS NOT TRUE`, `IS FALSE`, `IS NOT FALSE` -- `IN` -- `LIKE`, `NOT LIKE` -- `CAST` -- `regexp_match(column, pattern)` -- [DataFusion Functions](https://arrow.apache.org/datafusion/user-guide/sql/scalar_functions.html) - -For example, the following filter string is acceptable: - -=== "Python" - - ```python - # Synchronous client - tbl.search([100, 102]).where( - "(item IN ('item 0', 'item 2')) AND (id > 10)" - ).to_arrow() - # Asynchronous client - await ( - async_tbl.query() - .where("(item IN ('item 0', 'item 2')) AND (id > 10)") - .nearest_to([100, 102]) - .to_arrow() - ) - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - --8<-- "nodejs/examples/filtering.test.ts:vec_search" - ``` - - === "vectordb (deprecated)" - - ```ts - --8<-- "docs/src/sql_legacy.ts:vec_search" - ``` - -If your column name contains special characters, upper-case characters, or is a [SQL Keyword](https://docs.rs/sqlparser/latest/sqlparser/keywords/index.html), -you can use backtick (`` ` ``) to escape it. For nested fields, each segment of the -path must be wrapped in backticks. - -=== "SQL" - - ```sql - `CUBE` = 10 AND `UpperCaseName` = '3' AND `column name with space` IS NOT NULL - AND `nested with space`.`inner with space` < 2 - ``` - -!!!warning "Field names containing periods (`.`) are not supported." - -Literals for dates, timestamps, and decimals can be written by writing the string -value after the type name. For example: - -=== "SQL" - - ```sql - date_col = date '2021-01-01' - and timestamp_col = timestamp '2021-01-01 00:00:00' - and decimal_col = decimal(8,3) '1.000' - ``` - -For timestamp columns, the precision can be specified as a number in the type -parameter. Microsecond precision (6) is the default. 
- -| SQL | Time unit | -| -------------- | ------------ | -| `timestamp(0)` | Seconds | -| `timestamp(3)` | Milliseconds | -| `timestamp(6)` | Microseconds | -| `timestamp(9)` | Nanoseconds | - -LanceDB internally stores data in [Apache Arrow](https://arrow.apache.org/) format. -The mapping from SQL types to Arrow types is: - -| SQL type | Arrow type | -| --------------------------------------------------------- | ------------------ | -| `boolean` | `Boolean` | -| `tinyint` / `tinyint unsigned` | `Int8` / `UInt8` | -| `smallint` / `smallint unsigned` | `Int16` / `UInt16` | -| `int` or `integer` / `int unsigned` or `integer unsigned` | `Int32` / `UInt32` | -| `bigint` / `bigint unsigned` | `Int64` / `UInt64` | -| `float` | `Float32` | -| `double` | `Float64` | -| `decimal(precision, scale)` | `Decimal128` | -| `date` | `Date32` | -| `timestamp` | `Timestamp` [^1] | -| `string` | `Utf8` | -| `binary` | `Binary` | - -[^1]: See precision mapping in previous table. - -## Filtering without Vector Search - -You can also filter your data without search: - -=== "Python" - - ```python - # Synchronous client - tbl.search().where("id = 10").limit(10).to_arrow() - # Asynchronous client - await async_tbl.query().where("id = 10").limit(10).to_arrow() - ``` - -=== "TypeScript" - - === "@lancedb/lancedb" - - ```ts - --8<-- "nodejs/examples/filtering.test.ts:sql_search" - ``` - - === "vectordb (deprecated)" - - ```ts - --8<---- "docs/src/sql_legacy.ts:sql_search" - ``` - -!!!warning "If your table is large, this could potentially return a very large amount of data. Please be sure to use a `limit` clause unless you're sure you want to return the whole result set." 
diff --git a/docs/src/sql_legacy.ts b/docs/src/sql_legacy.ts deleted file mode 100644 index cf079ebe..00000000 --- a/docs/src/sql_legacy.ts +++ /dev/null @@ -1,39 +0,0 @@ -import * as vectordb from "vectordb"; - -(async () => { - console.log("sql_legacy.ts: start"); - const db = await vectordb.connect("data/sample-lancedb"); - - let data = []; - for (let i = 0; i < 10_000; i++) { - data.push({ - vector: Array(1536).fill(i), - id: i, - item: `item ${i}`, - strId: `${i}`, - }); - } - const tbl = await db.createTable("myVectors", data); - - // --8<-- [start:search] - let result = await tbl - .search(Array(1536).fill(0.5)) - .limit(1) - .filter("id = 10") - .prefilter(true) - .execute(); - // --8<-- [end:search] - - // --8<-- [start:vec_search] - await tbl - .search(Array(1536).fill(0)) - .where("(item IN ('item 0', 'item 2')) AND (id > 10)") - .execute(); - // --8<-- [end:vec_search] - - // --8<-- [start:sql_search] - await tbl.filter("id = 10").limit(10).execute(); - // --8<-- [end:sql_search] - - console.log("sql_legacy.ts: done"); -})(); diff --git a/docs/src/studies/overview.md b/docs/src/studies/overview.md deleted file mode 100644 index 917f39c3..00000000 --- a/docs/src/studies/overview.md +++ /dev/null @@ -1,4 +0,0 @@ -This is a list of benchmarks and reports we've worked on at LanceDB. Some of these are continuously updated, while others are one-off reports. 
- -- [Improve retrievers with hybrid search and reranking](https://blog.lancedb.com/hybrid-search-and-reranking-report/) - diff --git a/docs/src/styles/extra.css b/docs/src/styles/extra.css index 913c2e35..ec5a6d81 100644 --- a/docs/src/styles/extra.css +++ b/docs/src/styles/extra.css @@ -82,3 +82,20 @@ margin-left: calc(var(--permalink-size) * -1 - var(--permalink-spacing)) !important; } } + +/* Header gradient (only header area) */ +.md-header { + background: linear-gradient(90deg, #3B2E58 0%, #F0B7C1 45%, #E55A2B 100%); + box-shadow: inset 0 1px 0 rgba(255,255,255,0.08), 0 1px 0 rgba(0,0,0,0.08); +} + +/* Same colors as header for tabs (that hold the text) */ +.md-tabs { + background: linear-gradient(90deg, #3B2E58 0%, #F0B7C1 45%, #E55A2B 100%); +} + +/* Dark scheme variant */ +[data-md-color-scheme="slate"] .md-header, +[data-md-color-scheme="slate"] .md-tabs { + background: linear-gradient(90deg, #3B2E58 0%, #F0B7C1 45%, #E55A2B 100%); +} diff --git a/docs/src/styles/fonts/IBMPlexMono-Italic.woff2 b/docs/src/styles/fonts/IBMPlexMono-Italic.woff2 deleted file mode 100644 index f5f64e84..00000000 Binary files a/docs/src/styles/fonts/IBMPlexMono-Italic.woff2 and /dev/null differ diff --git a/docs/src/styles/fonts/IBMPlexMono-Regular.woff2 b/docs/src/styles/fonts/IBMPlexMono-Regular.woff2 deleted file mode 100644 index afc8ede9..00000000 Binary files a/docs/src/styles/fonts/IBMPlexMono-Regular.woff2 and /dev/null differ diff --git a/docs/src/styles/fonts/IBMPlexSans-Italic.woff2 b/docs/src/styles/fonts/IBMPlexSans-Italic.woff2 deleted file mode 100644 index 423d3015..00000000 Binary files a/docs/src/styles/fonts/IBMPlexSans-Italic.woff2 and /dev/null differ diff --git a/docs/src/styles/fonts/IBMPlexSans-Regular.woff2 b/docs/src/styles/fonts/IBMPlexSans-Regular.woff2 deleted file mode 100644 index c94375f4..00000000 Binary files a/docs/src/styles/fonts/IBMPlexSans-Regular.woff2 and /dev/null differ diff --git a/docs/src/styles/fonts/IBMPlexSans-SemiBold.woff2 
b/docs/src/styles/fonts/IBMPlexSans-SemiBold.woff2 deleted file mode 100644 index fdde0313..00000000 Binary files a/docs/src/styles/fonts/IBMPlexSans-SemiBold.woff2 and /dev/null differ diff --git a/docs/src/styles/fonts/Inter_18pt-Black.ttf b/docs/src/styles/fonts/Inter_18pt-Black.ttf new file mode 100644 index 00000000..89673de1 Binary files /dev/null and b/docs/src/styles/fonts/Inter_18pt-Black.ttf differ diff --git a/docs/src/styles/fonts/Inter_18pt-Bold.ttf b/docs/src/styles/fonts/Inter_18pt-Bold.ttf new file mode 100644 index 00000000..57704d10 Binary files /dev/null and b/docs/src/styles/fonts/Inter_18pt-Bold.ttf differ diff --git a/docs/src/styles/fonts/Inter_18pt-Italic.ttf b/docs/src/styles/fonts/Inter_18pt-Italic.ttf new file mode 100644 index 00000000..14d3595b Binary files /dev/null and b/docs/src/styles/fonts/Inter_18pt-Italic.ttf differ diff --git a/docs/src/styles/global.css b/docs/src/styles/global.css index feca50ef..823b94fa 100644 --- a/docs/src/styles/global.css +++ b/docs/src/styles/global.css @@ -1,35 +1,37 @@ :root { - --md-primary-fg-color: #625eff; + --md-primary-fg-color: #F46F52; + --md-accent-fg-color: #F46F52; --md-text-font: "IBMPlexSans", ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"; --md-code-font: "IBMPlexMono", ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; } [data-md-color-scheme="slate"] { --md-hue: 210; + --md-primary-fg-color: #E35B3F; + --md-accent-fg-color: #E35B3F; } @font-face { - font-family: "IBMPlexSans"; - src: local("IBMPlexSans"), url("fonts/IBMPlexSans-Regular.woff2"); + font-family: "Inter"; + src: local("Inter_18pt"), url("Inter_18pt-Black.ttf"); } @font-face { - font-family: "IBMPlexSans"; - src: local("IBMPlexSans-Italic"), url("fonts/IBMPlexSans-Italic.woff2"); + font-family: "Inter"; + src: 
local("Inter_18pt-Italic"), url("Inter_18pt-Italic.ttf"); font-style: italic; } @font-face { - font-family: "IBMPlexSans"; - src: local("IBMPlexSans-Bold"), url("fonts/IBMPlexSans-SemiBold.woff2"); + font-family: "Inter"; + src: local("Inter_18pt-Bold"), url("Inter_18pt-Bold.ttf"); font-weight: bold; } -@font-face { - font-family: "IBMPlexMono"; +/* @font-face { + font-family: "Inter"; src: local("IBM Plex Mono"), local("IBM-Plex-Mono"), local("IBMPlexMono"), local("IBM-Plex-Mono-Regular"), local("IBMPlexMono-Regular"), url("fonts/IBMPlexMono-Regular.woff2"); } @font-face { - font-family: "IBMPlexMono"; + font-family: "Inter"; src: local("IBM Plex Mono Italic"), local("IBM-Plex-Mono-Italic"), local("IBMPlexMono-Italic"), url("fonts/IBMPlexMono-Italic.woff2"); font-style: italic; -} - +} */ diff --git a/docs/src/troubleshooting.md b/docs/src/troubleshooting.md deleted file mode 100644 index 38f3a4e1..00000000 --- a/docs/src/troubleshooting.md +++ /dev/null @@ -1,43 +0,0 @@ -## Getting help - -The following sections provide various diagnostics and troubleshooting tips for LanceDB. -These can help you provide additional information when asking questions or making -error reports. - -For trouble shooting, the best place to ask is in our Discord, under the relevant -language channel. By asking in the language-specific channel, it makes it more -likely that someone who knows the answer will see your question. - -## Common issues - -* Multiprocessing with `fork` is not supported. You should use `spawn` instead. - -## Enabling logging - -To provide more information, especially for LanceDB Cloud related issues, enable -debug logging. You can set the `LANCEDB_LOG` environment variable: - -```shell -export LANCEDB_LOG=debug -``` - -You can turn off colors and formatting in the logs by setting - -```shell -export LANCEDB_LOG_STYLE=never -``` - -## Explaining query plans - -If you have slow queries or unexpected query results, it can be helpful to -print the resolved query plan. 
You can use the `explain_plan` method to do this: - -* Python Sync: [LanceQueryBuilder.explain_plan][lancedb.query.LanceQueryBuilder.explain_plan] -* Python Async: [AsyncQueryBase.explain_plan][lancedb.query.AsyncQueryBase.explain_plan] -* Node @lancedb/lancedb: [LanceQueryBuilder.explainPlan](/lancedb/js/classes/QueryBase/#explainplan) - -To understand how a query was actually executed—including metrics like execution time, number of rows processed, I/O stats, and more—use the analyze_plan method. This executes the query and returns a physical execution plan annotated with runtime metrics, making it especially helpful for performance tuning and debugging. - -* Python Sync: [LanceQueryBuilder.analyze_plan][lancedb.query.LanceQueryBuilder.analyze_plan] -* Python Async: [AsyncQueryBase.analyze_plan][lancedb.query.AsyncQueryBase.analyze_plan] -* Node @lancedb/lancedb: [LanceQueryBuilder.analyzePlan](/lancedb/js/classes/QueryBase/#analyzePlan) diff --git a/docs/test/md_testing.py b/docs/test/md_testing.py deleted file mode 100755 index 0bd38076..00000000 --- a/docs/test/md_testing.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python3 - -import glob -from typing import Iterator, List -from pathlib import Path - -glob_string = "../src/**/*.md" -excluded_globs = [ - "../src/fts.md", - "../src/embedding.md", - "../src/examples/*.md", - "../src/integrations/*.md", - "../src/guides/tables.md", - "../src/guides/tables/merge_insert.md", - "../src/python/duckdb.md", - "../src/python/pandas_and_pyarrow.md", - "../src/python/polars_arrow.md", - "../src/python/pydantic.md", - "../src/embeddings/*.md", - "../src/concepts/*.md", - "../src/ann_indexes.md", - "../src/basic.md", - "../src/search.md", - "../src/hybrid_search/hybrid_search.md", - "../src/reranking/*.md", - "../src/guides/tuning_retrievers/*.md", - "../src/embeddings/available_embedding_models/text_embedding_functions/*.md", - "../src/embeddings/available_embedding_models/multimodal_embedding_functions/*.md", - 
"../src/rag/*.md", - "../src/rag/advanced_techniques/*.md", - "../src/guides/scalar_index.md", - "../src/guides/storage.md", - "../src/search.md", - "../src/guides/sql_querying.md", -] - -python_prefix = "py" -python_file = ".py" -python_folder = "python" - -files = glob.glob(glob_string, recursive=True) -excluded_files = [ - f - for excluded_glob in excluded_globs - for f in glob.glob(excluded_glob, recursive=True) -] - - -def yield_lines(lines: Iterator[str], prefix: str, suffix: str): - in_code_block = False - # Python code has strict indentation - strip_length = 0 - skip_test = False - for line in lines: - if "skip-test" in line: - skip_test = True - if line.strip().startswith(prefix + python_prefix): - in_code_block = True - strip_length = len(line) - len(line.lstrip()) - elif in_code_block and line.strip().startswith(suffix): - in_code_block = False - if not skip_test: - yield "\n" - skip_test = False - elif in_code_block: - if not skip_test: - yield line[strip_length:] - - -def wrap_async(lines: List[str]) -> List[str]: - # Indent all the lines - lines = [" " + line for line in lines] - # Put all lines in `async def main():` - lines = ["async def main():\n"] + lines - # Put `import asyncio\n asyncio.run(main())` at the end - lines = lines + ["\n", "import asyncio\n", "asyncio.run(main())\n"] - return lines - - -for file in filter(lambda file: file not in excluded_files, files): - with open(file, "r") as f: - lines = list(yield_lines(iter(f), "```", "```")) - - if len(lines) > 0: - if any("await" in line for line in lines): - lines = wrap_async(lines) - - print(lines) - out_path = ( - Path(python_folder) - / Path(file).name.strip(".md") - / (Path(file).name.strip(".md") + python_file) - ) - print(out_path) - out_path.parent.mkdir(exist_ok=True, parents=True) - with open(out_path, "w") as out: - out.writelines(lines) diff --git a/docs/test/requirements.txt b/docs/test/requirements.txt deleted file mode 100644 index 3e7a8611..00000000 --- 
a/docs/test/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ --e ../../python -numpy -pandas -pylance -duckdb -tantivy==0.20.1 ---extra-index-url https://download.pytorch.org/whl/cpu -torch -polars>=0.19, <=1.3.0 -datafusion diff --git a/python/python/lancedb/__init__.py b/python/python/lancedb/__init__.py index a29f54a8..de68e085 100644 --- a/python/python/lancedb/__init__.py +++ b/python/python/lancedb/__init__.py @@ -35,7 +35,7 @@ def connect( session: Optional[Session] = None, **kwargs: Any, ) -> DBConnection: - """Connect to a LanceDB database. + """Connect to a LanceDB database. YAY! Parameters ----------