From 6008a8257bdd7df0a5151a98ed1953371feaf1cb Mon Sep 17 00:00:00 2001 From: Matt Basta Date: Tue, 13 Aug 2024 13:06:15 -0400 Subject: [PATCH 01/34] fix: remove native.d.ts from .npmignore (#1531) This removes the type definitions for a number of important TypeScript interfaces from `.npmignore` so that the package is not incorrectly typed `any` in a number of places. --- Presently the `opts` argument to `lancedb.connect` is typed `any`, even though it shouldn't be. image Clicking into the type definitions for the published package, it has the correct type signature: image However, `ConnectionOptions` is imported from `native.js` (along with a number of other imports a bit further down): image This is not otherwise an issue, except that the type definitions for `native.js` are not included in the published package: image I haven't compiled the Rust code and run the build script, but I strongly suspect that disincluding the type definitions in `.npmignore` is ultimately the root cause here. --- nodejs/.npmignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nodejs/.npmignore b/nodejs/.npmignore index ebe93e01..2d0d1d82 100644 --- a/nodejs/.npmignore +++ b/nodejs/.npmignore @@ -20,6 +20,5 @@ Cargo.toml biome.json build.rs jest.config.js -native.d.ts tsconfig.json -typedoc.json \ No newline at end of file +typedoc.json From b3daa25f46b5e954e4fbb540976a89ebbdef8adb Mon Sep 17 00:00:00 2001 From: Ryan Green Date: Tue, 13 Aug 2024 16:05:42 -0230 Subject: [PATCH 02/34] feat: allow new scalar index types to be created in remote table (#1538) --- python/python/lancedb/remote/table.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py index 22497359..3d1669ab 100644 --- a/python/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -15,7 +15,7 @@ import logging import uuid from concurrent.futures import Future from functools import cached_property -from typing import Dict, Iterable, Optional, Union +from typing import Dict, Iterable, Optional, Union, Literal import pyarrow as pa from lance import json_to_schema @@ -97,6 +97,7 @@ class RemoteTable(Table): def create_scalar_index( self, column: str, + index_type: Literal["BTREE", "BITMAP", "LABEL_LIST", "scalar"] = "scalar", ): """Creates a scalar index Parameters @@ -104,8 +105,10 @@ class RemoteTable(Table): column : str The column to be indexed. Must be a boolean, integer, float, or string column. + index_type : str + The index type of the scalar index. 
Must be "scalar" (BTREE), + "BTREE", "BITMAP", or "LABEL_LIST" """ - index_type = "scalar" data = { "column": column, From 501817cfac6bf0434065d3433854886bd34e97ea Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Wed, 14 Aug 2024 23:44:31 +0800 Subject: [PATCH 03/34] chore: bump the required python version to 3.9 (#1541) Signed-off-by: BubbleCal --- python/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 9f48ad91..5e22fd47 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -18,7 +18,7 @@ description = "lancedb" authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }] license = { file = "LICENSE" } readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.9" keywords = [ "data-format", "data-science", From d2caa5e20295a7e74b09d3a32ef112eb8b23e995 Mon Sep 17 00:00:00 2001 From: Gagan Bhullar Date: Wed, 14 Aug 2024 09:53:53 -0600 Subject: [PATCH 04/34] feat(nodejs): add delete unverified (#1530) PR fixes part of #1527 --- nodejs/__test__/table.test.ts | 15 +++++++++++++++ nodejs/lancedb/table.ts | 6 +++++- nodejs/src/table.rs | 8 ++++++-- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index 7cc9556a..34773094 100644 --- a/nodejs/__test__/table.test.ts +++ b/nodejs/__test__/table.test.ts @@ -726,6 +726,21 @@ describe("when optimizing a dataset", () => { expect(stats.prune.bytesRemoved).toBeGreaterThan(0); expect(stats.prune.oldVersionsRemoved).toBe(3); }); + + it("delete unverified", async () => { + const version = await table.version(); + const versionFile = `${tmpDir.name}/${table.name}.lance/_versions/${version - 1}.manifest`; + fs.rmSync(versionFile); + + let stats = await table.optimize({ deleteUnverified: false }); + expect(stats.prune.oldVersionsRemoved).toBe(0); + + stats = await table.optimize({ + cleanupOlderThan: new Date(), + deleteUnverified: true, + }); + expect(stats.prune.oldVersionsRemoved).toBeGreaterThan(1); + }); }); describe.each([arrow13, arrow14, arrow15, arrow16, arrow17])( diff --git a/nodejs/lancedb/table.ts b/nodejs/lancedb/table.ts index 83758b39..11a76e9c 100644 --- a/nodejs/lancedb/table.ts +++ b/nodejs/lancedb/table.ts @@ -84,6 +84,7 @@ export interface OptimizeOptions { * tbl.cleanupOlderVersions(new Date()); */ cleanupOlderThan: Date; + deleteUnverified: boolean; } /** @@ -671,7 +672,10 @@ export class LocalTable extends Table { cleanupOlderThanMs = new Date().getTime() - options.cleanupOlderThan.getTime(); } - return await this.inner.optimize(cleanupOlderThanMs); + return await this.inner.optimize( + cleanupOlderThanMs, + options?.deleteUnverified, + ); } async listIndices(): Promise { diff --git a/nodejs/src/table.rs b/nodejs/src/table.rs index 90925d1d..e0a17565 100644 --- a/nodejs/src/table.rs +++ b/nodejs/src/table.rs @@ -265,7 +265,11 @@ impl Table { } #[napi(catch_unwind)] - pub async fn optimize(&self, older_than_ms: Option) -> napi::Result { + pub async fn optimize( + &self, + older_than_ms: Option, + delete_unverified: Option, + ) -> napi::Result { let inner = self.inner_ref()?; let older_than = if let Some(ms) = older_than_ms { @@ -292,7 +296,7 @@ impl Table { let prune_stats = inner .optimize(OptimizeAction::Prune { older_than, - delete_unverified: None, + delete_unverified, error_if_tagged_old_versions: None, }) .await From b624fc59eb3257131d6e268fe5a93c50e9b8349c Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Thu, 15 Aug 2024 11:35:16 +0800 Subject: 
[PATCH 05/34] docs: add `create_fts_index` doc in Python API Reference (#1533) resolve #1313 --------- Signed-off-by: BubbleCal --- python/python/lancedb/table.py | 69 ++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 0edb22ed..bf069d2c 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -403,6 +403,47 @@ class Table(ABC): """ raise NotImplementedError + def create_fts_index( + self, + field_names: Union[str, List[str]], + ordering_field_names: Union[str, List[str]] = None, + *, + replace: bool = False, + writer_heap_size: Optional[int] = 1024 * 1024 * 1024, + tokenizer_name: str = "default", + use_tantivy: bool = True, + ): + """Create a full-text search index on the table. + + Warning - this API is highly experimental and is highly likely to change + in the future. + + Parameters + ---------- + field_names: str or list of str + The name(s) of the field to index. + can be only str if use_tantivy=True for now. + replace: bool, default False + If True, replace the existing index if it exists. Note that this is + not yet an atomic operation; the index will be temporarily + unavailable while the new index is being created. + writer_heap_size: int, default 1GB + Only available with use_tantivy=True + ordering_field_names: + A list of unsigned type fields to index to optionally order + results on at search time. + only available with use_tantivy=True + tokenizer_name: str, default "default" + The tokenizer to use for the index. Can be "raw", "default" or the 2 letter + language code followed by "_stem". So for english it would be "en_stem". + For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html + only available with use_tantivy=True for now + use_tantivy: bool, default True + If True, use the legacy full-text search implementation based on tantivy. + If False, use the new full-text search implementation based on lance-index. + """ + raise NotImplementedError + @abstractmethod def add( self, @@ -1201,34 +1242,6 @@ class LanceTable(Table): tokenizer_name: str = "default", use_tantivy: bool = True, ): - """Create a full-text search index on the table. - - Warning - this API is highly experimental and is highly likely to change - in the future. - - Parameters - ---------- - field_names: str or list of str - The name(s) of the field to index. - can be only str if use_tantivy=True for now. - replace: bool, default False - If True, replace the existing index if it exists. Note that this is - not yet an atomic operation; the index will be temporarily - unavailable while the new index is being created. - writer_heap_size: int, default 1GB - ordering_field_names: - A list of unsigned type fields to index to optionally order - results on at search time. - only available with use_tantivy=True - tokenizer_name: str, default "default" - The tokenizer to use for the index. Can be "raw", "default" or the 2 letter - language code followed by "_stem". So for english it would be "en_stem". - For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html - only available with use_tantivy=True for now - use_tantivy: bool, default False - If True, use the legacy full-text search implementation based on tantivy. - If False, use the new full-text search implementation based on lance-index. 
- """ if not use_tantivy: if not isinstance(field_names, str): raise ValueError("field_names must be a string when use_tantivy=False") From 20faa4424bd3cef9b01154d7cbe6b40d285f0d4b Mon Sep 17 00:00:00 2001 From: Gagan Bhullar Date: Thu, 15 Aug 2024 10:01:32 -0600 Subject: [PATCH 06/34] feat(python): add delete unverified parameter (#1542) PR fixes #1527 --- python/python/lancedb/table.py | 12 ++++++++++-- python/python/tests/test_table.py | 23 +++++++++++++++++++++++ python/src/table.rs | 3 ++- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index bf069d2c..18f6f90b 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -2451,7 +2451,10 @@ class AsyncTable: await self._inner.restore() async def optimize( - self, *, cleanup_older_than: Optional[timedelta] = None + self, + *, + cleanup_older_than: Optional[timedelta] = None, + delete_unverified: bool = False, ) -> OptimizeStats: """ Optimize the on-disk data and indices for better performance. @@ -2470,6 +2473,11 @@ class AsyncTable: All files belonging to versions older than this will be removed. Set to 0 days to remove all versions except the latest. The latest version is never removed. + delete_unverified: bool, default False + Files leftover from a failed transaction may appear to be part of an + in-progress operation (e.g. appending new data) and these files will not + be deleted unless they are at least 7 days old. If delete_unverified is True + then these files will be deleted regardless of their age. Experimental API ---------------- @@ -2491,7 +2499,7 @@ class AsyncTable: """ if cleanup_older_than is not None: cleanup_older_than = round(cleanup_older_than.total_seconds() * 1000) - return await self._inner.optimize(cleanup_older_than) + return await self._inner.optimize(cleanup_older_than, delete_unverified) async def list_indices(self) -> IndexConfig: """ diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py index 832fa76e..0d6beeb4 100644 --- a/python/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -8,6 +8,7 @@ from pathlib import Path from time import sleep from typing import List from unittest.mock import PropertyMock, patch +import os import lance import lancedb @@ -1052,3 +1053,25 @@ async def test_optimize(db_async: AsyncConnection): assert stats.prune.old_versions_removed == 3 assert await table.query().to_arrow() == pa.table({"x": [[1], [2]]}) + + +@pytest.mark.asyncio +async def test_optimize_delete_unverified(db_async: AsyncConnection, tmp_path): + table = await db_async.create_table( + "test", + data=[{"x": [1]}], + ) + await table.add( + data=[ + {"x": [2]}, + ], + ) + version = await table.version() + path = tmp_path / "test.lance" / "_versions" / f"{version - 1}.manifest" + os.remove(path) + stats = await table.optimize(delete_unverified=False) + assert stats.prune.old_versions_removed == 0 + stats = await table.optimize( + cleanup_older_than=timedelta(seconds=0), delete_unverified=True + ) + assert stats.prune.old_versions_removed == 2 diff --git a/python/src/table.rs b/python/src/table.rs index 497b0ca2..346c14d2 100644 --- a/python/src/table.rs +++ b/python/src/table.rs @@ -248,6 +248,7 @@ impl Table { pub fn optimize( self_: PyRef<'_, Self>, cleanup_since_ms: Option, + delete_unverified: Option, ) -> PyResult> { let inner = self_.inner_ref()?.clone(); let older_than = if let Some(ms) = cleanup_since_ms { @@ -275,7 +276,7 @@ impl Table { let prune_stats = 
inner .optimize(OptimizeAction::Prune { older_than, - delete_unverified: None, + delete_unverified, error_if_tagged_old_versions: None, }) .await From 0fa50775d6f4533ca8bc2b45ccd77deefa75e2fc Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Fri, 16 Aug 2024 12:01:05 +0800 Subject: [PATCH 07/34] feat: support to query/index FTS on RemoteTable/AsyncTable (#1537) Signed-off-by: BubbleCal --- python/python/lancedb/_lancedb.pyi | 1 + python/python/lancedb/db.py | 8 +-- python/python/lancedb/index.py | 12 ++++ python/python/lancedb/query.py | 82 +++++++++++++++++---------- python/python/lancedb/remote/db.py | 1 + python/python/lancedb/remote/table.py | 63 +++++++++++++------- python/python/lancedb/table.py | 62 ++++++++++---------- python/python/tests/test_fts.py | 74 +++++++++++++++++++----- python/python/tests/test_s3.py | 3 +- python/python/tests/test_table.py | 2 +- python/src/query.rs | 23 +++++++- 11 files changed, 229 insertions(+), 102 deletions(-) diff --git a/python/python/lancedb/_lancedb.pyi b/python/python/lancedb/_lancedb.pyi index a143e308..c4642637 100644 --- a/python/python/lancedb/_lancedb.pyi +++ b/python/python/lancedb/_lancedb.pyi @@ -74,6 +74,7 @@ class Query: def select(self, columns: Tuple[str, str]): ... def limit(self, limit: int): ... def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ... + def nearest_to_text(self, query: dict) -> Query: ... async def execute(self, max_batch_legnth: Optional[int]) -> RecordBatchStream: ... class VectorQuery: diff --git a/python/python/lancedb/db.py b/python/python/lancedb/db.py index 50046080..1c77b299 100644 --- a/python/python/lancedb/db.py +++ b/python/python/lancedb/db.py @@ -276,6 +276,10 @@ class DBConnection(EnforceOverrides): """ raise NotImplementedError + @property + def uri(self) -> str: + return self._uri + class LanceDBConnection(DBConnection): """ @@ -340,10 +344,6 @@ class LanceDBConnection(DBConnection): val += ")" return val - @property - def uri(self) -> str: - return self._uri - async def _async_get_table_names(self, start_after: Optional[str], limit: int): conn = AsyncConnection(await lancedb_connect(self.uri)) return await conn.table_names(start_after=start_after, limit=limit) diff --git a/python/python/lancedb/index.py b/python/python/lancedb/index.py index f9dd7900..2e0c7b95 100644 --- a/python/python/lancedb/index.py +++ b/python/python/lancedb/index.py @@ -70,6 +70,18 @@ class LabelList: self._inner = LanceDbIndex.label_list() +class FTS: + """Describe a FTS index configuration. + + `FTS` is a full-text search index that can be used on `String` columns + + For example, it works with `title`, `description`, `content`, etc. 
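+
+    A minimal usage sketch, mirroring the async test added in this change::
+
+        await table.create_index("text", config=FTS())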
+ """ + + def __init__(self): + self._inner = LanceDbIndex.fts() + + class IvfPq: """Describes an IVF PQ Index diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py index 8564575f..874a606a 100644 --- a/python/python/lancedb/query.py +++ b/python/python/lancedb/query.py @@ -15,7 +15,6 @@ from __future__ import annotations from abc import ABC, abstractmethod from concurrent.futures import ThreadPoolExecutor -from pathlib import Path from typing import ( TYPE_CHECKING, Dict, @@ -38,7 +37,7 @@ from .arrow import AsyncRecordBatchReader from .common import VEC from .rerankers.base import Reranker from .rerankers.linear_combination import LinearCombinationReranker -from .util import fs_from_uri, safe_import_pandas +from .util import safe_import_pandas if TYPE_CHECKING: import PIL @@ -174,7 +173,9 @@ class LanceQueryBuilder(ABC): if isinstance(query, str): # fts return LanceFtsQueryBuilder( - table, query, ordering_field_name=ordering_field_name + table, + query, + ordering_field_name=ordering_field_name, ) if isinstance(query, list): @@ -681,6 +682,8 @@ class LanceFtsQueryBuilder(LanceQueryBuilder): self._phrase_query = False self.ordering_field_name = ordering_field_name self._reranker = None + if isinstance(fts_columns, str): + fts_columns = [fts_columns] self._fts_columns = fts_columns def phrase_query(self, phrase_query: bool = True) -> LanceFtsQueryBuilder: @@ -701,8 +704,8 @@ class LanceFtsQueryBuilder(LanceQueryBuilder): return self def to_arrow(self) -> pa.Table: - tantivy_index_path = self._table._get_fts_index_path() - if Path(tantivy_index_path).exists(): + path, fs, exist = self._table._get_fts_index_path() + if exist: return self.tantivy_to_arrow() query = self._query @@ -711,23 +714,20 @@ class LanceFtsQueryBuilder(LanceQueryBuilder): "Phrase query is not yet supported in Lance FTS. " "Use tantivy-based index instead for now." ) - if self._reranker: - raise NotImplementedError( - "Reranking is not yet supported in Lance FTS. " - "Use tantivy-based index instead for now." - ) - ds = self._table.to_lance() - return ds.to_table( + query = Query( columns=self._columns, filter=self._where, - limit=self._limit, + k=self._limit, prefilter=self._prefilter, with_row_id=self._with_row_id, full_text_query={ "query": query, "columns": self._fts_columns, }, + vector=[], ) + results = self._table._execute_query(query) + return results.read_all() def tantivy_to_arrow(self) -> pa.Table: try: @@ -740,24 +740,24 @@ class LanceFtsQueryBuilder(LanceQueryBuilder): from .fts import search_index # get the index path - index_path = self._table._get_fts_index_path() - - # Check that we are on local filesystem - fs, _path = fs_from_uri(index_path) - if not isinstance(fs, pa_fs.LocalFileSystem): - raise NotImplementedError( - "Full-text search is only supported on the local filesystem" - ) + path, fs, exist = self._table._get_fts_index_path() # check if the index exist - if not Path(index_path).exists(): + if not exist: raise FileNotFoundError( "Fts index does not exist. " "Please first call table.create_fts_index(['']) to " "create the fts index." 
) + + # Check that we are on local filesystem + if not isinstance(fs, pa_fs.LocalFileSystem): + raise NotImplementedError( + "Tantivy-based full text search " + "is only supported on the local filesystem" + ) # open the index - index = tantivy.Index.open(index_path) + index = tantivy.Index.open(path) # get the scores and doc ids query = self._query if self._phrase_query: @@ -851,7 +851,6 @@ class LanceHybridQueryBuilder(LanceQueryBuilder): def __init__(self, table: "Table", query: str, vector_column: str): super().__init__(table) - self._validate_fts_index() vector_query, fts_query = self._validate_query(query) self._fts_query = LanceFtsQueryBuilder(table, fts_query) vector_query = self._query_to_vector(table, vector_query, vector_column) @@ -859,12 +858,6 @@ class LanceHybridQueryBuilder(LanceQueryBuilder): self._norm = "score" self._reranker = LinearCombinationReranker(weight=0.7, fill=1.0) - def _validate_fts_index(self): - if self._table._get_fts_index_path() is None: - raise ValueError( - "Please create a full-text search index " "to perform hybrid search." - ) - def _validate_query(self, query): # Temp hack to support vectorized queries for hybrid search if isinstance(query, str): @@ -1354,6 +1347,35 @@ class AsyncQuery(AsyncQueryBase): self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector)) ) + def nearest_to_text( + self, query: str, columns: Union[str, List[str]] = None + ) -> AsyncQuery: + """ + Find the documents that are most relevant to the given text query. + + This method will perform a full text search on the table and return + the most relevant documents. The relevance is determined by BM25. + + The columns to search must be with native FTS index + (Tantivy-based can't work with this method). + + By default, all indexed columns are searched, + now only one column can be searched at a time. + + Parameters + ---------- + query: str + The text query to search for. + columns: str or list of str, default None + The columns to search in. If None, all indexed columns are searched. + For now only one column can be searched at a time. 
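+
+        Examples
+        --------
+        A minimal sketch, assuming an FTS index already exists on the
+        ``text`` column (this mirrors the test added in this change)::
+
+            results = await table.query().nearest_to_text("puppy").limit(5).to_list()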
+ """ + if isinstance(columns, str): + columns = [columns] + return AsyncQuery( + self._inner.nearest_to_text({"query": query, "columns": columns}) + ) + class AsyncVectorQuery(AsyncQueryBase): def __init__(self, inner: LanceVectorQuery): diff --git a/python/python/lancedb/remote/db.py b/python/python/lancedb/remote/db.py index 6f51f79e..0dd6bb6d 100644 --- a/python/python/lancedb/remote/db.py +++ b/python/python/lancedb/remote/db.py @@ -49,6 +49,7 @@ class RemoteDBConnection(DBConnection): parsed = urlparse(db_url) if parsed.scheme != "db": raise ValueError(f"Invalid scheme: {parsed.scheme}, only accepts db://") + self._uri = str(db_url) self.db_name = parsed.netloc self.api_key = api_key self._client = RestfulLanceDBClient( diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py index 3d1669ab..596e7b81 100644 --- a/python/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -35,10 +35,10 @@ from .db import RemoteDBConnection class RemoteTable(Table): def __init__(self, conn: RemoteDBConnection, name: str): self._conn = conn - self._name = name + self.name = name def __repr__(self) -> str: - return f"RemoteTable({self._conn.db_name}.{self._name})" + return f"RemoteTable({self._conn.db_name}.{self.name})" def __len__(self) -> int: self.count_rows(None) @@ -49,14 +49,14 @@ class RemoteTable(Table): of this Table """ - resp = self._conn._client.post(f"/v1/table/{self._name}/describe/") + resp = self._conn._client.post(f"/v1/table/{self.name}/describe/") schema = json_to_schema(resp["schema"]) return schema @property def version(self) -> int: """Get the current version of the table""" - resp = self._conn._client.post(f"/v1/table/{self._name}/describe/") + resp = self._conn._client.post(f"/v1/table/{self.name}/describe/") return resp["version"] @cached_property @@ -84,13 +84,13 @@ class RemoteTable(Table): def list_indices(self): """List all the indices on the table""" - resp = self._conn._client.post(f"/v1/table/{self._name}/index/list/") + resp = self._conn._client.post(f"/v1/table/{self.name}/index/list/") return resp def index_stats(self, index_uuid: str): """List all the stats of a specified index""" resp = self._conn._client.post( - f"/v1/table/{self._name}/index/{index_uuid}/stats/" + f"/v1/table/{self.name}/index/{index_uuid}/stats/" ) return resp @@ -116,11 +116,27 @@ class RemoteTable(Table): "replace": True, } resp = self._conn._client.post( - f"/v1/table/{self._name}/create_scalar_index/", data=data + f"/v1/table/{self.name}/create_scalar_index/", data=data ) return resp + def create_fts_index( + self, + column: str, + *, + replace: bool = False, + ): + data = { + "column": column, + "index_type": "FTS", + "replace": replace, + } + resp = self._conn._client.post( + f"/v1/table/{self.name}/create_index/", data=data + ) + return resp + def create_index( self, metric="L2", @@ -194,7 +210,7 @@ class RemoteTable(Table): "index_cache_size": index_cache_size, } resp = self._conn._client.post( - f"/v1/table/{self._name}/create_index/", data=data + f"/v1/table/{self.name}/create_index/", data=data ) return resp @@ -241,7 +257,7 @@ class RemoteTable(Table): request_id = uuid.uuid4().hex self._conn._client.post( - f"/v1/table/{self._name}/insert/", + f"/v1/table/{self.name}/insert/", data=payload, params={"request_id": request_id, "mode": mode}, content_type=ARROW_STREAM_CONTENT_TYPE, @@ -251,6 +267,7 @@ class RemoteTable(Table): self, query: Union[VEC, str], vector_column_name: Optional[str] = None, + query_type="auto", ) -> 
LanceVectorQueryBuilder: """Create a search query to find the nearest neighbors of the given query vector. We currently support [vector search][search] @@ -310,10 +327,18 @@ class RemoteTable(Table): - and also the "_distance" column which is the distance between the query vector and the returned vector. """ - if vector_column_name is None: - vector_column_name = inf_vector_column_query(self.schema) - query = LanceQueryBuilder._query_to_vector(self, query, vector_column_name) - return LanceVectorQueryBuilder(self, query, vector_column_name) + if vector_column_name is None and query is not None and query_type != "fts": + try: + vector_column_name = inf_vector_column_query(self.schema) + except Exception as e: + raise e + + return LanceQueryBuilder.create( + self, + query, + query_type, + vector_column_name=vector_column_name, + ) def _execute_query( self, query: Query, batch_size: Optional[int] = None @@ -342,12 +367,12 @@ class RemoteTable(Table): v = list(v) q = query.copy() q.vector = v - results.append(submit(self._name, q)) + results.append(submit(self.name, q)) return pa.concat_tables( [add_index(r.result().to_arrow(), i) for i, r in enumerate(results)] ).to_reader() else: - result = self._conn._client.query(self._name, query) + result = self._conn._client.query(self.name, query) return result.to_arrow().to_reader() def merge_insert(self, on: Union[str, Iterable[str]]) -> LanceMergeInsertBuilder: @@ -397,7 +422,7 @@ class RemoteTable(Table): ) self._conn._client.post( - f"/v1/table/{self._name}/merge_insert/", + f"/v1/table/{self.name}/merge_insert/", data=payload, params=params, content_type=ARROW_STREAM_CONTENT_TYPE, @@ -451,7 +476,7 @@ class RemoteTable(Table): 0 2 [3.0, 4.0] 85.0 # doctest: +SKIP """ payload = {"predicate": predicate} - self._conn._client.post(f"/v1/table/{self._name}/delete/", data=payload) + self._conn._client.post(f"/v1/table/{self.name}/delete/", data=payload) def update( self, @@ -512,7 +537,7 @@ class RemoteTable(Table): updates = [[k, v] for k, v in values_sql.items()] payload = {"predicate": where, "updates": updates} - self._conn._client.post(f"/v1/table/{self._name}/update/", data=payload) + self._conn._client.post(f"/v1/table/{self.name}/update/", data=payload) def cleanup_old_versions(self, *_): """cleanup_old_versions() is not supported on the LanceDB cloud""" @@ -529,7 +554,7 @@ class RemoteTable(Table): def count_rows(self, filter: Optional[str] = None) -> int: payload = {"predicate": filter} resp = self._conn._client.post( - f"/v1/table/{self._name}/count_rows/", data=payload + f"/v1/table/{self.name}/count_rows/", data=payload ) return resp diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 18f6f90b..6f89e0f7 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -51,7 +51,7 @@ if TYPE_CHECKING: from lance.dataset import CleanupStats, ReaderLike from ._lancedb import Table as LanceDBTable, OptimizeStats from .db import LanceDBConnection - from .index import BTree, IndexConfig, IvfPq, Bitmap, LabelList + from .index import BTree, IndexConfig, IvfPq, Bitmap, LabelList, FTS pd = safe_import_pandas() @@ -840,6 +840,18 @@ class Table(ABC): The names of the columns to drop. 
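
        Examples
        --------
        A minimal sketch with a hypothetical column name::

            table.drop_columns(["embedding"])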
""" + @cached_property + def _dataset_uri(self) -> str: + return _table_uri(self._conn.uri, self.name) + + def _get_fts_index_path(self) -> Tuple[str, pa_fs.FileSystem, bool]: + if get_uri_scheme(self._dataset_uri) != "file": + return ("", None, False) + path = join_uri(self._dataset_uri, "_indices", "fts") + fs, path = fs_from_uri(path) + index_exists = fs.get_file_info(path).type != pa_fs.FileType.NotFound + return (path, fs, index_exists) + class _LanceDatasetRef(ABC): @property @@ -979,10 +991,6 @@ class LanceTable(Table): # Cacheable since it's deterministic return _table_path(self._conn.uri, self.name) - @cached_property - def _dataset_uri(self) -> str: - return _table_uri(self._conn.uri, self.name) - @property def _dataset(self) -> LanceDataset: return self._ref.dataset @@ -1247,9 +1255,8 @@ class LanceTable(Table): raise ValueError("field_names must be a string when use_tantivy=False") # delete the existing legacy index if it exists if replace: - fs, path = fs_from_uri(self._get_fts_index_path()) - index_exists = fs.get_file_info(path).type != pa_fs.FileType.NotFound - if index_exists: + path, fs, exist = self._get_fts_index_path() + if exist: fs.delete_dir(path) self._dataset_mut.create_scalar_index( field_names, index_type="INVERTED", replace=replace @@ -1264,9 +1271,8 @@ class LanceTable(Table): if isinstance(ordering_field_names, str): ordering_field_names = [ordering_field_names] - fs, path = fs_from_uri(self._get_fts_index_path()) - index_exists = fs.get_file_info(path).type != pa_fs.FileType.NotFound - if index_exists: + path, fs, exist = self._get_fts_index_path() + if exist: if not replace: raise ValueError("Index already exists. Use replace=True to overwrite.") fs.delete_dir(path) @@ -1277,7 +1283,7 @@ class LanceTable(Table): ) index = create_index( - self._get_fts_index_path(), + path, field_names, ordering_fields=ordering_field_names, tokenizer_name=tokenizer_name, @@ -1290,13 +1296,6 @@ class LanceTable(Table): writer_heap_size=writer_heap_size, ) - def _get_fts_index_path(self): - if get_uri_scheme(self._dataset_uri) != "file": - raise NotImplementedError( - "Full-text search is not supported on object stores." - ) - return join_uri(self._dataset_uri, "_indices", "tantivy") - def add( self, data: DATA, @@ -1492,14 +1491,11 @@ class LanceTable(Table): and also the "_distance" column which is the distance between the query vector and the returned vector. 
""" - if vector_column_name is None and query is not None: + if vector_column_name is None and query is not None and query_type != "fts": try: vector_column_name = inf_vector_column_query(self.schema) except Exception as e: - if query_type == "fts": - vector_column_name = "" - else: - raise e + raise e return LanceQueryBuilder.create( self, @@ -1690,18 +1686,22 @@ class LanceTable(Table): self, query: Query, batch_size: Optional[int] = None ) -> pa.RecordBatchReader: ds = self.to_lance() - return ds.scanner( - columns=query.columns, - filter=query.filter, - prefilter=query.prefilter, - nearest={ + nearest = None + if len(query.vector) > 0: + nearest = { "column": query.vector_column, "q": query.vector, "k": query.k, "metric": query.metric, "nprobes": query.nprobes, "refine_factor": query.refine_factor, - }, + } + return ds.scanner( + columns=query.columns, + limit=query.k, + filter=query.filter, + prefilter=query.prefilter, + nearest=nearest, full_text_query=query.full_text_query, with_row_id=query.with_row_id, batch_size=batch_size, @@ -2126,7 +2126,7 @@ class AsyncTable: column: str, *, replace: Optional[bool] = None, - config: Optional[Union[IvfPq, BTree, Bitmap, LabelList]] = None, + config: Optional[Union[IvfPq, BTree, Bitmap, LabelList, FTS]] = None, ): """Create an index to speed up queries diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py index f4c7cd1c..9cfda85a 100644 --- a/python/python/tests/test_fts.py +++ b/python/python/tests/test_fts.py @@ -15,6 +15,7 @@ import random from unittest import mock import lancedb as ldb +from lancedb.index import FTS import numpy as np import pandas as pd import pytest @@ -60,6 +61,43 @@ def table(tmp_path) -> ldb.table.LanceTable: return table +@pytest.fixture +async def async_table(tmp_path) -> ldb.table.AsyncTable: + db = await ldb.connect_async(tmp_path) + vectors = [np.random.randn(128) for _ in range(100)] + + nouns = ("puppy", "car", "rabbit", "girl", "monkey") + verbs = ("runs", "hits", "jumps", "drives", "barfs") + adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.") + adj = ("adorable", "clueless", "dirty", "odd", "stupid") + text = [ + " ".join( + [ + nouns[random.randrange(0, 5)], + verbs[random.randrange(0, 5)], + adv[random.randrange(0, 5)], + adj[random.randrange(0, 5)], + ] + ) + for _ in range(100) + ] + count = [random.randint(1, 10000) for _ in range(100)] + table = await db.create_table( + "test", + data=pd.DataFrame( + { + "vector": vectors, + "id": [i % 2 for i in range(100)], + "text": text, + "text2": text, + "nested": [{"text": t} for t in text], + "count": count, + } + ), + ) + return table + + def test_create_index(tmp_path): index = ldb.fts.create_index(str(tmp_path / "index"), ["text"]) assert isinstance(index, tantivy.Index) @@ -91,17 +129,23 @@ def test_search_index(tmp_path, table): index = ldb.fts.create_index(str(tmp_path / "index"), ["text"]) ldb.fts.populate_index(index, table, ["text"]) index.reload() - results = ldb.fts.search_index(index, query="puppy", limit=10) + results = ldb.fts.search_index(index, query="puppy", limit=5) assert len(results) == 2 - assert len(results[0]) == 10 # row_ids - assert len(results[1]) == 10 # _distance + assert len(results[0]) == 5 # row_ids + assert len(results[1]) == 5 # _score @pytest.mark.parametrize("use_tantivy", [True, False]) def test_search_fts(table, use_tantivy): table.create_fts_index("text", use_tantivy=use_tantivy) - results = table.search("puppy").limit(10).to_list() - assert len(results) == 10 + results = 
table.search("puppy").limit(5).to_list() + assert len(results) == 5 + + +async def test_search_fts_async(async_table): + await async_table.create_index("text", config=FTS()) + results = await async_table.query().nearest_to_text("puppy").limit(5).to_list() + assert len(results) == 5 def test_search_ordering_field_index_table(tmp_path, table): @@ -125,11 +169,11 @@ def test_search_ordering_field_index(tmp_path, table): ldb.fts.populate_index(index, table, ["text"], ordering_fields=["count"]) index.reload() results = ldb.fts.search_index( - index, query="puppy", limit=10, ordering_field="count" + index, query="puppy", limit=5, ordering_field="count" ) assert len(results) == 2 - assert len(results[0]) == 10 # row_ids - assert len(results[1]) == 10 # _distance + assert len(results[0]) == 5 # row_ids + assert len(results[1]) == 5 # _distance rows = table.to_lance().take(results[0]).to_pylist() for r in rows: @@ -140,8 +184,8 @@ def test_search_ordering_field_index(tmp_path, table): @pytest.mark.parametrize("use_tantivy", [True, False]) def test_create_index_from_table(tmp_path, table, use_tantivy): table.create_fts_index("text", use_tantivy=use_tantivy) - df = table.search("puppy").limit(10).select(["text"]).to_pandas() - assert len(df) <= 10 + df = table.search("puppy").limit(5).select(["text"]).to_pandas() + assert len(df) <= 5 assert "text" in df.columns # Check whether it can be updated @@ -167,8 +211,8 @@ def test_create_index_from_table(tmp_path, table, use_tantivy): def test_create_index_multiple_columns(tmp_path, table): table.create_fts_index(["text", "text2"], use_tantivy=True) - df = table.search("puppy").limit(10).to_pandas() - assert len(df) == 10 + df = table.search("puppy").limit(5).to_pandas() + assert len(df) == 5 assert "text" in df.columns assert "text2" in df.columns @@ -176,14 +220,14 @@ def test_create_index_multiple_columns(tmp_path, table): def test_empty_rs(tmp_path, table, mocker): table.create_fts_index(["text", "text2"], use_tantivy=True) mocker.patch("lancedb.fts.search_index", return_value=([], [])) - df = table.search("puppy").limit(10).to_pandas() + df = table.search("puppy").limit(5).to_pandas() assert len(df) == 0 def test_nested_schema(tmp_path, table): table.create_fts_index("nested.text", use_tantivy=True) - rs = table.search("puppy").limit(10).to_list() - assert len(rs) == 10 + rs = table.search("puppy").limit(5).to_list() + assert len(rs) == 5 @pytest.mark.parametrize("use_tantivy", [True, False]) diff --git a/python/python/tests/test_s3.py b/python/python/tests/test_s3.py index 2b6ed38a..85b72749 100644 --- a/python/python/tests/test_s3.py +++ b/python/python/tests/test_s3.py @@ -251,7 +251,8 @@ def test_s3_dynamodb_sync(s3_bucket: str, commit_table: str, monkeypatch): # FTS indices should error since they are not supported yet. with pytest.raises( - NotImplementedError, match="Full-text search is not supported on object stores." 
+ NotImplementedError, + match="Full-text search is only supported on the local filesystem", ): table.create_fts_index("x") diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py index 0d6beeb4..6ca2f5f1 100644 --- a/python/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -28,7 +28,7 @@ from pydantic import BaseModel class MockDB: def __init__(self, uri: Path): - self.uri = uri + self.uri = str(uri) self.read_consistency_interval = None @functools.cached_property diff --git a/python/src/query.rs b/python/src/query.rs index 471f686b..f88e60b4 100644 --- a/python/src/query.rs +++ b/python/src/query.rs @@ -15,17 +15,20 @@ use arrow::array::make_array; use arrow::array::ArrayData; use arrow::pyarrow::FromPyArrow; +use lancedb::index::scalar::FullTextSearchQuery; use lancedb::query::QueryExecutionOptions; use lancedb::query::{ ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery, }; use pyo3::exceptions::PyRuntimeError; -use pyo3::pyclass; +use pyo3::prelude::{PyAnyMethods, PyDictMethods}; use pyo3::pymethods; +use pyo3::types::PyDict; use pyo3::Bound; use pyo3::PyAny; use pyo3::PyRef; use pyo3::PyResult; +use pyo3::{pyclass, PyErr}; use pyo3_asyncio_0_21::tokio::future_into_py; use crate::arrow::RecordBatchStream; @@ -68,6 +71,24 @@ impl Query { Ok(VectorQuery { inner }) } + pub fn nearest_to_text(&mut self, query: Bound<'_, PyDict>) -> PyResult<()> { + let query_text = query + .get_item("query")? + .ok_or(PyErr::new::( + "Query text is required for nearest_to_text", + ))? + .extract::()?; + let columns = query + .get_item("columns")? + .map(|columns| columns.extract::>()) + .transpose()?; + + let fts_query = FullTextSearchQuery::new(query_text).columns(columns); + self.inner = self.inner.clone().full_text_search(fts_query); + + Ok(()) + } + pub fn execute( self_: PyRef<'_, Self>, max_batch_length: Option, From 09ce6c5bb5649dbfb9899649e341c92bcf075e8e Mon Sep 17 00:00:00 2001 From: Rithik Kumar <46047011+rithikJha@users.noreply.github.com> Date: Fri, 16 Aug 2024 21:30:45 +0530 Subject: [PATCH 08/34] docs: add vector search example (#1543) --- docs/mkdocs.yml | 2 + docs/src/assets/open_hf_space.svg | 22 ++++++ .../examples/python_examples/multimodal.md | 2 +- .../examples/python_examples/vector_search.md | 78 +++++++++++++++++++ 4 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 docs/src/assets/open_hf_space.svg create mode 100644 docs/src/examples/python_examples/vector_search.md diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index feb2a318..9059ef90 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -144,6 +144,7 @@ nav: - Build From Scratch: examples/python_examples/build_from_scratch.md - Multimodal: examples/python_examples/multimodal.md - Rag: examples/python_examples/rag.md + - Vector Search: examples/python_examples/vector_search.md - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb @@ -229,6 +230,7 @@ nav: - Build From Scratch: examples/python_examples/build_from_scratch.md - Multimodal: examples/python_examples/multimodal.md - Rag: examples/python_examples/rag.md + - Vector Search: examples/python_examples/vector_search.md - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb diff --git a/docs/src/assets/open_hf_space.svg b/docs/src/assets/open_hf_space.svg new file mode 
100644 index 00000000..b5d34c54 --- /dev/null +++ b/docs/src/assets/open_hf_space.svg @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/src/examples/python_examples/multimodal.md b/docs/src/examples/python_examples/multimodal.md index 6f1de8ed..2409fb2f 100644 --- a/docs/src/examples/python_examples/multimodal.md +++ b/docs/src/examples/python_examples/multimodal.md @@ -13,7 +13,7 @@ Unlock the power of multimodal search with LanceDB, enabling efficient vector-ba | **Multimodal CLIP: DiffusionDB 🌐πŸ’₯** | Revolutionize search with Multimodal CLIP and DiffusionDB, combining text and image understanding for a new dimension of discovery! πŸ”“ | [![GitHub](../../assets/github.svg)][Clip_diffusionDB_github]
[![Open In Collab](../../assets/colab.svg)][Clip_diffusionDB_colab] <br>
[![Python](../../assets/python.svg)][Clip_diffusionDB_python] <br>
[![Ghost](../../assets/ghost.svg)][Clip_diffusionDB_ghost] |
| **Multimodal CLIP: YouTube Videos πŸ“ΉπŸ‘€** | Search YouTube videos using Multimodal CLIP, finding relevant content with ease and accuracy! 🎯 | [![Github](../../assets/github.svg)][Clip_youtube_github] <br>
[![Open In Collab](../../assets/colab.svg)][Clip_youtube_colab] <br>
[![Python](../../assets/python.svg)][Clip_youtube_python] <br>
[![Ghost](../../assets/ghost.svg)][Clip_youtube_python] |
| **Multimodal Image + Text Search πŸ“ΈπŸ”** | Discover relevant documents and images with a single query, using LanceDB's multimodal search capabilities to bridge the gap between text and visuals! πŸŒ‰ | [![GitHub](../../assets/github.svg)](https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_search) <br>
[![Open In Collab](../../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/multimodal_search/main.ipynb) <br>
[![Python](../../assets/python.svg)](https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_search/main.py) <br>
[![Ghost](../../assets/ghost.svg)](https://blog.lancedb.com/multi-modal-ai-made-easy-with-lancedb-clip-5aaf8801c939/) |
-| **Cambrian-1: Vision-Centric Image Exploration πŸ”πŸ‘€** | Dive into vision-centric exploration of images with Cambrian-1, powered by LanceDB's multimodal search to uncover new insights! πŸ”Ž | [![GitHub](../../assets/github.svg)](https://www.kaggle.com/code/prasantdixit/cambrian-1-vision-centric-exploration-of-images/) <br>
[![Open In Collab](../../assets/colab.svg)]() <br>
[![Python](../../assets/python.svg)]() <br>
[![Ghost](../../assets/ghost.svg)](https://blog.lancedb.com/cambrian-1-vision-centric-exploration/) |
+| **Cambrian-1: Vision-Centric Image Exploration πŸ”πŸ‘€** | Dive into vision-centric exploration of images with Cambrian-1, powered by LanceDB's multimodal search to uncover new insights! πŸ”Ž | [![Kaggle](https://img.shields.io/badge/Kaggle-035a7d?style=for-the-badge&logo=kaggle&logoColor=white)](https://www.kaggle.com/code/prasantdixit/cambrian-1-vision-centric-exploration-of-images/) <br>
[![Ghost](../../assets/ghost.svg)](https://blog.lancedb.com/cambrian-1-vision-centric-exploration/) |

[Clip_diffusionDB_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_clip_diffusiondb
diff --git a/docs/src/examples/python_examples/vector_search.md b/docs/src/examples/python_examples/vector_search.md
new file mode 100644
index 00000000..67000d39
--- /dev/null
+++ b/docs/src/examples/python_examples/vector_search.md
@@ -0,0 +1,78 @@
+**πŸ” Vector Search: Unlock Efficient Document Retrieval πŸ”“**
+====================================================================
+
+**Introduction**
+
+Unlock the power of vector search with LanceDB, a cutting-edge solution for efficient vector-based document retrieval πŸ“Š. Input text queries to find the most relevant documents from your corpus, and discover a new world of possibilities with our inbuilt hybrid search features 🌐.
+
+| **Vector Search** | **Description** | **Links** |
+|:-----------------|:---------------|:---------|
+| **Inbuilt Hybrid Search πŸ”„** | Combine the power of traditional search algorithms with LanceDB's vector-based search for a robust and efficient search experience πŸ“Š | [![Github](../../assets/github.svg)][inbuilt_hybrid_search_github] <br>
[![Open In Collab](../../assets/colab.svg)][inbuilt_hybrid_search_colab] |
+| **Hybrid Search with BM25 and LanceDB πŸ’‘** | Synergizes BM25's keyword-focused precision (term frequency, document length normalization, bias-free retrieval) with LanceDB's semantic understanding (contextual analysis, query intent alignment) for nuanced search results in complex datasets πŸ“ˆ | [![Github](../../assets/github.svg)][BM25_github] <br>
[![Open In Collab](../../assets/colab.svg)][BM25_colab] <br>
[![Ghost](../../assets/ghost.svg)][BM25_ghost] |
+| **NER-powered Semantic Search πŸ”Ž** | Unlock contextual understanding with Named Entity Recognition (NER) methods: Dictionary-Based, Rule-Based, and Deep Learning-Based, to accurately identify and extract entities, enabling precise semantic search results πŸ—‚οΈ | [![Github](../../assets/github.svg)][NER_github] <br>
[![Open In Collab](../../assets/colab.svg)][NER_colab] <br>
[![Ghost](../../assets/ghost.svg)][NER_ghost] |
+| **Audio Similarity Search using Vector Embeddings 🎡** | Create vector embeddings of audio files to find similar audio content, enabling efficient audio similarity search and retrieval in LanceDB's vector store πŸ“» | [![Github](../../assets/github.svg)][audio_search_github] <br>
[![Open In Collab](../../assets/colab.svg)][audio_search_colab] <br>
[![Python](../../assets/python.svg)][audio_search_python] |
+| **LanceDB Embeddings API: Multi-lingual Semantic Search 🌎** | Build a universal semantic search table with LanceDB's Embeddings API, supporting multiple languages (e.g., English, French) using Cohere's multi-lingual model, for accurate cross-lingual search results πŸ“„ | [![Github](../../assets/github.svg)][mls_github] <br>
[![Open In Collab](../../assets/colab.svg)][mls_colab] <br>
[![Python](../../assets/python.svg)][mls_python] |
+| **Facial Recognition: Face Embeddings πŸ€–** | Detect, crop, and embed faces using Facenet, then store and query face embeddings in LanceDB for efficient facial recognition and top-K matching results πŸ‘₯ | [![Github](../../assets/github.svg)][fr_github] <br>
[![Open In Collab](../../assets/colab.svg)][fr_colab] |
+| **Sentiment Analysis: Hotel Reviews 🏨** | Analyze customer sentiments towards the hotel industry using BERT models, storing sentiment labels, scores, and embeddings in LanceDB, enabling queries on customer opinions and potential areas for improvement πŸ’¬ | [![Github](../../assets/github.svg)][sentiment_analysis_github] <br>
[![Open In Collab](../../assets/colab.svg)][sentiment_analysis_colab] <br>
[![Ghost](../../assets/ghost.svg)][sentiment_analysis_ghost] |
+| **Vector Arithmetic with LanceDB βš–οΈ** | Unlock powerful semantic search capabilities by performing vector arithmetic on embeddings, enabling complex relationships and nuances in data to be captured, and simplifying the process of retrieving semantically similar results πŸ“Š | [![Github](../../assets/github.svg)][arithmetic_github] <br>
[![Open In Collab](../../assets/colab.svg)][arithmetic_colab] <br>
[![Ghost](../../assets/ghost.svg)][arithmetic_ghost] |
+| **Imagebind Demo πŸ–ΌοΈ** | Explore the multi-modal capabilities of Imagebind through a Gradio app, leveraging the LanceDB API for seamless image search and retrieval experiences πŸ“Έ | [![Github](../../assets/github.svg)][imagebind_github] <br>
[![Open in Spaces](../../assets/open_hf_space.svg)][imagebind_huggingface] |
+| **Search Engine using SAM & CLIP πŸ”** | Build a search engine within an image using SAM and CLIP models, enabling object-level search and retrieval, with LanceDB indexing and search capabilities to find the closest match between image embeddings and user queries πŸ“Έ | [![Github](../../assets/github.svg)][swi_github] <br>
[![Open In Collab](../../assets/colab.svg)][swi_colab] <br>
[![Ghost](../../assets/ghost.svg)][swi_ghost] |
+| **Zero-Shot Object Localization and Detection with CLIP πŸ”Ž** | Perform object detection on images using OpenAI's CLIP, enabling zero-shot localization and detection of objects, with capabilities to split images into patches, parse with CLIP, and plot bounding boxes πŸ“Š | [![Github](../../assets/github.svg)][zsod_github] <br>
[![Open In Collab](../../assets/colab.svg)][zsod_colab] |
+| **Accelerate Vector Search with OpenVINO πŸš€** | Boost vector search applications using OpenVINO, achieving significant speedups with CLIP for text-to-image and image-to-image searching, through PyTorch model optimization, FP16 and INT8 format conversion, and quantization with OpenVINO NNCF πŸ“ˆ | [![Github](../../assets/github.svg)][openvino_github] <br>
[![Open In Collab](../../assets/colab.svg)][openvino_colab] <br>
[![Ghost](../../assets/ghost.svg)][openvino_ghost] |
+| **Zero-Shot Image Classification with CLIP and LanceDB πŸ“Έ** | Achieve zero-shot image classification using CLIP and LanceDB, enabling models to classify images without prior training on specific use cases, unlocking flexible and adaptable image classification capabilities πŸ”“ | [![Github](../../assets/github.svg)][zsic_github] <br>
[![Open In Collab](../../assets/colab.svg)][zsic_colab] <br>
[![Ghost](../../assets/ghost.svg)][zsic_ghost] | + + + + +[inbuilt_hybrid_search_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Inbuilt-Hybrid-Search +[inbuilt_hybrid_search_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Inbuilt-Hybrid-Search/Inbuilt_Hybrid_Search_with_LanceDB.ipynb + +[BM25_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Hybrid_search_bm25_lancedb +[BM25_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Hybrid_search_bm25_lancedb/main.ipynb +[BM25_ghost]: https://blog.lancedb.com/hybrid-search-combining-bm25-and-semantic-search-for-better-results-with-lan-1358038fe7e6 + +[NER_github]: https://github.com/lancedb/vectordb-recipes/blob/main/tutorials/NER-powered-Semantic-Search +[NER_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/NER-powered-Semantic-Search/NER_powered_Semantic_Search_with_LanceDB.ipynb +[NER_ghost]: https://blog.lancedb.com/ner-powered-semantic-search-using-lancedb-51051dc3e493 + +[audio_search_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/audio_search +[audio_search_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/audio_search/main.ipynb +[audio_search_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/audio_search/main.py + +[mls_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/multi-lingual-wiki-qa +[mls_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/multi-lingual-wiki-qa/main.ipynb +[mls_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/multi-lingual-wiki-qa/main.py + +[fr_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/facial_recognition +[fr_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/facial_recognition/main.ipynb + +[sentiment_analysis_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Sentiment-Analysis-Analyse-Hotel-Reviews +[sentiment_analysis_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Sentiment-Analysis-Analyse-Hotel-Reviews/Sentiment_Analysis_using_LanceDB.ipynb +[sentiment_analysis_ghost]: https://blog.lancedb.com/sentiment-analysis-using-lancedb-2da3cb1e3fa6 + +[arithmetic_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Vector-Arithmetic-with-LanceDB +[arithmetic_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Vector-Arithmetic-with-LanceDB/main.ipynb +[arithmetic_ghost]: https://blog.lancedb.com/vector-arithmetic-with-lancedb-an-intro-to-vector-embeddings/ + +[imagebind_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/imagebind_demo +[imagebind_huggingface]: https://huggingface.co/spaces/raghavd99/imagebind2 + +[swi_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/search-within-images-with-sam-and-clip +[swi_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/search-within-images-with-sam-and-clip/main.ipynb +[swi_ghost]: https://blog.lancedb.com/search-within-an-image-331b54e4285e + +[zsod_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/zero-shot-object-detection-CLIP +[zsod_colab]: 
https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/zero-shot-object-detection-CLIP/zero_shot_object_detection_clip.ipynb + +[openvino_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Accelerate-Vector-Search-Applications-Using-OpenVINO +[openvino_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Accelerate-Vector-Search-Applications-Using-OpenVINO/clip_text_image_search.ipynb +[openvino_ghost]: https://blog.lancedb.com/accelerate-vector-search-applications-using-openvino-lancedb/ + +[zsic_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/zero-shot-image-classification +[zsic_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/zero-shot-image-classification/main.ipynb +[zsic_ghost]: https://blog.lancedb.com/zero-shot-image-classification-with-vector-search/ + + + + + From 5857cb4c6e169f2147e30495f7c43df7ba453188 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Fri, 16 Aug 2024 18:48:29 -0700 Subject: [PATCH 09/34] docs: add a section to describe scalar index (#1495) --- docs/mkdocs.yml | 20 +++--- docs/src/guides/scalar_index.md | 108 ++++++++++++++++++++++++++++++++ docs/test/md_testing.py | 2 + python/python/lancedb/table.py | 6 +- 4 files changed, 125 insertions(+), 11 deletions(-) create mode 100644 docs/src/guides/scalar_index.md mode change 100644 => 100755 docs/test/md_testing.py diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 9059ef90..bae287f4 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -58,7 +58,7 @@ plugins: - https://pandas.pydata.org/docs/objects.inv - mkdocs-jupyter - render_swagger: - allow_arbitrary_locations : true + allow_arbitrary_locations: true markdown_extensions: - admonition @@ -89,9 +89,10 @@ nav: - Data management: concepts/data_management.md - πŸ”¨ Guides: - Working with tables: guides/tables.md - - Building an ANN index: ann_indexes.md + - Building a vector index: ann_indexes.md - Vector Search: search.md - Full-text search: fts.md + - Building a scalar index: guides/scalar_index.md - Hybrid search: - Overview: hybrid_search/hybrid_search.md - Comparing Rerankers: hybrid_search/eval.md @@ -128,12 +129,12 @@ nav: - Polars: python/polars_arrow.md - DuckDB: python/duckdb.md - LangChain: - - LangChain πŸ”—: integrations/langchain.md - - LangChain demo: notebooks/langchain_demo.ipynb - - LangChain JS/TS πŸ”—: https://js.langchain.com/docs/integrations/vectorstores/lancedb + - LangChain πŸ”—: integrations/langchain.md + - LangChain demo: notebooks/langchain_demo.ipynb + - LangChain JS/TS πŸ”—: https://js.langchain.com/docs/integrations/vectorstores/lancedb - LlamaIndex πŸ¦™: - - LlamaIndex docs: integrations/llamaIndex.md - - LlamaIndex demo: notebooks/llamaIndex_demo.ipynb + - LlamaIndex docs: integrations/llamaIndex.md + - LlamaIndex demo: notebooks/llamaIndex_demo.ipynb - Pydantic: python/pydantic.md - Voxel51: integrations/voxel51.md - PromptTools: integrations/prompttools.md @@ -145,7 +146,7 @@ nav: - Multimodal: examples/python_examples/multimodal.md - Rag: examples/python_examples/rag.md - Vector Search: examples/python_examples/vector_search.md - - Miscellaneous: + - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb - Multimodal search using CLIP: notebooks/multimodal_search.ipynb @@ -182,6 +183,7 @@ nav: - Building an ANN index: ann_indexes.md - Vector Search: search.md - Full-text 
search: fts.md + - Building a scalar index: guides/scalar_index.md - Hybrid search: - Overview: hybrid_search/hybrid_search.md - Comparing Rerankers: hybrid_search/eval.md @@ -231,7 +233,7 @@ nav: - Multimodal: examples/python_examples/multimodal.md - Rag: examples/python_examples/rag.md - Vector Search: examples/python_examples/vector_search.md - - Miscellaneous: + - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb - Multimodal search using CLIP: notebooks/multimodal_search.ipynb diff --git a/docs/src/guides/scalar_index.md b/docs/src/guides/scalar_index.md new file mode 100644 index 00000000..7f3322af --- /dev/null +++ b/docs/src/guides/scalar_index.md @@ -0,0 +1,108 @@ +# Building Scalar Index + +Similar to many SQL databases, LanceDB supports several types of Scalar indices to accelerate search +over scalar columns. + +- `BTREE`: The most common type is BTREE. This index is inspired by the btree data structure + although only the first few layers of the btree are cached in memory. + It will perform well on columns with a large number of unique values and few rows per value. +- `BITMAP`: this index stores a bitmap for each unique value in the column. + This index is useful for columns with a finite number of unique values and many rows per value. + For example, columns that represent "categories", "labels", or "tags" +- `LABEL_LIST`: a special index that is used to index list columns whose values have a finite set of possibilities. + For example, a column that contains lists of tags (e.g. `["tag1", "tag2", "tag3"]`) can be indexed with a `LABEL_LIST` index. + +| Data Type | Filter | Index Type | +| --------------------------------------------------------------- | ----------------------------------------- | ------------ | +| Numeric, String, Temporal | `<`, `=`, `>`, `in`, `between`, `is null` | `BTREE` | +| Boolean, numbers or strings with fewer than 1,000 unique values | `<`, `=`, `>`, `in`, `between`, `is null` | `BITMAP` | +| List of low cardinality of numbers or strings | `array_has_any`, `array_has_all` | `LABEL_LIST` | + +=== "Python" + + ```python + import lancedb + books = [ + {"book_id": 1, "publisher": "plenty of books", "tags": ["fantasy", "adventure"]}, + {"book_id": 2, "publisher": "book town", "tags": ["non-fiction"]}, + {"book_id": 3, "publisher": "oreilly", "tags": ["textbook"]} + ] + + db = lancedb.connect("./db") + table = db.create_table("books", books) + table.create_scalar_index("book_id") # BTree by default + table.create_scalar_index("publisher", index_type="BITMAP") + ``` + +=== "Typescript" + + === "@lancedb/lancedb" + + ```js + const db = await lancedb.connect("data"); + const tbl = await db.openTable("my_vectors"); + + await tbl.create_index("book_id"); + await tlb.create_index("publisher", { config: lancedb.Index.bitmap() }) + ``` + +For example, the following scan will be faster if the column `my_col` has a scalar index: + +=== "Python" + + ```python + import lancedb + + table = db.open_table("books") + my_df = table.search().where("book_id = 2").to_pandas() + ``` + +=== "Typescript" + + === "@lancedb/lancedb" + + ```js + const db = await lancedb.connect("data"); + const tbl = await db.openTable("books"); + + await tbl + .query() + .where("book_id = 2") + .limit(10) + .toArray(); + ``` + +Scalar indices can also speed up scans containing a vector search or full text search, and a prefilter: + +=== "Python" + + ```python + import lancedb + + data = [ + 
{"book_id": 1, "vector": [1, 2]}, + {"book_id": 2, "vector": [3, 4]}, + {"book_id": 3, "vector": [5, 6]} + ] + table = db.create_table("book_with_embeddings", data) + + ( + table.search([1, 2]) + .where("book_id != 3", prefilter=True) + .to_pandas() + ) + ``` + +=== "Typescript" + + === "@lancedb/lancedb" + + ```js + const db = await lancedb.connect("data/lance"); + const tbl = await db.openTable("book_with_embeddings"); + + await tbl.search(Array(1536).fill(1.2)) + .where("book_id != 3") // prefilter is default behavior. + .limit(10) + .toArray(); + ``` diff --git a/docs/test/md_testing.py b/docs/test/md_testing.py old mode 100644 new mode 100755 index 8bdce0fd..7f2f3a99 --- a/docs/test/md_testing.py +++ b/docs/test/md_testing.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import glob from typing import Iterator, List from pathlib import Path diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 6f89e0f7..26ab53a1 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -339,9 +339,9 @@ class Table(ABC): def create_scalar_index( self, column: str, - index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE", *, replace: bool = True, + index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE", ): """Create a scalar index on a column. @@ -391,6 +391,8 @@ class Table(ABC): or string column. replace : bool, default True Replace the existing index if it exists. + index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"], default "BTREE" + The type of index to create. Examples -------- @@ -1232,9 +1234,9 @@ class LanceTable(Table): def create_scalar_index( self, column: str, - index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE", *, replace: bool = True, + index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE", ): self._dataset_mut.create_scalar_index( column, index_type=index_type, replace=replace From 21014cab452a486e4e9ca3e9c6a38ec120ff4679 Mon Sep 17 00:00:00 2001 From: Rithik Kumar <46047011+rithikJha@users.noreply.github.com> Date: Sat, 17 Aug 2024 12:35:33 +0530 Subject: [PATCH 10/34] docs: add chatbot example and improve quality of other examples (#1544) --- docs/mkdocs.yml | 6 ++- .../python_examples/build_from_scratch.md | 6 +-- docs/src/examples/python_examples/chatbot.md | 41 +++++++++++++++++++ .../examples/python_examples/multimodal.md | 4 +- docs/src/examples/python_examples/rag.md | 4 +- .../examples/python_examples/vector_search.md | 8 ++-- 6 files changed, 57 insertions(+), 12 deletions(-) create mode 100644 docs/src/examples/python_examples/chatbot.md diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index bae287f4..024181fd 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -146,7 +146,8 @@ nav: - Multimodal: examples/python_examples/multimodal.md - Rag: examples/python_examples/rag.md - Vector Search: examples/python_examples/vector_search.md - - Miscellaneous: + - Chatbot: examples/python_examples/chatbot.md + - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb - Multimodal search using CLIP: notebooks/multimodal_search.ipynb @@ -233,7 +234,8 @@ nav: - Multimodal: examples/python_examples/multimodal.md - Rag: examples/python_examples/rag.md - Vector Search: examples/python_examples/vector_search.md - - Miscellaneous: + - Chatbot: examples/python_examples/chatbot.md + - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: 
notebooks/code_qa_bot.ipynb - Multimodal search using CLIP: notebooks/multimodal_search.ipynb diff --git a/docs/src/examples/python_examples/build_from_scratch.md b/docs/src/examples/python_examples/build_from_scratch.md index f7469726..65e21af4 100644 --- a/docs/src/examples/python_examples/build_from_scratch.md +++ b/docs/src/examples/python_examples/build_from_scratch.md @@ -1,8 +1,8 @@ -# Build from Scratch with LanceDB πŸš€ +# **Build from Scratch with LanceDB πŸ› οΈπŸš€** -Start building your GenAI applications from the ground up using LanceDB's efficient vector-based document retrieval capabilities! πŸ“„ +Start building your GenAI applications from the ground up using LanceDB's efficient vector-based document retrieval capabilities! πŸ“‘ -#### Get Started in Minutes ⏱️ +**Get Started in Minutes ⏱️** These examples provide a solid foundation for building your own GenAI applications using LanceDB. Jump from idea to proof of concept quickly with applied examples. Get started and see what you can create! πŸ’» diff --git a/docs/src/examples/python_examples/chatbot.md b/docs/src/examples/python_examples/chatbot.md new file mode 100644 index 00000000..a16848a6 --- /dev/null +++ b/docs/src/examples/python_examples/chatbot.md @@ -0,0 +1,41 @@ +**Chatbot Application with LanceDB πŸ€–** +==================================================================== + + Create an innovative chatbot application that utilizes LanceDB for efficient vector-based response generation! 🌐✨ + +**Introduction πŸ‘‹βœ¨** + + Users can input their queries, allowing the chatbot to retrieve relevant context seamlessly. πŸ”πŸ“š This enables the generation of coherent and context-aware replies that enhance user experience. 🌟🀝 Dive into the world of advanced conversational AI and streamline interactions with powerful data management! πŸš€πŸ’‘ + + +| **Chatbot** | **Description** | **Links** | +|:----------------|:-----------------|:-----------| +| **Databricks DBRX Website Bot ⚑️** | Unlock magical conversations with the Hogwarts chatbot, powered by Open-source RAG, DBRX, LanceDB, LLama-index, and Hugging Face Embeddings, delivering enchanting user experiences and spellbinding interactions ✨ | [![GitHub](../../assets/github.svg)][databricks_github]
[![Python](../../assets/python.svg)][databricks_python] |
| **CLI SDK Manual Chatbot Locally πŸ’»** | CLI chatbot for SDK/hardware documents, powered by Local RAG, Llama 3, Ollama, LanceDB, and OpenHermes Embeddings, built with Phidata Assistant and Knowledge Base for instant technical support πŸ€– | [![GitHub](../../assets/github.svg)][clisdk_github]<br>
[![Python](../../assets/python.svg)][clisdk_python] |
| **YouTube Transcript Search QA Bot πŸ“Ή** | Unlock the power of YouTube transcripts with a Q&A bot, leveraging natural language search and LanceDB for effortless data management and instant answers πŸ’¬ | [![GitHub](../../assets/github.svg)][youtube_github]<br>
[![Open In Collab](../../assets/colab.svg)][youtube_colab]
[![Python](../../assets/python.svg)][youtube_python] |
| **Code Documentation Q&A Bot with LangChain πŸ€–** | Revolutionize code documentation with a Q&A bot, powered by LangChain and LanceDB, allowing effortless querying of documentation using natural language, demonstrated with NumPy 1.26 docs πŸ“š | [![GitHub](../../assets/github.svg)][docs_github]<br>
[![Open In Collab](../../assets/colab.svg)][docs_colab]
[![Python](../../assets/python.svg)][docs_python] | +| **Context-aware Chatbot using Llama 2 & LanceDB πŸ€–** | Experience the future of conversational AI with a context-aware chatbot, powered by Llama 2, LanceDB, and LangChain, enabling intuitive and meaningful conversations with your data πŸ“šπŸ’¬ | [![GitHub](../../assets/github.svg)][aware_github]
[![Open In Collab](../../assets/colab.svg)][aware_colab]
[![Ghost](../../assets/ghost.svg)][aware_ghost] |
| **Chat with CSV using Hybrid Search πŸ“Š** | Revolutionize data interaction with a chat application that harnesses LanceDB's hybrid search capabilities to converse with CSV and Excel files, enabling efficient and scalable data exploration and analysis πŸš€ | [![GitHub](../../assets/github.svg)][csv_github]<br>
[![Open In Collab](../../assets/colab.svg)][csv_colab]
[![Ghost](../../assets/ghost.svg)][csv_ghost] | + + +[databricks_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/databricks_DBRX_website_bot +[databricks_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/databricks_DBRX_website_bot/main.py + +[clisdk_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/CLI-SDK-Manual-Chatbot-Locally +[clisdk_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/CLI-SDK-Manual-Chatbot-Locally/assistant.py + +[youtube_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Youtube-Search-QA-Bot +[youtube_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Youtube-Search-QA-Bot/main.ipynb +[youtube_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Youtube-Search-QA-Bot/main.py + +[docs_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Code-Documentation-QA-Bot +[docs_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Code-Documentation-QA-Bot/main.ipynb +[docs_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Code-Documentation-QA-Bot/main.py + +[aware_github]: https://github.com/lancedb/vectordb-recipes/blob/main/tutorials/chatbot_using_Llama2_&_lanceDB +[aware_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/chatbot_using_Llama2_&_lanceDB/main.ipynb +[aware_ghost]: https://blog.lancedb.com/context-aware-chatbot-using-llama-2-lancedb-as-vector-database-4d771d95c755 + +[csv_github]: https://github.com/lancedb/vectordb-recipes/blob/main/tutorials/Chat_with_csv_file +[csv_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/Chat_with_csv_file/main.ipynb +[csv_ghost]: https://blog.lancedb.com/p/d8c71df4-e55f-479a-819e-cde13354a6a3/ diff --git a/docs/src/examples/python_examples/multimodal.md b/docs/src/examples/python_examples/multimodal.md index 2409fb2f..376d7864 100644 --- a/docs/src/examples/python_examples/multimodal.md +++ b/docs/src/examples/python_examples/multimodal.md @@ -1,8 +1,8 @@ -# Multimodal Search with LanceDB πŸ”πŸ’‘ +# **Multimodal Search with LanceDB πŸ€Ήβ€β™‚οΈπŸ”** Experience the future of search with LanceDB's multimodal capabilities. Combine text and image queries to find the most relevant results in your corpus and unlock new possibilities! πŸ”“πŸ’‘ -#### Explore the Future of Search πŸš€ +**Explore the Future of Search πŸš€** Unlock the power of multimodal search with LanceDB, enabling efficient vector-based retrieval of text and image data! πŸ“ŠπŸ’» diff --git a/docs/src/examples/python_examples/rag.md b/docs/src/examples/python_examples/rag.md index c2c41d0c..02339515 100644 --- a/docs/src/examples/python_examples/rag.md +++ b/docs/src/examples/python_examples/rag.md @@ -1,10 +1,10 @@ -**πŸ”πŸ’‘ RAG: Revolutionize Information Retrieval with LanceDB πŸ”“** +**RAG: Revolutionize Information Retrieval with LanceDB πŸ”“πŸ§** ==================================================================== Unlock the full potential of Retrieval-Augmented Generation (RAG) with LanceDB, the ultimate solution for efficient vector-based information retrieval πŸ“Š. Input text queries and retrieve relevant documents with lightning-fast speed ⚑️ and accuracy βœ…. Generate comprehensive answers by combining retrieved information, uncovering new insights πŸ” and connections. 
-### Experience the Future of Search πŸ”„ +**Experience the Future of Search πŸ”„** Experience the future of search with RAG, transforming information retrieval and answer generation. Apply RAG to various industries, streamlining processes πŸ“ˆ, saving time ⏰, and resources πŸ’°. Stay ahead of the curve with innovative technology πŸ”, powered by LanceDB. Discover the power of RAG with LanceDB and transform your industry with innovative solutions πŸ’‘. diff --git a/docs/src/examples/python_examples/vector_search.md b/docs/src/examples/python_examples/vector_search.md index 67000d39..a7b91a16 100644 --- a/docs/src/examples/python_examples/vector_search.md +++ b/docs/src/examples/python_examples/vector_search.md @@ -1,10 +1,12 @@ -**πŸ” Vector Search: Unlock Efficient Document Retrieval πŸ”“** +**Vector Search: Unlock Efficient Document Retrieval πŸ”“πŸ‘€** ==================================================================== -**Introduction** - Unlock the power of vector search with LanceDB, a cutting-edge solution for efficient vector-based document retrieval πŸ“Š. Input text queries to find the most relevant documents from your corpus, and discover a new world of possibilities with our inbuilt hybrid search features 🌐. +**Unlock the Future of SearchπŸ”** + +Experience the transformative power of vector search with LanceDB. Discover, analyze, and retrieve documents with unprecedented efficiency and accuracy. πŸ’‘ + | **Vector Search** | **Description** | **Links** | |:-----------------|:---------------|:---------| | **Inbuilt Hybrid Search πŸ”„** | Combine the power of traditional search algorithms with LanceDB's vector-based search for a robust and efficient search experience πŸ“Š | [![Github](../../assets/github.svg)][inbuilt_hybrid_search_github]
[![Open In Collab](../../assets/colab.svg)][inbuilt_hybrid_search_colab] | From 85bb7e54e42efc1d7e7274758e93fd34e0eda477 Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Mon, 19 Aug 2024 07:48:23 +0530 Subject: [PATCH 11/34] docs: missing griffe dependency for mkdocs deployment (#1545) --- docs/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index bc463706..77316605 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,6 +1,7 @@ mkdocs==1.5.3 mkdocs-jupyter==0.24.1 mkdocs-material==9.5.3 -mkdocstrings[python]==0.20.0 +mkdocstrings[python]==0.25.2 +griffe mkdocs-render-swagger-plugin pydantic From 7d65dd97cf713618547ed289d8f1cd56221d0018 Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Wed, 21 Aug 2024 12:26:52 +0530 Subject: [PATCH 12/34] chore(python): update Colbert architecture and minor improvements (#1547) - Update ColBertReranker architecture: The current implementation doesn't use the right arch. This PR uses the implementation in Rerankers library. Fixes https://github.com/lancedb/lancedb/issues/1546 Benchmark diff (hit rate): Hybrid - 91 vs 87 reranked vector - 85 vs 80 - Reranking in FTS is basically disabled in main after last week's FTS updates. I think there's no blocker in supporting that? - Allow overriding accelerators: Most transformer based Rerankers and Embedding automatically select device. This PR allows overriding those settings by passing `device`. Fixes: https://github.com/lancedb/lancedb/issues/1487 --------- Co-authored-by: BubbleCal --- .../python/lancedb/embeddings/instructor.py | 1 + .../python/lancedb/embeddings/transformers.py | 6 +- python/python/lancedb/query.py | 8 ++- python/python/lancedb/rerankers/colbert.py | 63 +++---------------- .../python/lancedb/rerankers/cross_encoder.py | 3 +- python/python/tests/test_rerankers.py | 25 +++++--- 6 files changed, 37 insertions(+), 69 deletions(-) diff --git a/python/python/lancedb/embeddings/instructor.py b/python/python/lancedb/embeddings/instructor.py index 98206bc5..a6022be6 100644 --- a/python/python/lancedb/embeddings/instructor.py +++ b/python/python/lancedb/embeddings/instructor.py @@ -127,6 +127,7 @@ class InstructorEmbeddingFunction(TextEmbeddingFunction): batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, normalize_embeddings=self.normalize_embeddings, + device=self.device, ).tolist() return res diff --git a/python/python/lancedb/embeddings/transformers.py b/python/python/lancedb/embeddings/transformers.py index a20f27ff..dba5b161 100644 --- a/python/python/lancedb/embeddings/transformers.py +++ b/python/python/lancedb/embeddings/transformers.py @@ -44,6 +44,7 @@ class TransformersEmbeddingFunction(EmbeddingFunction): """ name: str = "colbert-ir/colbertv2.0" + device: str = "cpu" _tokenizer: Any = PrivateAttr() _model: Any = PrivateAttr() @@ -53,6 +54,7 @@ class TransformersEmbeddingFunction(EmbeddingFunction): transformers = attempt_import_or_raise("transformers") self._tokenizer = transformers.AutoTokenizer.from_pretrained(self.name) self._model = transformers.AutoModel.from_pretrained(self.name) + self._model.to(self.device) if PYDANTIC_VERSION.major < 2: # Pydantic 1.x compat @@ -75,9 +77,9 @@ class TransformersEmbeddingFunction(EmbeddingFunction): for text in texts: encoding = self._tokenizer( text, return_tensors="pt", padding=True, truncation=True - ) + ).to(self.device) emb = self._model(**encoding).last_hidden_state.mean(dim=1).squeeze() - 
embedding.append(emb.detach().numpy()) + embedding.append(emb.tolist()) return embedding diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py index 874a606a..6c3c71bd 100644 --- a/python/python/lancedb/query.py +++ b/python/python/lancedb/query.py @@ -727,7 +727,10 @@ class LanceFtsQueryBuilder(LanceQueryBuilder): vector=[], ) results = self._table._execute_query(query) - return results.read_all() + results = results.read_all() + if self._reranker is not None: + results = self._reranker.rerank_fts(self._query, results) + return results def tantivy_to_arrow(self) -> pa.Table: try: @@ -825,7 +828,8 @@ class LanceFtsQueryBuilder(LanceQueryBuilder): LanceFtsQueryBuilder The LanceQueryBuilder object. """ - raise NotImplementedError("Reranking is not yet supported for FTS queries.") + self._reranker = reranker + return self class LanceEmptyQueryBuilder(LanceQueryBuilder): diff --git a/python/python/lancedb/rerankers/colbert.py b/python/python/lancedb/rerankers/colbert.py index 77ef58a1..5e8701b3 100644 --- a/python/python/lancedb/rerankers/colbert.py +++ b/python/python/lancedb/rerankers/colbert.py @@ -1,5 +1,3 @@ -from functools import cached_property - import pyarrow as pa from ..util import attempt_import_or_raise @@ -12,7 +10,7 @@ class ColbertReranker(Reranker): Parameters ---------- - model_name : str, default "colbert-ir/colbertv2.0" + model_name : str, default "colbert" (colbert-ir/colbert-v2.0) The name of the cross encoder model to use. column : str, default "text" The name of the column to use as input to the cross encoder model. @@ -22,41 +20,26 @@ class ColbertReranker(Reranker): def __init__( self, - model_name: str = "colbert-ir/colbertv2.0", + model_name: str = "colbert", column: str = "text", return_score="relevance", ): super().__init__(return_score) self.model_name = model_name self.column = column - self.torch = attempt_import_or_raise( - "torch" + rerankers = attempt_import_or_raise( + "rerankers" ) # import here for faster ops later + self.colbert = rerankers.Reranker(self.model_name, model_type="colbert") def _rerank(self, result_set: pa.Table, query: str): docs = result_set[self.column].to_pylist() + doc_ids = list(range(len(docs))) + result = self.colbert.rank(query, docs, doc_ids=doc_ids) - tokenizer, model = self._model + # get the scores of each document in the same order as the input + scores = [result.get_result_by_docid(i).score for i in doc_ids] - # Encode the query - query_encoding = tokenizer(query, return_tensors="pt") - query_embedding = model(**query_encoding).last_hidden_state.mean(dim=1) - scores = [] - # Get score for each document - for document in docs: - document_encoding = tokenizer( - document, return_tensors="pt", truncation=True, max_length=512 - ) - document_embedding = model(**document_encoding).last_hidden_state - # Calculate MaxSim score - score = self.maxsim(query_embedding.unsqueeze(0), document_embedding) - scores.append(score.item()) - - # replace the self.column column with the docs - result_set = result_set.drop(self.column) - result_set = result_set.append_column( - self.column, pa.array(docs, type=pa.string()) - ) # add the scores result_set = result_set.append_column( "_relevance_score", pa.array(scores, type=pa.float32()) @@ -110,31 +93,3 @@ class ColbertReranker(Reranker): result_set = result_set.sort_by([("_relevance_score", "descending")]) return result_set - - @cached_property - def _model(self): - transformers = attempt_import_or_raise("transformers") - tokenizer = 
transformers.AutoTokenizer.from_pretrained(self.model_name) - model = transformers.AutoModel.from_pretrained(self.model_name) - - return tokenizer, model - - def maxsim(self, query_embedding, document_embedding): - # Expand dimensions for broadcasting - # Query: [batch, length, size] -> [batch, query, 1, size] - # Document: [batch, length, size] -> [batch, 1, length, size] - expanded_query = query_embedding.unsqueeze(2) - expanded_doc = document_embedding.unsqueeze(1) - - # Compute cosine similarity across the embedding dimension - sim_matrix = self.torch.nn.functional.cosine_similarity( - expanded_query, expanded_doc, dim=-1 - ) - - # Take the maximum similarity for each query token (across all document tokens) - # sim_matrix shape: [batch_size, query_length, doc_length] - max_sim_scores, _ = self.torch.max(sim_matrix, dim=2) - - # Average these maximum scores across all query tokens - avg_max_sim = self.torch.mean(max_sim_scores, dim=1) - return avg_max_sim diff --git a/python/python/lancedb/rerankers/cross_encoder.py b/python/python/lancedb/rerankers/cross_encoder.py index 88396fc3..05673673 100644 --- a/python/python/lancedb/rerankers/cross_encoder.py +++ b/python/python/lancedb/rerankers/cross_encoder.py @@ -42,7 +42,8 @@ class CrossEncoderReranker(Reranker): @cached_property def model(self): sbert = attempt_import_or_raise("sentence_transformers") - cross_encoder = sbert.CrossEncoder(self.model_name) + # Allows overriding the automatically selected device + cross_encoder = sbert.CrossEncoder(self.model_name, device=self.device) return cross_encoder diff --git a/python/python/tests/test_rerankers.py b/python/python/tests/test_rerankers.py index 2c27b61d..442328d9 100644 --- a/python/python/tests/test_rerankers.py +++ b/python/python/tests/test_rerankers.py @@ -236,33 +236,37 @@ def test_rrf_reranker(tmp_path, use_tantivy): @pytest.mark.skipif( os.environ.get("COHERE_API_KEY") is None, reason="COHERE_API_KEY not set" ) -def test_cohere_reranker(tmp_path): +@pytest.mark.parametrize("use_tantivy", [True, False]) +def test_cohere_reranker(tmp_path, use_tantivy): pytest.importorskip("cohere") reranker = CohereReranker() - table, schema = get_test_table(tmp_path) + table, schema = get_test_table(tmp_path, use_tantivy) _run_test_reranker(reranker, table, "single player experience", None, schema) -def test_cross_encoder_reranker(tmp_path): +@pytest.mark.parametrize("use_tantivy", [True, False]) +def test_cross_encoder_reranker(tmp_path, use_tantivy): pytest.importorskip("sentence_transformers") reranker = CrossEncoderReranker() - table, schema = get_test_table(tmp_path) + table, schema = get_test_table(tmp_path, use_tantivy) _run_test_reranker(reranker, table, "single player experience", None, schema) -def test_colbert_reranker(tmp_path): +@pytest.mark.parametrize("use_tantivy", [True, False]) +def test_colbert_reranker(tmp_path, use_tantivy): pytest.importorskip("transformers") reranker = ColbertReranker() - table, schema = get_test_table(tmp_path) + table, schema = get_test_table(tmp_path, use_tantivy) _run_test_reranker(reranker, table, "single player experience", None, schema) @pytest.mark.skipif( os.environ.get("OPENAI_API_KEY") is None, reason="OPENAI_API_KEY not set" ) -def test_openai_reranker(tmp_path): +@pytest.mark.parametrize("use_tantivy", [True, False]) +def test_openai_reranker(tmp_path, use_tantivy): pytest.importorskip("openai") - table, schema = get_test_table(tmp_path) + table, schema = get_test_table(tmp_path, use_tantivy) reranker = OpenaiReranker() 
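    # _run_test_reranker is the shared harness used by the reranker tests above; it drives the query paths with this reranker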
_run_test_reranker(reranker, table, "single player experience", None, schema) @@ -270,8 +274,9 @@ def test_openai_reranker(tmp_path): @pytest.mark.skipif( os.environ.get("JINA_API_KEY") is None, reason="JINA_API_KEY not set" ) -def test_jina_reranker(tmp_path): +@pytest.mark.parametrize("use_tantivy", [True, False]) +def test_jina_reranker(tmp_path, use_tantivy): pytest.importorskip("jina") - table, schema = get_test_table(tmp_path) + table, schema = get_test_table(tmp_path, use_tantivy) reranker = JinaReranker() _run_test_reranker(reranker, table, "single player experience", None, schema) From 0cbc9cd551414a978688de16424ad2dc2c0af1cf Mon Sep 17 00:00:00 2001 From: Rithik Kumar <46047011+rithikJha@users.noreply.github.com> Date: Wed, 21 Aug 2024 20:37:04 +0530 Subject: [PATCH 13/34] docs: add evaluation example (#1552) before: ![Screenshot 2024-08-21 194228](https://github.com/user-attachments/assets/68d96658-7579-4934-85af-e8c898b64660) After: ![Screenshot 2024-08-21 195258](https://github.com/user-attachments/assets/81ddb9cd-cb93-47fc-a121-ff82701fd11f) --------- Co-authored-by: Ayush Chaurasia --- docs/mkdocs.yml | 2 ++ .../examples/python_examples/evaluations.md | 23 +++++++++++++++++++ .../examples/python_examples/multimodal.md | 4 ++-- docs/src/examples/python_examples/rag.md | 5 ++-- .../examples/python_examples/vector_search.md | 6 ++--- 5 files changed, 32 insertions(+), 8 deletions(-) create mode 100644 docs/src/examples/python_examples/evaluations.md diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 024181fd..d70e89d6 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -147,6 +147,7 @@ nav: - Rag: examples/python_examples/rag.md - Vector Search: examples/python_examples/vector_search.md - Chatbot: examples/python_examples/chatbot.md + - Evaluation: examples/python_examples/evaluations.md - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb @@ -235,6 +236,7 @@ nav: - Rag: examples/python_examples/rag.md - Vector Search: examples/python_examples/vector_search.md - Chatbot: examples/python_examples/chatbot.md + - Evaluation: examples/python_examples/evaluations.md - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb diff --git a/docs/src/examples/python_examples/evaluations.md b/docs/src/examples/python_examples/evaluations.md new file mode 100644 index 00000000..9ee1a10a --- /dev/null +++ b/docs/src/examples/python_examples/evaluations.md @@ -0,0 +1,23 @@ +**Evaluation: Assessing Text Performance with Precision πŸ“ŠπŸ’‘** +==================================================================== + +**Evaluation Fundamentals πŸ“Š** + +Evaluation is a comprehensive tool designed to measure the performance of text-based inputs, enabling data-driven optimization and improvement πŸ“ˆ. + +**Text Evaluation 101 πŸ“š** + +By leveraging cutting-edge technologies, this provides a robust framework for evaluating reference and candidate texts across various metrics πŸ“Š, ensuring high-quality text outputs that meet specific requirements and standards πŸ“. + +| **Evaluation** | **Description** | **Links** | +| -------------- | --------------- | --------- | +| **Evaluating Prompts with Prompttools πŸ€–** | Compare, visualize & evaluate embedding functions (incl. OpenAI) across metrics like latency & custom evaluation πŸ“ˆπŸ“Š | [![Github](../../assets/github.svg)][prompttools_github]
[![Open In Collab](../../assets/colab.svg)][prompttools_colab] | +| **Evaluating RAG with RAGAs and GPT-4o πŸ“Š** | Evaluate RAG pipelines with cutting-edge metrics and tools, integrate with CI/CD for continuous performance checks, and generate responses with GPT-4o πŸ€–πŸ“ˆ | [![Github](../../assets/github.svg)][RAGAs_github]
[![Open In Collab](../../assets/colab.svg)][RAGAs_colab] | + + + +[prompttools_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/prompttools-eval-prompts +[prompttools_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/prompttools-eval-prompts/main.ipynb + +[RAGAs_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Evaluating_RAG_with_RAGAs +[RAGAs_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Evaluating_RAG_with_RAGAs/Evaluating_RAG_with_RAGAs.ipynb diff --git a/docs/src/examples/python_examples/multimodal.md b/docs/src/examples/python_examples/multimodal.md index 376d7864..28ddce00 100644 --- a/docs/src/examples/python_examples/multimodal.md +++ b/docs/src/examples/python_examples/multimodal.md @@ -1,10 +1,10 @@ # **Multimodal Search with LanceDB πŸ€Ήβ€β™‚οΈπŸ”** -Experience the future of search with LanceDB's multimodal capabilities. Combine text and image queries to find the most relevant results in your corpus and unlock new possibilities! πŸ”“πŸ’‘ +Experience the future of search with LanceDB's multimodal capabilities. Combine text and image queries to find the most relevant results in your corpus ! πŸ”“πŸ’‘ **Explore the Future of Search πŸš€** -Unlock the power of multimodal search with LanceDB, enabling efficient vector-based retrieval of text and image data! πŸ“ŠπŸ’» +LanceDB supports multimodal search by indexing and querying vector representations of text and image data πŸ€–. This enables efficient retrieval of relevant documents and images using vector-based similarity search πŸ“Š. The platform facilitates cross-modal search, allowing for text-image and image-text retrieval, and supports scalable indexing of high-dimensional vector spaces πŸ’». diff --git a/docs/src/examples/python_examples/rag.md b/docs/src/examples/python_examples/rag.md index 02339515..48a6411f 100644 --- a/docs/src/examples/python_examples/rag.md +++ b/docs/src/examples/python_examples/rag.md @@ -2,12 +2,11 @@ **RAG: Revolutionize Information Retrieval with LanceDB πŸ”“πŸ§** ==================================================================== -Unlock the full potential of Retrieval-Augmented Generation (RAG) with LanceDB, the ultimate solution for efficient vector-based information retrieval πŸ“Š. Input text queries and retrieve relevant documents with lightning-fast speed ⚑️ and accuracy βœ…. Generate comprehensive answers by combining retrieved information, uncovering new insights πŸ” and connections. +Unlock the full potential of Retrieval-Augmented Generation (RAG) with LanceDB, a solution for efficient vector-based information retrieval πŸ“Š. **Experience the Future of Search πŸ”„** -Experience the future of search with RAG, transforming information retrieval and answer generation. Apply RAG to various industries, streamlining processes πŸ“ˆ, saving time ⏰, and resources πŸ’°. Stay ahead of the curve with innovative technology πŸ”, powered by LanceDB. Discover the power of RAG with LanceDB and transform your industry with innovative solutions πŸ’‘. - +RAG integrates large language models (LLMs) with scalable knowledge bases, enabling efficient information retrieval and answer generation πŸ€–. By applying RAG to industry-specific use cases, developers can optimize query processing πŸ“Š, reduce response latency ⏱️, and improve resource utilization πŸ’». 
LanceDB provides a robust framework for integrating LLMs with external knowledge sources, facilitating accurate and informative responses πŸ“. | **RAG** | **Description** | **Links** | |----------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------| diff --git a/docs/src/examples/python_examples/vector_search.md b/docs/src/examples/python_examples/vector_search.md index a7b91a16..d0713ef2 100644 --- a/docs/src/examples/python_examples/vector_search.md +++ b/docs/src/examples/python_examples/vector_search.md @@ -1,11 +1,11 @@ **Vector Search: Unlock Efficient Document Retrieval πŸ”“πŸ‘€** ==================================================================== -Unlock the power of vector search with LanceDB, a cutting-edge solution for efficient vector-based document retrieval πŸ“Š. Input text queries to find the most relevant documents from your corpus, and discover a new world of possibilities with our inbuilt hybrid search features 🌐. +Unlock the power of vector search with LanceDB, a cutting-edge solution for efficient vector-based document retrieval πŸ“Š. -**Unlock the Future of SearchπŸ”** +**Vector Search Capabilities in LanceDBπŸ”** -Experience the transformative power of vector search with LanceDB. Discover, analyze, and retrieve documents with unprecedented efficiency and accuracy. πŸ’‘ +LanceDB implements vector search algorithms for efficient document retrieval and analysis πŸ“Š. This enables fast and accurate discovery of relevant documents, leveraging dense vector representations πŸ€–. The platform supports scalable indexing and querying of high-dimensional vector spaces, facilitating precise document matching and retrieval πŸ“ˆ. 
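+
+As a minimal sketch of what such a query can look like with the Python client (the table name and vectors below are illustrative, not taken from the examples in this page):
+
+```python
+import lancedb
+
+db = lancedb.connect("./db")
+table = db.open_table("documents")  # assumes a table with a "vector" column
+
+# top-5 nearest neighbours of a query embedding; the query vector's length
+# must match the dimension of the table's vector column
+results = table.search([0.1, 0.2, 0.3, 0.4]).limit(5).to_pandas()
+```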
| **Vector Search** | **Description** | **Links** | |:-----------------|:---------------|:---------| From 758c82858fb80eaae388561de36425e2c62b3275 Mon Sep 17 00:00:00 2001 From: Rithik Kumar <46047011+rithikJha@users.noreply.github.com> Date: Thu, 22 Aug 2024 00:54:05 +0530 Subject: [PATCH 14/34] docs: add AI agent example (#1553) before: ![Screenshot 2024-08-21 225014](https://github.com/user-attachments/assets/e5b05586-87c5-4739-a4df-2d6cd0704ba5) After: ![Screenshot 2024-08-21 225029](https://github.com/user-attachments/assets/504959db-f560-49b2-9492-557e9846a793) --------- Co-authored-by: Ayush Chaurasia --- docs/mkdocs.yml | 2 ++ docs/src/examples/python_examples/aiagent.md | 27 ++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 docs/src/examples/python_examples/aiagent.md diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index d70e89d6..5c1413a8 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -148,6 +148,7 @@ nav: - Vector Search: examples/python_examples/vector_search.md - Chatbot: examples/python_examples/chatbot.md - Evaluation: examples/python_examples/evaluations.md + - AI Agent: examples/python_examples/aiagent.md - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb @@ -237,6 +238,7 @@ nav: - Vector Search: examples/python_examples/vector_search.md - Chatbot: examples/python_examples/chatbot.md - Evaluation: examples/python_examples/evaluations.md + - AI Agent: examples/python_examples/aiagent.md - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb diff --git a/docs/src/examples/python_examples/aiagent.md b/docs/src/examples/python_examples/aiagent.md new file mode 100644 index 00000000..12b624ae --- /dev/null +++ b/docs/src/examples/python_examples/aiagent.md @@ -0,0 +1,27 @@ +# AI Agents: Intelligent CollaborationπŸ€– + +Think of a platformπŸ’» where AI AgentsπŸ€– can seamlessly exchange information, coordinate over tasks, and achieve shared targets with great efficiencyπŸ“ˆπŸš€. + +## Vector-Based Coordination: The Technical Advantage +Leveraging LanceDB's vector-based capabilities, our coordination application enables AI agents to communicate and collaborate through dense vector representations πŸ€–. AI agents can exchange information, coordinate on a task or work towards a common goal, just by giving queriesπŸ“. + +| **AI Agents** | **Description** | **Links** | +|:--------------|:----------------|:----------| +| **AI Agents: Reducing HallucinationtπŸ“Š** | πŸ€–πŸ’‘ Reduce AI hallucinations using Critique-Based Contexting! Learn by Simplifying and Automating tedious workflows by going through fitness trainer agent example.πŸ’ͺ | [![Github](../../assets/github.svg)][hullucination_github]
[![Open In Collab](../../assets/colab.svg)][hullucination_colab]
[![Python](../../assets/python.svg)][hullucination_python]
[![Ghost](../../assets/ghost.svg)][hullucination_ghost] | +| **AI Trends Searcher: CrewAIπŸ”οΈ** | πŸ”οΈ Learn about CrewAI Agents ! Utilize the features of CrewAI - Role-based Agents, Task Management, and Inter-agent Delegation ! Make AI agents work together to do tricky stuff 😺| [![Github](../../assets/github.svg)][trend_github]
[![Open In Collab](../../assets/colab.svg)][trend_colab]
[![Ghost](../../assets/ghost.svg)][trend_ghost] | +| **SuperAgent AutogenπŸ€–** | πŸ’» AI interactions with the Super Agent! Integrating Autogen, LanceDB, LangChain, LiteLLM, and Ollama to create AI agent that excels in understanding and processing complex queries.πŸ€– | [![Github](../../assets/github.svg)][superagent_github]
[![Open In Collab](../../assets/colab.svg)][superagent_colab] | + + +[hullucination_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/reducing_hallucinations_ai_agents +[hullucination_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/reducing_hallucinations_ai_agents/main.ipynb +[hullucination_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/reducing_hallucinations_ai_agents/main.py +[hullucination_ghost]: https://blog.lancedb.com/how-to-reduce-hallucinations-from-llm-powered-agents-using-long-term-memory-72f262c3cc1f/ + +[trend_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/AI-Trends-with-CrewAI +[trend_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/AI-Trends-with-CrewAI/CrewAI_AI_Trends.ipynb +[trend_ghost]: https://blog.lancedb.com/track-ai-trends-crewai-agents-rag/ + +[superagent_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/SuperAgent_Autogen +[superagent_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/SuperAgent_Autogen/main.ipynb + + From 6eb7ccfdee8e3f29f20e6d761d6a44f1d4146bfe Mon Sep 17 00:00:00 2001 From: Gagan Bhullar Date: Thu, 22 Aug 2024 00:16:36 -0600 Subject: [PATCH 15/34] fix: rerank attribute unknown (#1554) PR fixes #1550 --- python/python/lancedb/query.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py index 6c3c71bd..271028d9 100644 --- a/python/python/lancedb/query.py +++ b/python/python/lancedb/query.py @@ -457,6 +457,22 @@ class LanceQueryBuilder(ABC): }, ).explain_plan(verbose) + @abstractmethod + def rerank(self, reranker: Reranker) -> LanceQueryBuilder: + """Rerank the results using the specified reranker. + + Parameters + ---------- + reranker: Reranker + The reranker to use. + + Returns + ------- + + The LanceQueryBuilder object. + """ + raise NotImplementedError + class LanceVectorQueryBuilder(LanceQueryBuilder): """ @@ -841,6 +857,21 @@ class LanceEmptyQueryBuilder(LanceQueryBuilder): limit=self._limit, ) + def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder: + """Rerank the results using the specified reranker. + + Parameters + ---------- + reranker: Reranker + The reranker to use. + + Returns + ------- + LanceEmptyQueryBuilder + The LanceQueryBuilder object. 
+ """ + raise NotImplementedError("Reranking is not yet supported.") + class LanceHybridQueryBuilder(LanceQueryBuilder): """ From 6ad5553ecadb4e0bcebf493a4065dff5b383b2b6 Mon Sep 17 00:00:00 2001 From: rahuljo Date: Thu, 22 Aug 2024 11:48:49 +0200 Subject: [PATCH 16/34] docs: add dlt-lancedb integration page (#1551) Co-authored-by: Akela Drissner-Schmid <32450038+akelad@users.noreply.github.com> --- docs/mkdocs.yml | 2 + docs/src/integrations/dlt.md | 142 +++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 docs/src/integrations/dlt.md diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 5c1413a8..5588c497 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -138,6 +138,7 @@ nav: - Pydantic: python/pydantic.md - Voxel51: integrations/voxel51.md - PromptTools: integrations/prompttools.md + - dlt: integrations/dlt.md - 🎯 Examples: - Overview: examples/index.md - 🐍 Python: @@ -228,6 +229,7 @@ nav: - Pydantic: python/pydantic.md - Voxel51: integrations/voxel51.md - PromptTools: integrations/prompttools.md + - dlt: integrations/dlt.md - Examples: - examples/index.md - 🐍 Python: diff --git a/docs/src/integrations/dlt.md b/docs/src/integrations/dlt.md new file mode 100644 index 00000000..009aa9d9 --- /dev/null +++ b/docs/src/integrations/dlt.md @@ -0,0 +1,142 @@ +# dlt + +[dlt](https://dlthub.com/docs/intro) is an open-source library that you can add to your Python scripts to load data from various and often messy data sources into well-structured, live datasets. dlt's [integration with LanceDB](https://dlthub.com/docs/dlt-ecosystem/destinations/lancedb) lets you ingest data from any source (databases, APIs, CSVs, dataframes, JSONs, and more) into LanceDB with a few lines of simple python code. The integration enables automatic normalization of nested data, schema inference, incremental loading and embedding the data. dlt also has integrations with several other tools like dbt, airflow, dagster etc. that can be inserted into your LanceDB workflow. + +## How to ingest data into LanceDB + +In this example, we will be fetching movie information from the [Open Movie Database (OMDb) API](https://www.omdbapi.com/) and loading it into a local LanceDB instance. To implement it, you will need an API key for the OMDb API (which can be created freely [here](https://www.omdbapi.com/apikey.aspx)). + +1. **Install `dlt` with LanceDB extras:** + ```sh + pip install dlt[lancedb] + ``` + +2. **Inside an empty directory, initialize a `dlt` project with:** + ```sh + dlt init rest_api lancedb + ``` + This will add all the files necessary to create a `dlt` pipeline that can ingest data from any REST API (ex: OMDb API) and load into LanceDB. + ```text + β”œβ”€β”€ .dlt + β”‚ β”œβ”€β”€ config.toml + β”‚ └── secrets.toml + β”œβ”€β”€ rest_api + β”œβ”€β”€ rest_api_pipeline.py + └── requirements.txt + ``` + + dlt has a list of pre-built [sources](https://dlthub.com/docs/dlt-ecosystem/verified-sources/) like [SQL databases](https://dlthub.com/docs/dlt-ecosystem/verified-sources/sql_database), [REST APIs](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api), [Google Sheets](https://dlthub.com/docs/dlt-ecosystem/verified-sources/google_sheets), [Notion](https://dlthub.com/docs/dlt-ecosystem/verified-sources/notion) etc., that can be used out-of-the-box by running `dlt init lancedb`. Since dlt is a python library, it is also very easy to modify these pre-built sources or to write your own custom source from scratch. + + +3. 
**Specify necessary credentials and/or embedding model details:**
+
+    In order to fetch data from the OMDb API, you will need to pass a valid API key into your pipeline. Depending on whether you're using LanceDB OSS or LanceDB Cloud, you also may need to provide the necessary credentials to connect to the LanceDB instance. These can be pasted inside `.dlt/secrets.toml`.
+
+    dlt's LanceDB integration also allows you to automatically embed the data during ingestion. Depending on the embedding model chosen, you may need to paste the necessary credentials inside `.dlt/secrets.toml`:
+    ```toml
+    [sources.rest_api]
+    api_key = "api_key" # Enter the API key for the OMDb API
+
+    [destination.lancedb]
+    embedding_model_provider = "sentence-transformers"
+    embedding_model = "all-MiniLM-L6-v2"
+    [destination.lancedb.credentials]
+    uri = ".lancedb"
+    api_key = "api_key" # API key to connect to LanceDB Cloud. Leave out if you are using LanceDB OSS.
+    embedding_model_provider_api_key = "embedding_model_provider_api_key" # Not needed for providers that don't need authentication (ollama, sentence-transformers).
+    ```
+    See [here](https://dlthub.com/docs/dlt-ecosystem/destinations/lancedb#configure-the-destination) for more information and for a list of available models and model providers.
+
+
+4. **Write the pipeline code inside `rest_api_pipeline.py`:**
+
+    The following code shows how you can configure dlt's REST API source to connect to the [OMDb API](https://www.omdbapi.com/), fetch all movies with the word "godzilla" in the title, and load it into a LanceDB table. The REST API source allows you to pull data from any API with minimal code; to learn more, read the [dlt docs](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api).
+
+    ```python
+
+    # Import necessary modules
+    import dlt
+    from rest_api import rest_api_source
+
+    # Configure the REST API source
+    movies_source = rest_api_source(
+        {
+            "client": {
+                "base_url": "https://www.omdbapi.com/",
+                "auth": { # authentication strategy for the OMDb API
+                    "type": "api_key",
+                    "name": "apikey",
+                    "api_key": dlt.secrets["sources.rest_api.api_key"], # read API credentials directly from secrets.toml
+                    "location": "query"
+                },
+                "paginator": { # pagination strategy for the OMDb API
+                    "type": "page_number",
+                    "base_page": 1,
+                    "total_path": "totalResults",
+                    "maximum_page": 5
+                }
+            },
+            "resources": [ # list of API endpoints to request
+                {
+                    "name": "movie_search",
+                    "endpoint": {
+                        "path": "/",
+                        "params": {
+                            "s": "godzilla",
+                            "type": "movie"
+                        }
+                    }
+                }
+            ]
+        })
+
+
+    if __name__ == "__main__":
+        # Create a pipeline object
+        pipeline = dlt.pipeline(
+            pipeline_name='movies_pipeline',
+            destination='lancedb', # this tells dlt to load the data into LanceDB
+            dataset_name='movies_data_pipeline',
+        )
+
+        # Run the pipeline
+        load_info = pipeline.run(movies_source)
+
+        # pretty print the information on data that was loaded
+        print(load_info)
+    ```
+
+    The script above will ingest the data into LanceDB as it is, i.e. without creating any embeddings.
If we want to embed one of the fields (for example, `"Title"`, which contains the movie titles), then we will use dlt's `lancedb_adapter` and modify the script as follows:
+
+    - Add the following import statement:
+    ```python
+    from dlt.destinations.adapters import lancedb_adapter
+    ```
+    - Modify the pipeline run like this:
+    ```python
+    load_info = pipeline.run(
+        lancedb_adapter(
+            movies_source,
+            embed="Title",
+        )
+    )
+    ```
+    This will use the embedding model specified inside `.dlt/secrets.toml` to embed the field `"Title"`.
+
+5. **Install necessary dependencies:**
+    ```sh
+    pip install -r requirements.txt
+    ```
+
+    Note: You may need to install the dependencies for your embedding models separately.
+    ```sh
+    pip install sentence-transformers
+    ```
+
+6. **Run the pipeline:**
+    Finally, running the following command will ingest the data into your LanceDB instance.
+    ```sh
+    python rest_api_pipeline.py
+    ```
+
+For more information and advanced usage of dlt's LanceDB integration, read [the dlt documentation](https://dlthub.com/docs/dlt-ecosystem/destinations/lancedb).

From 89bcc1b2e78b229323e446f46cbd04485131750f Mon Sep 17 00:00:00 2001
From: Lance Release
Date: Fri, 23 Aug 2024 13:56:30 +0000
Subject: [PATCH 17/34] =?UTF-8?q?Bump=20version:=200.13.0-beta.0=20?=
 =?UTF-8?q?=E2=86=92=200.13.0-beta.1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 python/.bumpversion.toml | 2 +-
 python/Cargo.toml        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/.bumpversion.toml b/python/.bumpversion.toml
index c4dab165..862be0a8 100644
--- a/python/.bumpversion.toml
+++ b/python/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.13.0-beta.0"
+current_version = "0.13.0-beta.1"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/python/Cargo.toml b/python/Cargo.toml
index 189f36ca..968e0109 100644
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.13.0-beta.0"
+version = "0.13.0-beta.1"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true

From a9d0625e2bf05ce4b5b01b6dd2aa05b547e845a2 Mon Sep 17 00:00:00 2001
From: Lance Release
Date: Fri, 23 Aug 2024 13:56:34 +0000
Subject: [PATCH 18/34] =?UTF-8?q?Bump=20version:=200.10.0-beta.0=20?=
 =?UTF-8?q?=E2=86=92=200.10.0-beta.1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.toml                       | 2 +-
 node/package.json                       | 2 +-
 nodejs/npm/darwin-arm64/package.json    | 2 +-
 nodejs/npm/darwin-x64/package.json      | 2 +-
 nodejs/npm/linux-arm64-gnu/package.json | 2 +-
 nodejs/npm/linux-x64-gnu/package.json   | 2 +-
 nodejs/npm/win32-x64-msvc/package.json  | 2 +-
 nodejs/package.json                     | 2 +-
 rust/ffi/node/Cargo.toml                | 2 +-
 rust/lancedb/Cargo.toml                 | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.bumpversion.toml b/.bumpversion.toml
index 810b1fbe..817a1852 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.10.0-beta.0"
+current_version = "0.10.0-beta.1"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/node/package.json b/node/package.json index 73855acc..cd8f55eb 100644 --- a/node/package.json +++ b/node/package.json @@ -1,6 +1,6 @@ { "name": "vectordb", - "version": "0.10.0-beta.0", + "version": "0.10.0-beta.1", "description": " Serverless, low-latency vector database for AI applications", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/nodejs/npm/darwin-arm64/package.json b/nodejs/npm/darwin-arm64/package.json index e5e7ab0d..83c76de4 100644 --- a/nodejs/npm/darwin-arm64/package.json +++ b/nodejs/npm/darwin-arm64/package.json @@ -1,6 +1,6 @@ { "name": "@lancedb/lancedb-darwin-arm64", - "version": "0.10.0-beta.0", + "version": "0.10.0-beta.1", "os": ["darwin"], "cpu": ["arm64"], "main": "lancedb.darwin-arm64.node", diff --git a/nodejs/npm/darwin-x64/package.json b/nodejs/npm/darwin-x64/package.json index 733d7a28..9595ea4d 100644 --- a/nodejs/npm/darwin-x64/package.json +++ b/nodejs/npm/darwin-x64/package.json @@ -1,6 +1,6 @@ { "name": "@lancedb/lancedb-darwin-x64", - "version": "0.10.0-beta.0", + "version": "0.10.0-beta.1", "os": ["darwin"], "cpu": ["x64"], "main": "lancedb.darwin-x64.node", diff --git a/nodejs/npm/linux-arm64-gnu/package.json b/nodejs/npm/linux-arm64-gnu/package.json index 335b1753..b991d6f0 100644 --- a/nodejs/npm/linux-arm64-gnu/package.json +++ b/nodejs/npm/linux-arm64-gnu/package.json @@ -1,6 +1,6 @@ { "name": "@lancedb/lancedb-linux-arm64-gnu", - "version": "0.10.0-beta.0", + "version": "0.10.0-beta.1", "os": ["linux"], "cpu": ["arm64"], "main": "lancedb.linux-arm64-gnu.node", diff --git a/nodejs/npm/linux-x64-gnu/package.json b/nodejs/npm/linux-x64-gnu/package.json index 987723d6..fc55a0dc 100644 --- a/nodejs/npm/linux-x64-gnu/package.json +++ b/nodejs/npm/linux-x64-gnu/package.json @@ -1,6 +1,6 @@ { "name": "@lancedb/lancedb-linux-x64-gnu", - "version": "0.10.0-beta.0", + "version": "0.10.0-beta.1", "os": ["linux"], "cpu": ["x64"], "main": "lancedb.linux-x64-gnu.node", diff --git a/nodejs/npm/win32-x64-msvc/package.json b/nodejs/npm/win32-x64-msvc/package.json index 92682431..21941812 100644 --- a/nodejs/npm/win32-x64-msvc/package.json +++ b/nodejs/npm/win32-x64-msvc/package.json @@ -1,6 +1,6 @@ { "name": "@lancedb/lancedb-win32-x64-msvc", - "version": "0.10.0-beta.0", + "version": "0.10.0-beta.1", "os": ["win32"], "cpu": ["x64"], "main": "lancedb.win32-x64-msvc.node", diff --git a/nodejs/package.json b/nodejs/package.json index 0ef1e711..5b483f34 100644 --- a/nodejs/package.json +++ b/nodejs/package.json @@ -10,7 +10,7 @@ "vector database", "ann" ], - "version": "0.10.0-beta.0", + "version": "0.10.0-beta.1", "main": "dist/index.js", "exports": { ".": "./dist/index.js", diff --git a/rust/ffi/node/Cargo.toml b/rust/ffi/node/Cargo.toml index 42a41f76..5654c963 100644 --- a/rust/ffi/node/Cargo.toml +++ b/rust/ffi/node/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lancedb-node" -version = "0.10.0-beta.0" +version = "0.10.0-beta.1" description = "Serverless, low-latency vector database for AI applications" license.workspace = true edition.workspace = true diff --git a/rust/lancedb/Cargo.toml b/rust/lancedb/Cargo.toml index 893c8224..c547b882 100644 --- a/rust/lancedb/Cargo.toml +++ b/rust/lancedb/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lancedb" -version = "0.10.0-beta.0" +version = "0.10.0-beta.1" edition.workspace = true description = "LanceDB: A serverless, low-latency vector database for AI applications" license.workspace = true From 02d85a4ea4fbad15fad3d973820a4a83e6b71edd Mon Sep 17 00:00:00 2001 From: Lance Release Date: 
Fri, 23 Aug 2024 13:56:54 +0000 Subject: [PATCH 19/34] Updating package-lock.json --- node/package-lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node/package-lock.json b/node/package-lock.json index dbf6225a..7d579bad 100644 --- a/node/package-lock.json +++ b/node/package-lock.json @@ -1,12 +1,12 @@ { "name": "vectordb", - "version": "0.10.0-beta.0", + "version": "0.10.0-beta.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "vectordb", - "version": "0.10.0-beta.0", + "version": "0.10.0-beta.1", "cpu": [ "x64", "arm64" From 632007d0e2468a503b43acb8b30a41ad7db5c450 Mon Sep 17 00:00:00 2001 From: Rithik Kumar <46047011+rithikJha@users.noreply.github.com> Date: Sun, 25 Aug 2024 12:30:30 +0530 Subject: [PATCH 20/34] docs: add recommender system example (#1561) before: ![Screenshot 2024-08-24 230216](https://github.com/user-attachments/assets/cc8a810a-b032-45d7-b086-b2ef0720dc16) After: ![Screenshot 2024-08-24 230228](https://github.com/user-attachments/assets/eaa1dc31-ac7f-4b81-aa79-b4cf94f0cbd5) --------- Co-authored-by: Ayush Chaurasia --- .github/workflows/docs_test.yml | 12 +++++- docs/mkdocs.yml | 2 + docs/src/examples/python_examples/rag.md | 4 +- .../python_examples/recommendersystem.md | 37 +++++++++++++++++++ .../examples/python_examples/vector_search.md | 2 +- 5 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 docs/src/examples/python_examples/recommendersystem.md diff --git a/.github/workflows/docs_test.yml b/.github/workflows/docs_test.yml index 6bfea4cf..cde9dc19 100644 --- a/.github/workflows/docs_test.yml +++ b/.github/workflows/docs_test.yml @@ -30,9 +30,13 @@ jobs: uses: actions/checkout@v4 - name: Print CPU capabilities run: cat /proc/cpuinfo + - name: Install protobuf + run: | + sudo apt update + sudo apt install -y protobuf-compiler - name: Install dependecies needed for ubuntu run: | - sudo apt install -y protobuf-compiler libssl-dev + sudo apt install -y libssl-dev rustup update && rustup default - name: Set up Python uses: actions/setup-python@v5 @@ -72,9 +76,13 @@ jobs: uses: actions/setup-node@v4 with: node-version: 20 + - name: Install protobuf + run: | + sudo apt update + sudo apt install -y protobuf-compiler - name: Install dependecies needed for ubuntu run: | - sudo apt install -y protobuf-compiler libssl-dev + sudo apt install -y libssl-dev rustup update && rustup default - name: Rust cache uses: swatinem/rust-cache@v2 diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 5588c497..387db5c3 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -150,6 +150,7 @@ nav: - Chatbot: examples/python_examples/chatbot.md - Evaluation: examples/python_examples/evaluations.md - AI Agent: examples/python_examples/aiagent.md + - Recommender System: examples/python_examples/recommendersystem.md - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb @@ -241,6 +242,7 @@ nav: - Chatbot: examples/python_examples/chatbot.md - Evaluation: examples/python_examples/evaluations.md - AI Agent: examples/python_examples/aiagent.md + - Recommender System: examples/python_examples/recommendersystem.md - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb diff --git a/docs/src/examples/python_examples/rag.md b/docs/src/examples/python_examples/rag.md index 48a6411f..3d9f89fa 100644 --- 
a/docs/src/examples/python_examples/rag.md +++ b/docs/src/examples/python_examples/rag.md @@ -2,11 +2,11 @@ **RAG: Revolutionize Information Retrieval with LanceDB πŸ”“πŸ§** ==================================================================== -Unlock the full potential of Retrieval-Augmented Generation (RAG) with LanceDB, a solution for efficient vector-based information retrieval πŸ“Š. +Build RAG (Retrieval-Augmented Generation) with LanceDB, a powerful solution for efficient vector-based information retrieval πŸ“Š. **Experience the Future of Search πŸ”„** -RAG integrates large language models (LLMs) with scalable knowledge bases, enabling efficient information retrieval and answer generation πŸ€–. By applying RAG to industry-specific use cases, developers can optimize query processing πŸ“Š, reduce response latency ⏱️, and improve resource utilization πŸ’». LanceDB provides a robust framework for integrating LLMs with external knowledge sources, facilitating accurate and informative responses πŸ“. +πŸ€– RAG enables AI to **retrieve** relevant information from external sources and use it to **generate** more accurate and context-specific responses. πŸ’» LanceDB provides a robust framework for integrating LLMs with external knowledge sources πŸ“. | **RAG** | **Description** | **Links** | |----------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------| diff --git a/docs/src/examples/python_examples/recommendersystem.md b/docs/src/examples/python_examples/recommendersystem.md new file mode 100644 index 00000000..ab7e4064 --- /dev/null +++ b/docs/src/examples/python_examples/recommendersystem.md @@ -0,0 +1,37 @@ +**Recommender Systems: Personalized DiscoveryπŸΏπŸ“Ί** +============================================================== +Deliver personalized experiences with Recommender Systems. 🎁 + +**Technical OverviewπŸ“œ** + +πŸ”οΈ LanceDB's powerful vector database capabilities can efficiently store and query item embeddings. Recommender Systems can utilize it and provide personalized recommendations based on user preferences 🀝 and item features πŸ“Š and therefore enhance the user experience.πŸ—‚οΈ + +| **Recommender System** | **Description** | **Links** | +| ---------------------- | --------------- | --------- | +| **Movie Recommender System🎬** | 🀝 Use **collaborative filtering** to predict user preferences, assuming similar users will like similar movies, and leverage **Singular Value Decomposition** (SVD) from Numpy for precise matrix factorization and accurate recommendationsπŸ“Š | [![Github](../../assets/github.svg)][movie_github]
[![Open In Collab](../../assets/colab.svg)][movie_colab]
[![Python](../../assets/python.svg)][movie_python] |
+| **πŸŽ₯ Movie Recommendation with Genres** | πŸ” Creates movie embeddings using Doc2Vec, capturing genre and characteristic nuances, and leverages LanceDB for efficient storage and querying, enabling accurate genre classification and personalized movie recommendations through similarity searchesπŸŽ₯ | [![Github](../../assets/github.svg)][genre_github]<br>
[![Open In Collab](../../assets/colab.svg)][genre_colab]
[![Ghost](../../assets/ghost.svg)][genre_ghost] |
+| **πŸ›οΈ Product Recommender using Collaborative Filtering and LanceDB** | πŸ“ˆ Uses **Collaborative Filtering** and **LanceDB** to analyze a user's past purchases and recommend new products. Demonstrated with the Instacart dataset in our exampleπŸ›’ | [![Github](../../assets/github.svg)][product_github]<br>
[![Open In Collab](../../assets/colab.svg)][product_colab]
[![Python](../../assets/python.svg)][product_python] |
+| **πŸ” Arxiv Search with OpenCLIP and LanceDB** | πŸ’‘ Builds a semantic search engine for Arxiv papers using LanceDB and benchmarks its performance against traditional keyword-based search on Nomic's Atlas, demonstrating the power of semantic search in finding relevant research papersπŸ“š | [![Github](../../assets/github.svg)][arxiv_github]<br>
[![Open In Collab](../../assets/colab.svg)][arxiv_colab]
[![Python](../../assets/python.svg)][arxiv_python] |
+| **Food Recommendation System🍴** | πŸ” Builds a food recommendation system with LanceDB, featuring vector-based recommendations, full-text search, hybrid search, and reranking model integration for personalized and accurate food suggestionsπŸ‘Œ | [![Github](../../assets/github.svg)][food_github]<br>
[![Open In Collab](../../assets/colab.svg)][food_colab] |
+
+[movie_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/movie-recommender
+[movie_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/movie-recommender/main.ipynb
+[movie_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/movie-recommender/main.py
+
+
+[genre_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/movie-recommendation-with-genres
+[genre_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/movie-recommendation-with-genres/movie_recommendation_with_doc2vec_and_lancedb.ipynb
+[genre_ghost]: https://blog.lancedb.com/movie-recommendation-system-using-lancedb-and-doc2vec/
+
+[product_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/product-recommender
+[product_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/product-recommender/main.ipynb
+[product_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/product-recommender/main.py
+
+
+[arxiv_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/arxiv-recommender
+[arxiv_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/arxiv-recommender/main.ipynb
+[arxiv_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/arxiv-recommender/main.py
+
+
+[food_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Food_recommendation
+[food_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Food_recommendation/main.ipynb
diff --git a/docs/src/examples/python_examples/vector_search.md b/docs/src/examples/python_examples/vector_search.md
index d0713ef2..7182eb09 100644
--- a/docs/src/examples/python_examples/vector_search.md
+++ b/docs/src/examples/python_examples/vector_search.md
@@ -1,7 +1,7 @@
 **Vector Search: Unlock Efficient Document Retrieval πŸ”“πŸ‘€**
 ====================================================================
 
-Unlock the power of vector search with LanceDB, a cutting-edge solution for efficient vector-based document retrieval πŸ“Š.
+Vector search with LanceDB is a solution for efficient and accurate similarity searches in large datasets πŸ“Š.
 
 **Vector Search Capabilities in LanceDBπŸ”**
 
From 549ca51a8aa7040ff555dc67228f05f82ebc8202 Mon Sep 17 00:00:00 2001
From: Ayush Chaurasia
Date: Mon, 26 Aug 2024 13:25:10 +0530
Subject: [PATCH 21/34] feat: add answerdotai rerankers support and minor improvements (#1560)

This PR:
- Adds missing license headers
- Integrates with answerdotai Rerankers package
- Updates ColbertReranker to subclass answerdotai package.
This is done to keep backwards compatibility as some users might be used to importing ColbertReranker directly
- Set `trust_remote_code` to `True` by default in CrossEncoder and sentence-transformer based rerankers
---
 docs/src/reranking/answerdotai.md             | 74 ++++++++++++++
 .../embeddings/sentence_transformers.py       | 13 ++-
 .../python/lancedb/embeddings/transformers.py |  4 +
 python/python/lancedb/rerankers/__init__.py   |  2 +
 .../python/lancedb/rerankers/answerdotai.py   | 99 +++++++++++++++++++
 python/python/lancedb/rerankers/base.py       | 13 +++
 python/python/lancedb/rerankers/cohere.py     | 13 +++
 python/python/lancedb/rerankers/colbert.py    | 93 ++++-------------
 .../python/lancedb/rerankers/cross_encoder.py | 26 ++++-
 python/python/lancedb/rerankers/jinaai.py     | 13 +++
 .../lancedb/rerankers/linear_combination.py   | 13 +++
 python/python/lancedb/rerankers/openai.py     | 13 +++
 python/python/lancedb/rerankers/rrf.py        | 13 +++
 python/python/tests/test_rerankers.py         | 11 ++-
 14 files changed, 324 insertions(+), 76 deletions(-)
 create mode 100644 docs/src/reranking/answerdotai.md
 create mode 100644 python/python/lancedb/rerankers/answerdotai.py

diff --git a/docs/src/reranking/answerdotai.md b/docs/src/reranking/answerdotai.md
new file mode 100644
index 00000000..b19f24ff
--- /dev/null
+++ b/docs/src/reranking/answerdotai.md
@@ -0,0 +1,75 @@
+# AnswerDotAI Rerankers
+
+This integration allows using AnswerDotAI's [Rerankers](https://github.com/AnswerDotAI/rerankers) to rerank the search results.
+Rerankers is a lightweight, low-dependency, unified API to use all common reranking and cross-encoder models.
+
+!!! note
+    Supported Query Types: Hybrid, Vector, FTS
+
+
+```python
+import lancedb
+from lancedb.embeddings import get_registry
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.rerankers import AnswerdotaiRerankers
+
+embedder = get_registry().get("sentence-transformers").create()
+db = lancedb.connect("~/.lancedb")
+
+class Schema(LanceModel):
+    text: str = embedder.SourceField()
+    vector: Vector(embedder.ndims()) = embedder.VectorField()
+
+data = [
+    {"text": "hello world"},
+    {"text": "goodbye world"}
+    ]
+tbl = db.create_table("test", schema=Schema, mode="overwrite")
+tbl.add(data)
+reranker = AnswerdotaiRerankers()
+
+# Run vector search with a reranker
+result = tbl.search("hello").rerank(reranker=reranker).to_list()
+
+# Create an FTS index, needed for both FTS and hybrid search
+tbl.create_fts_index("text", replace=True)
+
+# Run FTS search with a reranker
+result = tbl.search("hello", query_type="fts").rerank(reranker=reranker).to_list()
+
+# Run hybrid search with a reranker
+result = tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list()
+
+```
+
+Accepted Arguments
+----------------
+| Argument | Type | Default | Description |
+| --- | --- | --- | --- |
+| `model_type` | `str` | `"colbert"` | The type of model to use. Supported model types can be found here - https://github.com/AnswerDotAI/rerankers |
+| `model_name` | `str` | `"answerdotai/answerai-colbert-small-v1"` | The name of the reranker model to use. |
+| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score` column. If "all", will return the relevance score along with the vector and/or FTS scores, depending on the query type |
+
+
+
+## Supported Scores for each query type
+You can specify the type of scores you want the reranker to return.
The following are the supported scores for each query type:
+
+### Hybrid Search
+|`return_score`| Status | Description |
+| --- | --- | --- |
+| `relevance` | βœ… Supported | Returns only the `_relevance_score` column |
+| `all` | ❌ Not Supported | Would return the vector (`_distance`) and FTS (`_score`) scores along with the relevance score (`_relevance_score`) |
+
+### Vector Search
+|`return_score`| Status | Description |
+| --- | --- | --- |
+| `relevance` | βœ… Supported | Returns only the `_relevance_score` column |
+| `all` | βœ… Supported | Returns the vector score (`_distance`) along with the relevance score (`_relevance_score`) |
+
+### FTS Search
+|`return_score`| Status | Description |
+| --- | --- | --- |
+| `relevance` | βœ… Supported | Returns only the `_relevance_score` column |
+| `all` | βœ… Supported | Returns the FTS score (`_score`) along with the relevance score (`_relevance_score`) |
\ No newline at end of file
diff --git a/python/python/lancedb/embeddings/sentence_transformers.py b/python/python/lancedb/embeddings/sentence_transformers.py
index fe8e997d..b0ef1d50 100644
--- a/python/python/lancedb/embeddings/sentence_transformers.py
+++ b/python/python/lancedb/embeddings/sentence_transformers.py
@@ -26,12 +26,23 @@ class SentenceTransformerEmbeddings(TextEmbeddingFunction):
     An embedding function that uses the sentence-transformers library
 
     https://huggingface.co/sentence-transformers
+
+    Parameters
+    ----------
+    name: str, default "all-MiniLM-L6-v2"
+        The name of the model to use.
+    device: str, default "cpu"
+        The device to use for the model.
+    normalize: bool, default True
+        Whether to normalize the embeddings.
+    trust_remote_code: bool, default True
+        Whether to trust the remote code.
     """
 
     name: str = "all-MiniLM-L6-v2"
     device: str = "cpu"
     normalize: bool = True
-    trust_remote_code: bool = False
+    trust_remote_code: bool = True
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
diff --git a/python/python/lancedb/embeddings/transformers.py b/python/python/lancedb/embeddings/transformers.py
index dba5b161..f532f7c9 100644
--- a/python/python/lancedb/embeddings/transformers.py
+++ b/python/python/lancedb/embeddings/transformers.py
@@ -36,6 +36,10 @@ class TransformersEmbeddingFunction(EmbeddingFunction):
         The name of the model to use. This should be a model name that can be
         loaded by transformers.AutoModel.from_pretrained. For example,
         "bert-base-uncased". default: "colbert-ir/colbertv2.0""
+    device : str
+        The device to use for the model. Default is "cpu".
+    show_progress_bar : bool
+        Whether to show a progress bar when loading the model. Default is True.
 
     to download package, run :
         `pip install transformers`
diff --git a/python/python/lancedb/rerankers/__init__.py b/python/python/lancedb/rerankers/__init__.py
index 0b767a67..93903a16 100644
--- a/python/python/lancedb/rerankers/__init__.py
+++ b/python/python/lancedb/rerankers/__init__.py
@@ -6,6 +6,7 @@ from .linear_combination import LinearCombinationReranker
 from .openai import OpenaiReranker
 from .jinaai import JinaReranker
 from .rrf import RRFReranker
+from .answerdotai import AnswerdotaiRerankers
 
 __all__ = [
     "Reranker",
@@ -16,4 +17,5 @@ __all__ = [
     "ColbertReranker",
     "JinaReranker",
     "RRFReranker",
+    "AnswerdotaiRerankers",
 ]
diff --git a/python/python/lancedb/rerankers/answerdotai.py b/python/python/lancedb/rerankers/answerdotai.py
new file mode 100644
index 00000000..3c2fcb2d
--- /dev/null
+++ b/python/python/lancedb/rerankers/answerdotai.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2023.
LanceDB Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pyarrow as pa
+from .base import Reranker
+from ..util import attempt_import_or_raise
+
+
+class AnswerdotaiRerankers(Reranker):
+    """
+    Reranks the results using AnswerDotAI's Rerankers library.
+    All supported reranker model types can be found here:
+    - https://github.com/AnswerDotAI/rerankers
+
+
+    Parameters
+    ----------
+    model_type : str, default "colbert"
+        The type of the model to use.
+    model_name : str, default "answerdotai/answerai-colbert-small-v1"
+        The name of the model to use from the given model type.
+    column : str, default "text"
+        The name of the column to use as input to the cross encoder model.
+    return_score : str, default "relevance"
+        options are "relevance" or "all". For hybrid queries, only
+        "relevance" is supported for now.
+    """
+
+    def __init__(
+        self,
+        model_type="colbert",
+        model_name: str = "answerdotai/answerai-colbert-small-v1",
+        column: str = "text",
+        return_score="relevance",
+    ):
+        super().__init__(return_score)
+        self.column = column
+        rerankers = attempt_import_or_raise(
+            "rerankers"
+        )  # import here for faster ops later
+        self.reranker = rerankers.Reranker(model_name, model_type)
+
+    def _rerank(self, result_set: pa.Table, query: str):
+        docs = result_set[self.column].to_pylist()
+        doc_ids = list(range(len(docs)))
+        result = self.reranker.rank(query, docs, doc_ids=doc_ids)
+
+        # get the scores of each document in the same order as the input
+        scores = [result.get_result_by_docid(i).score for i in doc_ids]
+
+        # add the scores
+        result_set = result_set.append_column(
+            "_relevance_score", pa.array(scores, type=pa.float32())
+        )
+        return result_set
+
+    def rerank_hybrid(
+        self,
+        query: str,
+        vector_results: pa.Table,
+        fts_results: pa.Table,
+    ):
+        combined_results = self.merge_results(vector_results, fts_results)
+        combined_results = self._rerank(combined_results, query)
+        if self.score == "relevance":
+            combined_results = self._keep_relevance_score(combined_results)
+        elif self.score == "all":
+            raise NotImplementedError(
+                "Answerdotai Reranker does not support score='all' yet"
+            )
+        combined_results = combined_results.sort_by(
+            [("_relevance_score", "descending")]
+        )
+        return combined_results
+
+    def rerank_vector(self, query: str, vector_results: pa.Table):
+        vector_results = self._rerank(vector_results, query)
+        if self.score == "relevance":
+            vector_results = vector_results.drop_columns(["_distance"])
+
+        vector_results = vector_results.sort_by([("_relevance_score", "descending")])
+        return vector_results
+
+    def rerank_fts(self, query: str, fts_results: pa.Table):
+        fts_results = self._rerank(fts_results, query)
+        if self.score == "relevance":
+            fts_results = fts_results.drop_columns(["_score"])
+
+        fts_results = fts_results.sort_by([("_relevance_score", "descending")])
+
+        return fts_results
diff --git a/python/python/lancedb/rerankers/base.py b/python/python/lancedb/rerankers/base.py
index 8667ca9c..65ed43e7 100644
--- a/python/python/lancedb/rerankers/base.py
+++ b/python/python/lancedb/rerankers/base.py
@@ -1,3 +1,16 @@ +# Copyright (c) 2023. LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from abc import ABC, abstractmethod from packaging.version import Version from typing import Union, List, TYPE_CHECKING diff --git a/python/python/lancedb/rerankers/cohere.py b/python/python/lancedb/rerankers/cohere.py index e4a12dbf..5cf7e8f0 100644 --- a/python/python/lancedb/rerankers/cohere.py +++ b/python/python/lancedb/rerankers/cohere.py @@ -1,3 +1,16 @@ +# Copyright (c) 2023. LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from packaging.version import Version from functools import cached_property diff --git a/python/python/lancedb/rerankers/colbert.py b/python/python/lancedb/rerankers/colbert.py index 5e8701b3..cffdd0ba 100644 --- a/python/python/lancedb/rerankers/colbert.py +++ b/python/python/lancedb/rerankers/colbert.py @@ -1,10 +1,20 @@ -import pyarrow as pa +# Copyright (c) 2023. LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -from ..util import attempt_import_or_raise -from .base import Reranker +from .answerdotai import AnswerdotaiRerankers -class ColbertReranker(Reranker): +class ColbertReranker(AnswerdotaiRerankers): """ Reranks the results using the ColBERT model. 
@@ -20,76 +30,13 @@ class ColbertReranker(Reranker): def __init__( self, - model_name: str = "colbert", + model_name: str = "colbert-ir/colbertv2.0", column: str = "text", return_score="relevance", ): - super().__init__(return_score) - self.model_name = model_name - self.column = column - rerankers = attempt_import_or_raise( - "rerankers" - ) # import here for faster ops later - self.colbert = rerankers.Reranker(self.model_name, model_type="colbert") - - def _rerank(self, result_set: pa.Table, query: str): - docs = result_set[self.column].to_pylist() - doc_ids = list(range(len(docs))) - result = self.colbert.rank(query, docs, doc_ids=doc_ids) - - # get the scores of each document in the same order as the input - scores = [result.get_result_by_docid(i).score for i in doc_ids] - - # add the scores - result_set = result_set.append_column( - "_relevance_score", pa.array(scores, type=pa.float32()) + super().__init__( + model_type="colbert", + model_name=model_name, + column=column, + return_score=return_score, ) - - return result_set - - def rerank_hybrid( - self, - query: str, - vector_results: pa.Table, - fts_results: pa.Table, - ): - combined_results = self.merge_results(vector_results, fts_results) - combined_results = self._rerank(combined_results, query) - if self.score == "relevance": - combined_results = self._keep_relevance_score(combined_results) - elif self.score == "all": - raise NotImplementedError( - "OpenAI Reranker does not support score='all' yet" - ) - - combined_results = combined_results.sort_by( - [("_relevance_score", "descending")] - ) - - return combined_results - - def rerank_vector( - self, - query: str, - vector_results: pa.Table, - ): - result_set = self._rerank(vector_results, query) - if self.score == "relevance": - result_set = result_set.drop_columns(["_distance"]) - - result_set = result_set.sort_by([("_relevance_score", "descending")]) - - return result_set - - def rerank_fts( - self, - query: str, - fts_results: pa.Table, - ): - result_set = self._rerank(fts_results, query) - if self.score == "relevance": - result_set = result_set.drop_columns(["_score"]) - - result_set = result_set.sort_by([("_relevance_score", "descending")]) - - return result_set diff --git a/python/python/lancedb/rerankers/cross_encoder.py b/python/python/lancedb/rerankers/cross_encoder.py index 05673673..6a6cb2bd 100644 --- a/python/python/lancedb/rerankers/cross_encoder.py +++ b/python/python/lancedb/rerankers/cross_encoder.py @@ -1,3 +1,16 @@ +# Copyright (c) 2023. LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from functools import cached_property from typing import Union @@ -22,6 +35,11 @@ class CrossEncoderReranker(Reranker): device : str, default None The device to use for the cross encoder model. If None, will use "cuda" if available, otherwise "cpu". + return_score : str, default "relevance" + options are "relevance" or "all". Only "relevance" is supported for now. + trust_remote_code : bool, default True + If True, will trust the remote code to be safe. 
If False, will not trust + the remote code and will not run it """ def __init__( @@ -30,12 +48,14 @@ class CrossEncoderReranker(Reranker): column: str = "text", device: Union[str, None] = None, return_score="relevance", + trust_remote_code: bool = True, ): super().__init__(return_score) torch = attempt_import_or_raise("torch") self.model_name = model_name self.column = column self.device = device + self.trust_remote_code = trust_remote_code if self.device is None: self.device = "cuda" if torch.cuda.is_available() else "cpu" @@ -43,7 +63,11 @@ class CrossEncoderReranker(Reranker): def model(self): sbert = attempt_import_or_raise("sentence_transformers") # Allows overriding the automatically selected device - cross_encoder = sbert.CrossEncoder(self.model_name, device=self.device) + cross_encoder = sbert.CrossEncoder( + self.model_name, + device=self.device, + trust_remote_code=self.trust_remote_code, + ) return cross_encoder diff --git a/python/python/lancedb/rerankers/jinaai.py b/python/python/lancedb/rerankers/jinaai.py index 4d4edcfb..6be646bd 100644 --- a/python/python/lancedb/rerankers/jinaai.py +++ b/python/python/lancedb/rerankers/jinaai.py @@ -1,3 +1,16 @@ +# Copyright (c) 2023. LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import requests from functools import cached_property diff --git a/python/python/lancedb/rerankers/linear_combination.py b/python/python/lancedb/rerankers/linear_combination.py index 3d7dcc25..6ab18427 100644 --- a/python/python/lancedb/rerankers/linear_combination.py +++ b/python/python/lancedb/rerankers/linear_combination.py @@ -1,3 +1,16 @@ +# Copyright (c) 2023. LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pyarrow as pa from .base import Reranker diff --git a/python/python/lancedb/rerankers/openai.py b/python/python/lancedb/rerankers/openai.py index 7e6c19b2..76fe8e4c 100644 --- a/python/python/lancedb/rerankers/openai.py +++ b/python/python/lancedb/rerankers/openai.py @@ -1,3 +1,16 @@ +# Copyright (c) 2023. LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + import json import os from functools import cached_property diff --git a/python/python/lancedb/rerankers/rrf.py b/python/python/lancedb/rerankers/rrf.py index 23ed1dc1..e0c95b48 100644 --- a/python/python/lancedb/rerankers/rrf.py +++ b/python/python/lancedb/rerankers/rrf.py @@ -1,3 +1,16 @@ +# Copyright (c) 2023. LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Union, List, TYPE_CHECKING import pyarrow as pa diff --git a/python/python/tests/test_rerankers.py b/python/python/tests/test_rerankers.py index 442328d9..fca0850c 100644 --- a/python/python/tests/test_rerankers.py +++ b/python/python/tests/test_rerankers.py @@ -15,6 +15,7 @@ from lancedb.rerankers import ( CrossEncoderReranker, OpenaiReranker, JinaReranker, + AnswerdotaiRerankers, ) from lancedb.table import LanceTable @@ -254,12 +255,20 @@ def test_cross_encoder_reranker(tmp_path, use_tantivy): @pytest.mark.parametrize("use_tantivy", [True, False]) def test_colbert_reranker(tmp_path, use_tantivy): - pytest.importorskip("transformers") + pytest.importorskip("rerankers") reranker = ColbertReranker() table, schema = get_test_table(tmp_path, use_tantivy) _run_test_reranker(reranker, table, "single player experience", None, schema) +@pytest.mark.parametrize("use_tantivy", [True, False]) +def test_answerdotai_reranker(tmp_path, use_tantivy): + pytest.importorskip("rerankers") + reranker = AnswerdotaiRerankers() + table, schema = get_test_table(tmp_path, use_tantivy) + _run_test_reranker(reranker, table, "single player experience", None, schema) + + @pytest.mark.skipif( os.environ.get("OPENAI_API_KEY") is None, reason="OPENAI_API_KEY not set" ) From 9c259981108768e6ab01cb5f7948389266f2b197 Mon Sep 17 00:00:00 2001 From: Bill Chambers Date: Mon, 26 Aug 2024 02:25:28 -0700 Subject: [PATCH 22/34] docs: update serverless_lancedb_with_s3_and_lambda.md (#1559) --- docs/src/examples/serverless_lancedb_with_s3_and_lambda.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/examples/serverless_lancedb_with_s3_and_lambda.md b/docs/src/examples/serverless_lancedb_with_s3_and_lambda.md index b8058b9d..a63b4bae 100644 --- a/docs/src/examples/serverless_lancedb_with_s3_and_lambda.md +++ b/docs/src/examples/serverless_lancedb_with_s3_and_lambda.md @@ -25,8 +25,8 @@ s3://eto-public/datasets/sift/vec_data.lance Then, we can write a quick Python script to populate our LanceDB Table: ```python -import pylance -sift_dataset = pylance.dataset("/path/to/local/vec_data.lance") +import lance +sift_dataset = lance.dataset("/path/to/local/vec_data.lance") df = sift_dataset.to_table().to_pandas() import lancedb From a85f03935297894070caec49ae89ec6c03bc5f5c Mon Sep 17 00:00:00 2001 From: Gagan Bhullar Date: Mon, 26 Aug 2024 15:25:14 -0600 Subject: [PATCH 23/34] fix(bug): limit fix (#1548) PR fixes #1151 --- python/python/lancedb/query.py | 5 ++++- python/python/tests/test_query.py | 12 ++++++++++++ 2 
files changed, 16 insertions(+), 1 deletion(-) diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py index 271028d9..53bcb434 100644 --- a/python/python/lancedb/query.py +++ b/python/python/lancedb/query.py @@ -357,7 +357,10 @@ class LanceQueryBuilder(ABC): The LanceQueryBuilder object. """ if limit is None or limit <= 0: - self._limit = None + if isinstance(self, LanceVectorQueryBuilder): + raise ValueError("Limit is required for ANN/KNN queries") + else: + self._limit = None else: self._limit = limit return self diff --git a/python/python/tests/test_query.py b/python/python/tests/test_query.py index 30eba26e..ae50c991 100644 --- a/python/python/tests/test_query.py +++ b/python/python/tests/test_query.py @@ -117,6 +117,18 @@ def test_query_builder(table): assert all(np.array(rs[0]["vector"]) == [1, 2]) +def test_vector_query_with_no_limit(table): + with pytest.raises(ValueError): + LanceVectorQueryBuilder(table, [0, 0], "vector").limit(0).select( + ["id", "vector"] + ).to_list() + + with pytest.raises(ValueError): + LanceVectorQueryBuilder(table, [0, 0], "vector").limit(None).select( + ["id", "vector"] + ).to_list() + + def test_query_builder_batches(table): rs = ( LanceVectorQueryBuilder(table, [0, 0], "vector") From ae85008714792a6b724c75793b63273c51caba88 Mon Sep 17 00:00:00 2001 From: Rithik Kumar <46047011+rithikJha@users.noreply.github.com> Date: Tue, 27 Aug 2024 17:14:35 +0530 Subject: [PATCH 24/34] docs: revamp embedding models (#1568) before: ![Screenshot 2024-08-27 151525](https://github.com/user-attachments/assets/d4f8f2b9-37e6-4a31-b144-01b804019e11) After: ![Screenshot 2024-08-27 151550](https://github.com/user-attachments/assets/79fe7d27-8f14-4d80-9b41-a1e91f8c708f) --------- Co-authored-by: Ayush Chaurasia --- docs/mkdocs.yml | 36 +- .../imagebind_embedding.md | 67 ++ .../jina_multimodal_embedding.md | 51 ++ .../openclip_embedding.md | 82 ++ .../aws_bedrock_embedding.md | 51 ++ .../cohere_embedding.md | 62 ++ .../gemini_embedding.md | 35 + .../huggingface_embedding.md | 24 + .../ibm_watsonx_ai_embedding.md | 75 ++ .../instructor_embedding.md | 50 ++ .../jina_embedding.md | 39 + .../ollama_embedding.md | 37 + .../openai_embedding.md | 34 + .../sentence_transformers.md | 174 ++++ .../embeddings/default_embedding_functions.md | 802 +----------------- docs/test/md_testing.py | 2 + 16 files changed, 833 insertions(+), 788 deletions(-) create mode 100644 docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md create mode 100644 
docs/src/embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
 create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md
 create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md
 create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md

diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 387db5c3..e4346bcd 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -119,7 +119,23 @@ nav:
   - 🧬 Managing embeddings:
     - Overview: embeddings/index.md
    - Embedding functions: embeddings/embedding_functions.md
-    - Available models: embeddings/default_embedding_functions.md
+    - Available models:
+      - Overview: embeddings/default_embedding_functions.md
+      - Text Embedding Functions:
+        - Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
+        - Huggingface Embedding Models: embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md
+        - Ollama Embeddings: embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md
+        - OpenAI Embeddings: embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md
+        - Instructor Embeddings: embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md
+        - Gemini Embeddings: embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md
+        - Cohere Embeddings: embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
+        - Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
+        - AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
+        - IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
+      - Multimodal Embedding Functions:
+        - OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
+        - Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
+        - Jina Embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md
     - User-defined embedding functions: embeddings/custom_embedding_function.md
     - "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
     - "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
@@ -215,7 +231,23 @@ nav:
   - Managing Embeddings:
     - Overview: embeddings/index.md
     - Embedding functions: embeddings/embedding_functions.md
-    - Available models: embeddings/default_embedding_functions.md
+    - Available models:
+      - Overview: embeddings/default_embedding_functions.md
+      - Text Embedding Functions:
+        - Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
+        - Huggingface Embedding Models: embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md
+        - Ollama Embeddings: embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md
+        - OpenAI Embeddings: embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md
+        - Instructor Embeddings: embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md
+        - Gemini Embeddings: 
embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md
+        - Cohere Embeddings: embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
+        - Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
+        - AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
+        - IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
+      - Multimodal Embedding Functions:
+        - OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
+        - Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
+        - Jina Embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md
     - User-defined embedding functions: embeddings/custom_embedding_function.md
     - "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb
     - "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb
diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
new file mode 100644
index 00000000..4aa8b3db
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
@@ -0,0 +1,67 @@
+# Imagebind embeddings
+We have support for [imagebind](https://github.com/facebookresearch/ImageBind) model embeddings. You can download our version of the packaged model via `pip install imagebind-packaged==0.1.2`.
+
+This function is registered as `imagebind` and supports Audio, Video and Text modalities (extending to Thermal, Depth, and IMU data):
+
+| Parameter | Type | Default Value | Description |
+|---|---|---|---|
+| `name` | `str` | `"imagebind_huge"` | Name of the model. |
+| `device` | `str` | `"cpu"` | The device to run the model on. Can be `"cpu"` or `"gpu"`. |
+| `normalize` | `bool` | `False` | Set to `True` to normalize your inputs before model ingestion.
|
+
+Below is an example demonstrating how the API works:
+
+```python
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import get_registry
+
+db = lancedb.connect("~/.lancedb")
+func = get_registry().get("imagebind").create()
+
+class ImageBindModel(LanceModel):
+    text: str
+    image_uri: str = func.SourceField()
+    audio_path: str
+    vector: Vector(func.ndims()) = func.VectorField()
+
+# add locally accessible image paths
+text_list=["A dog.", "A car", "A bird"]
+image_paths=["./assets/dog_image.jpg", "./assets/car_image.jpg", "./assets/bird_image.jpg"]
+audio_paths=["./assets/dog_audio.wav", "./assets/car_audio.wav", "./assets/bird_audio.wav"]
+
+# Load data
+inputs = [
+    {"text": a, "audio_path": b, "image_uri": c}
+    for a, b, c in zip(text_list, audio_paths, image_paths)
+]
+
+# create table and add data
+table = db.create_table("img_bind", schema=ImageBindModel)
+table.add(inputs)
+```
+
+Now, we can search using any modality:
+
+#### Image search
+```python
+query_image = "./assets/dog_image2.jpg" # download an image and enter that path here
+actual = table.search(query_image).limit(1).to_pydantic(ImageBindModel)[0]
+print(actual.text == "A dog.")
+```
+#### Audio search
+
+```python
+query_audio = "./assets/car_audio2.wav" # download an audio clip and enter path here
+actual = table.search(query_audio).limit(1).to_pydantic(ImageBindModel)[0]
+print(actual.text == "A car")
+```
+#### Text search
+You can add any input query and fetch the result as follows:
+```python
+query = "an animal which flies and tweets"
+actual = table.search(query).limit(1).to_pydantic(ImageBindModel)[0]
+print(actual.text == "A bird")
+```
+
+If you have any questions about the embeddings API, supported models, or see a relevant model missing, please raise an issue [on GitHub](https://github.com/lancedb/lancedb/issues).
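Every embedding page added by this patch follows the same three-step registry pattern; a minimal sketch of that pattern, using the `sentence-transformers` function purely as a stand-in for any registered name:

```python
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry

# 1. Create an embedding function from the registry.
func = get_registry().get("sentence-transformers").create()

# 2. Declare a schema: SourceField marks what gets embedded,
#    VectorField receives the computed embeddings.
class Doc(LanceModel):
    text: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()

# 3. Ingest and search; embedding happens automatically on both paths.
db = lancedb.connect("~/.lancedb")
tbl = db.create_table("docs", schema=Doc, mode="overwrite")
tbl.add([{"text": "hello world"}, {"text": "goodbye world"}])
print(tbl.search("greeting").limit(1).to_pydantic(Doc)[0].text)
```

The `SourceField`/`VectorField` pairing is what lets LanceDB embed data automatically at both ingest and query time, which is why each page below only needs to change the registry name and `create()` arguments.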
diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md
new file mode 100644
index 00000000..918c1509
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md
@@ -0,0 +1,51 @@
+# Jina Embeddings: Multimodal
+
+Jina embeddings can also be used to embed both text and image data. Only some of the models support image data; you can check the list
+under [https://jina.ai/embeddings/](https://jina.ai/embeddings/)
+
+Supported parameters (to be passed in `create` method) are:
+
+| Parameter | Type | Default Value | Description |
+|---|---|---|---|
+| `name` | `str` | `"jina-clip-v1"` | The model ID of the jina model to use |
+
+Usage Example:
+
+```python
+    import os
+    import requests
+    import lancedb
+    from lancedb.pydantic import LanceModel, Vector
+    from lancedb.embeddings import get_registry
+    import pandas as pd
+
+    os.environ['JINA_API_KEY'] = 'jina_*'
+
+    db = lancedb.connect("~/.lancedb")
+    func = get_registry().get("jina").create()
+
+
+    class Images(LanceModel):
+        label: str
+        image_uri: str = func.SourceField()  # image uri as the source
+        image_bytes: bytes = func.SourceField()  # image bytes as the source
+        vector: Vector(func.ndims()) = func.VectorField()  # vector column
+        vec_from_bytes: Vector(func.ndims()) = func.VectorField()  # Another vector column
+
+
+    table = db.create_table("images", schema=Images)
+    labels = ["cat", "cat", "dog", "dog", "horse", "horse"]
+    uris = [
+        "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg",
+        "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg",
+        "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg",
+        "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg",
+        "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg",
+        "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg",
+    ]
+    # get each uri as bytes
+    image_bytes = [requests.get(uri).content for uri in uris]
+    table.add(
+        pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": image_bytes})
+    )
+```
diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
new file mode 100644
index 00000000..bf50dfd2
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
@@ -0,0 +1,86 @@
+# OpenClip embeddings
+We support CLIP model embeddings using the open-source alternative [open-clip](https://github.com/mlfoundations/open_clip), which supports various customizations. It is registered as `open-clip` and supports the following customizations:
+
+| Parameter | Type | Default Value | Description |
+|---|---|---|---|
+| `name` | `str` | `"ViT-B-32"` | The name of the model. |
+| `pretrained` | `str` | `"laion2b_s34b_b79k"` | The name of the pretrained model to load. |
+| `device` | `str` | `"cpu"` | The device to run the model on. Can be `"cpu"` or `"gpu"`. |
+| `batch_size` | `int` | `64` | The number of images to process in a batch. |
+| `normalize` | `bool` | `True` | Whether to normalize the input images before feeding them to the model. |
+
+This embedding function supports ingesting images as both bytes and URLs. You can query them using both text and other images.
+
+!!!
 info
+    LanceDB supports ingesting images directly from accessible links.
+
+```python
+import io
+import requests
+import pandas as pd
+from PIL import Image
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import get_registry
+
+db = lancedb.connect("~/.lancedb")
+func = get_registry().get("open-clip").create()
+
+class Images(LanceModel):
+    label: str
+    image_uri: str = func.SourceField()  # image uri as the source
+    image_bytes: bytes = func.SourceField()  # image bytes as the source
+    vector: Vector(func.ndims()) = func.VectorField()  # vector column
+    vec_from_bytes: Vector(func.ndims()) = func.VectorField()  # Another vector column
+
+table = db.create_table("images", schema=Images)
+labels = ["cat", "cat", "dog", "dog", "horse", "horse"]
+uris = [
+    "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg",
+    "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg",
+    "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg",
+    "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg",
+    "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg",
+    "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg",
+]
+# get each uri as bytes
+image_bytes = [requests.get(uri).content for uri in uris]
+table.add(
+    pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": image_bytes})
+)
+```
+Now we can search using text from both the default vector column and the custom vector column:
+```python
+
+# text search
+actual = table.search("man's best friend").limit(1).to_pydantic(Images)[0]
+print(actual.label) # prints "dog"
+
+frombytes = (
+    table.search("man's best friend", vector_column_name="vec_from_bytes")
+    .limit(1)
+    .to_pydantic(Images)[0]
+)
+print(frombytes.label)
+
+```
+
+Because we're using a multi-modal embedding function, we can also search using images:
+
+```python
+# image search
+query_image_uri = "http://farm1.staticflickr.com/200/467715466_ed4a31801f_z.jpg"
+image_bytes = requests.get(query_image_uri).content
+query_image = Image.open(io.BytesIO(image_bytes))
+actual = table.search(query_image).limit(1).to_pydantic(Images)[0]
+print(actual.label == "dog")
+
+# image search using a custom vector column
+other = (
+    table.search(query_image, vector_column_name="vec_from_bytes")
+    .limit(1)
+    .to_pydantic(Images)[0]
+)
+print(other.label)
+
+```
diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
new file mode 100644
index 00000000..036d4b82
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
@@ -0,0 +1,51 @@
+# AWS Bedrock Text Embedding Functions
+
+AWS Bedrock supports multiple base models for generating text embeddings. You need to set up AWS credentials to use this embedding function.
+You can do so using `awscli`, and also add your session token:
+```shell
+aws configure
+aws configure set aws_session_token "<your_session_token>"
+```
+To ensure that the credentials are set up correctly, you can run the following command:
+```shell
+aws sts get-caller-identity
+```
+
+Supported embedding model IDs are:
+* `amazon.titan-embed-text-v1`
+* `cohere.embed-english-v3`
+* `cohere.embed-multilingual-v3`
+
+Supported parameters (to be passed in `create` method) are:
+
+| Parameter | Type | Default Value | Description |
+|---|---|---|---|
+| **name** | str | "amazon.titan-embed-text-v1" | The model ID of the bedrock model to use.
Supported base models for Text Embeddings: amazon.titan-embed-text-v1, cohere.embed-english-v3, cohere.embed-multilingual-v3 |
+| **region** | str | "us-east-1" | Optional name of the AWS Region in which the service should be called (e.g., "us-east-1"). |
+| **profile_name** | str | None | Optional name of the AWS profile to use for calling the Bedrock service. If not specified, the default profile will be used. |
+| **assumed_role** | str | None | Optional ARN of an AWS IAM role to assume for calling the Bedrock service. If not specified, the current active credentials will be used. |
+| **role_session_name** | str | "lancedb-embeddings" | Optional name of the AWS IAM role session to use for calling the Bedrock service. If not specified, a "lancedb-embeddings" name will be used. |
+| **runtime** | bool | True | Optional choice of getting a different client to perform operations with the Amazon Bedrock service. |
+| **max_retries** | int | 7 | Optional number of retries to perform when a request fails. |
+
+Usage Example:
+
+```python
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import get_registry
+import pandas as pd
+
+model = get_registry().get("bedrock-text").create()
+
+class TextModel(LanceModel):
+    text: str = model.SourceField()
+    vector: Vector(model.ndims()) = model.VectorField()
+
+df = pd.DataFrame({"text": ["hello world", "goodbye world"]})
+db = lancedb.connect("tmp_path")
+tbl = db.create_table("test", schema=TextModel, mode="overwrite")
+
+tbl.add(df)
+rs = tbl.search("hello").limit(1).to_pandas()
+```
diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
new file mode 100644
index 00000000..39eba18c
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
@@ -0,0 +1,58 @@
+# Cohere Embeddings
+
+Using the Cohere API requires the `cohere` package, which can be installed using `pip install cohere`. Cohere embeddings are used to generate embeddings for text data. The embeddings can be used for various tasks like semantic search, clustering, and classification.
+You also need to set the `COHERE_API_KEY` environment variable to use the Cohere API.
+
+Supported models are:
+* embed-english-v3.0
+* embed-multilingual-v3.0
+* embed-english-light-v3.0
+* embed-multilingual-light-v3.0
+* embed-english-v2.0
+* embed-english-light-v2.0
+* embed-multilingual-v2.0
+
+
+Supported parameters (to be passed in `create` method) are:
+
+| Parameter | Type | Default Value | Description |
+|---|---|--------|---------|
+| `name` | `str` | `"embed-english-v2.0"` | The model ID of the cohere model to use. Supported base models for Text Embeddings: embed-english-v3.0, embed-multilingual-v3.0, embed-english-light-v3.0, embed-multilingual-light-v3.0, embed-english-v2.0, embed-english-light-v2.0, embed-multilingual-v2.0 |
+| `source_input_type` | `str` | `"search_document"` | The type of input data to be used for the source column. |
+| `query_input_type` | `str` | `"search_query"` | The type of input data to be used for the query. |
+
+Cohere supports the following input types:
+
+| Input Type              | Description                                                                   |
+|-------------------------|-------------------------------------------------------------------------------|
+| "`search_document`"     | Used for embeddings stored in a vector database for search use-cases.        |
+| "`search_query`"        | Used for embeddings of search queries run against a vector DB.               |
+| "`semantic_similarity`" | Specifies the given text will be used for Semantic Textual Similarity (STS). |
+| "`classification`"      | Used for embeddings passed through a text classifier.                        |
+| "`clustering`"          | Used for the embeddings run through a clustering algorithm.                  |
+
+Usage Example:
+
+```python
+    import lancedb
+    from lancedb.pydantic import LanceModel, Vector
+    from lancedb.embeddings import EmbeddingFunctionRegistry
+
+    cohere = (
+        EmbeddingFunctionRegistry.get_instance()
+        .get("cohere")
+        .create(name="embed-multilingual-v2.0")
+    )
+
+    class TextModel(LanceModel):
+        text: str = cohere.SourceField()
+        vector: Vector(cohere.ndims()) = cohere.VectorField()
+
+    data = [ { "text": "hello world" },
+            { "text": "goodbye world" }]
+
+    db = lancedb.connect("~/.lancedb")
+    tbl = db.create_table("test", schema=TextModel, mode="overwrite")
+
+    tbl.add(data)
+```
\ No newline at end of file
diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md
new file mode 100644
index 00000000..551c8327
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md
@@ -0,0 +1,35 @@
+# Gemini Embeddings
+With Google's Gemini, you can represent text (words, sentences, and blocks of text) in a vectorized form, making it easier to compare and contrast embeddings. For example, two texts that share a similar subject matter or sentiment should have similar embeddings, which can be identified through mathematical comparison techniques such as cosine similarity. For more on how and why you should use embeddings, refer to the Embeddings guide.
+The Gemini Embedding Model API supports various task types:
+
+| Task Type | Description |
+|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| "`retrieval_query`" | Specifies the given text is a query in a search/retrieval setting. |
+| "`retrieval_document`" | Specifies the given text is a document in a search/retrieval setting. Using this task type requires a title, but one is automatically provided by the Embeddings API |
+| "`semantic_similarity`" | Specifies the given text will be used for Semantic Textual Similarity (STS). |
+| "`classification`" | Specifies that the embeddings will be used for classification. |
+| "`clustering`" | Specifies that the embeddings will be used for clustering.
|
+
+
+Usage Example:
+
+```python
+import lancedb
+import pandas as pd
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import get_registry
+
+
+model = get_registry().get("gemini-text").create()
+
+class TextModel(LanceModel):
+    text: str = model.SourceField()
+    vector: Vector(model.ndims()) = model.VectorField()
+
+df = pd.DataFrame({"text": ["hello world", "goodbye world"]})
+db = lancedb.connect("~/.lancedb")
+tbl = db.create_table("test", schema=TextModel, mode="overwrite")
+
+tbl.add(df)
+rs = tbl.search("hello").limit(1).to_pandas()
+```
\ No newline at end of file
diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md
new file mode 100644
index 00000000..80502b49
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md
@@ -0,0 +1,25 @@
+# Huggingface embedding models
+We offer support for all huggingface models (which can be loaded via the [transformers](https://huggingface.co/docs/transformers/en/index) library). The default model is `colbert-ir/colbertv2.0`, which also has its own special callout: `registry.get("colbert")`.
+
+Example usage:
+```python
+import lancedb
+import pandas as pd
+
+from lancedb.embeddings import get_registry
+from lancedb.pydantic import LanceModel, Vector
+
+model = get_registry().get("huggingface").create(name='facebook/bart-base')
+
+class Words(LanceModel):
+    text: str = model.SourceField()
+    vector: Vector(model.ndims()) = model.VectorField()
+
+db = lancedb.connect("~/.lancedb")
+df = pd.DataFrame({"text": ["hi hello sayonara", "goodbye world"]})
+table = db.create_table("greets", schema=Words)
+table.add(df)
+query = "old greeting"
+actual = table.search(query).limit(1).to_pydantic(Words)[0]
+print(actual.text)
+```
\ No newline at end of file
diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
new file mode 100644
index 00000000..d98fdeef
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
@@ -0,0 +1,76 @@
+# IBM watsonx.ai Embeddings
+
+Generate text embeddings using IBM's watsonx.ai platform.
+
+## Supported Models
+
+You can find a list of supported models at [IBM watsonx.ai Documentation](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx).
The currently supported model names are:
+
+- `ibm/slate-125m-english-rtrvr`
+- `ibm/slate-30m-english-rtrvr`
+- `sentence-transformers/all-minilm-l12-v2`
+- `intfloat/multilingual-e5-large`
+
+## Parameters
+
+The following parameters can be passed to the `create` method:
+
+| Parameter | Type | Default Value | Description |
+|------------|----------|----------------------------------|-----------------------------------------------------------|
+| name | str | "ibm/slate-125m-english-rtrvr" | The model ID of the watsonx.ai model to use |
+| api_key | str | None | Optional IBM Cloud API key (or set `WATSONX_API_KEY`) |
+| project_id | str | None | Optional watsonx project ID (or set `WATSONX_PROJECT_ID`) |
+| url | str | None | Optional custom URL for the watsonx.ai instance |
+| params | dict | None | Optional additional parameters for the embedding model |
+
+## Usage Example
+
+First, the watsonx.ai library is an optional dependency, so it must be installed separately:
+
+```
+pip install ibm-watsonx-ai
+```
+
+Optionally set environment variables (if not passing credentials to `create` directly):
+
+```sh
+export WATSONX_API_KEY="YOUR_WATSONX_API_KEY"
+export WATSONX_PROJECT_ID="YOUR_WATSONX_PROJECT_ID"
+```
+
+```python
+import os
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import EmbeddingFunctionRegistry
+
+watsonx_embed = (
+    EmbeddingFunctionRegistry.get_instance()
+    .get("watsonx")
+    .create(
+        name="ibm/slate-125m-english-rtrvr",
+        # Uncomment and set these if not using environment variables
+        # api_key="your_api_key_here",
+        # project_id="your_project_id_here",
+        # url="your_watsonx_url_here",
+        # params={...},
+    )
+)
+
+class TextModel(LanceModel):
+    text: str = watsonx_embed.SourceField()
+    vector: Vector(watsonx_embed.ndims()) = watsonx_embed.VectorField()
+
+data = [
+    {"text": "hello world"},
+    {"text": "goodbye world"},
+]
+
+db = lancedb.connect("~/.lancedb")
+tbl = db.create_table("watsonx_test", schema=TextModel, mode="overwrite")
+
+tbl.add(data)
+
+rs = tbl.search("hello").limit(1).to_pandas()
+print(rs)
+```
\ No newline at end of file
diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md
new file mode 100644
index 00000000..30662f21
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md
@@ -0,0 +1,50 @@
+# Instructor Embeddings
+[Instructor](https://instructor-embedding.github.io/) is an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g. classification, retrieval, clustering, text evaluation, etc.) and domains (e.g. science, finance, etc.) by simply providing the task instruction, without any finetuning.
+
+If you want to calculate customized embeddings for specific sentences, you can follow the unified template to write instructions.
+
+!!! info
+    Represent the `domain` `text_type` for `task_objective`:
+
+    * `domain` is optional, and it specifies the domain of the text, e.g. science, finance, medicine, etc.
+    * `text_type` is required, and it specifies the encoding unit, e.g. sentence, document, paragraph, etc.
+    * `task_objective` is optional, and it specifies the objective of embedding, e.g. retrieve a document, classify the sentence, etc.
+
+More information about the model can be found at the [source URL](https://github.com/xlang-ai/instructor-embedding).
+
+| Argument | Type | Default | Description |
+|---|---|---|---|
+| `name` | `str` | "hkunlp/instructor-base" | The name of the model to use |
+| `batch_size` | `int` | `32` | The batch size to use when generating embeddings |
+| `device` | `str` | `"cpu"` | The device to use when generating embeddings |
+| `show_progress_bar` | `bool` | `True` | Whether to show a progress bar when generating embeddings |
+| `normalize_embeddings` | `bool` | `True` | Whether to normalize the embeddings |
+| `quantize` | `bool` | `False` | Whether to quantize the model |
+| `source_instruction` | `str` | `"represent the docuement for retreival"` | The instruction for the source column |
+| `query_instruction` | `str` | `"represent the document for retreiving the most similar documents"` | The instruction for the query |
+
+
+
+```python
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import get_registry
+
+instructor = get_registry().get("instructor").create(
+    source_instruction="represent the document for retrieval",
+    query_instruction="represent the document for retrieving the most similar documents",
+)
+
+class Schema(LanceModel):
+    vector: Vector(instructor.ndims()) = instructor.VectorField()
+    text: str = instructor.SourceField()
+
+db = lancedb.connect("~/.lancedb")
+tbl = db.create_table("test", schema=Schema, mode="overwrite")
+
+texts = [{"text": "Capitalism has been dominant in the Western world since the end of feudalism, but most feel[who?] that..."},
+         {"text": "The disparate impact theory is especially controversial under the Fair Housing Act because the Act..."},
+         {"text": "Disparate impact in United States labor law refers to practices in employment, housing, and other areas that.."}]
+
+tbl.add(texts)
+```
\ No newline at end of file
diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
new file mode 100644
index 00000000..dc194c5d
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
@@ -0,0 +1,39 @@
+# Jina Embeddings
+
+Jina embeddings are used to generate embeddings for text and image data.
+You need to set the `JINA_API_KEY` environment variable to use the Jina API.
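+
+For example, you can set it in your shell before starting Python (the value below is a placeholder for your own key):
+
+```sh
+export JINA_API_KEY="jina_*"
+```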
+
+You can find the list of supported models at [https://jina.ai/embeddings/](https://jina.ai/embeddings/).
+
+Supported parameters (to be passed in the `create` method) are:
+
+| Parameter | Type | Default Value | Description |
+|---|---|---|---|
+| `name` | `str` | `"jina-clip-v1"` | The model ID of the jina model to use |
+
+Usage Example:
+
+```python
+import os
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import EmbeddingFunctionRegistry
+
+os.environ['JINA_API_KEY'] = 'jina_*'
+
+jina_embed = EmbeddingFunctionRegistry.get_instance().get("jina").create(name="jina-embeddings-v2-base-en")
+
+
+class TextModel(LanceModel):
+    text: str = jina_embed.SourceField()
+    vector: Vector(jina_embed.ndims()) = jina_embed.VectorField()
+
+
+data = [{"text": "hello world"},
+        {"text": "goodbye world"}]
+
+db = lancedb.connect("~/.lancedb-2")
+tbl = db.create_table("test", schema=TextModel, mode="overwrite")
+
+tbl.add(data)
+```
diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md
new file mode 100644
index 00000000..3b8cfcce
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md
@@ -0,0 +1,37 @@
+# Ollama embeddings
+
+Generate embeddings via the [ollama](https://github.com/ollama/ollama-python) python library. More details:
+
+- [Ollama docs on embeddings](https://github.com/ollama/ollama/blob/main/docs/api.md#generate-embeddings)
+- [Ollama blog on embeddings](https://ollama.com/blog/embedding-models)
+
+| Parameter              | Type                       | Default Value            | Description                                                                                        |
+|------------------------|----------------------------|--------------------------|----------------------------------------------------------------------------------------------------|
+| `name`                 | `str`                      | `nomic-embed-text`       | The name of the model.                                                                             |
+| `host`                 | `str`                      | `http://localhost:11434` | The Ollama host to connect to.                                                                     |
+| `options`              | `ollama.Options` or `dict` | `None`                   | Additional model parameters listed in the documentation for the Modelfile, such as `temperature`.  |
+| `keep_alive`           | `float` or `str`           | `"5m"`                   | Controls how long the model will stay loaded in memory following the request.                      |
+| `ollama_client_kwargs` | `dict`                     | `{}`                     | kwargs that can be passed to the `ollama.Client`.                                                  |
+
+```python
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import get_registry
+
+db = lancedb.connect("/tmp/db")
+func = get_registry().get("ollama").create(name="nomic-embed-text")
+
+class Words(LanceModel):
+    text: str = func.SourceField()
+    vector: Vector(func.ndims()) = func.VectorField()
+
+table = db.create_table("words", schema=Words, mode="overwrite")
+table.add([
+    {"text": "hello world"},
+    {"text": "goodbye world"}
+])
+
+query = "greetings"
+actual = table.search(query).limit(1).to_pydantic(Words)[0]
+print(actual.text)
+```
diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md
new file mode 100644
index 00000000..87fd28f1
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md
@@ -0,0 +1,34 @@
+# OpenAI embeddings
+
+LanceDB registers the OpenAI embeddings function in the registry by default, as `openai`.
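+
+For example, since the function is pre-registered, you can grab it straight from the registry. The sketch below (assuming the `openai` package is installed and `OPENAI_API_KEY` is set in your environment) also shows shrinking the output size of a newer `text-embedding-3-*` model via the `dim` parameter described in the table below:
+
+```python
+from lancedb.embeddings import get_registry
+
+# Request 256-dimensional embeddings instead of the model's native size.
+func = get_registry().get("openai").create(name="text-embedding-3-small", dim=256)
+print(func.ndims())  # expected to report the reduced dimensionality, i.e. 256
+```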
+Below are the parameters that you can customize when creating the instances:
+
+| Parameter | Type | Default Value | Description |
+|---|---|---|---|
+| `name` | `str` | `"text-embedding-ada-002"` | The name of the model. |
+| `dim` | `int` | Model default | For OpenAI's newer text-embedding-3 models, you can specify an output dimensionality smaller than the default 1536. |
+
+
+```python
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import get_registry
+
+db = lancedb.connect("/tmp/db")
+func = get_registry().get("openai").create(name="text-embedding-ada-002")
+
+class Words(LanceModel):
+    text: str = func.SourceField()
+    vector: Vector(func.ndims()) = func.VectorField()
+
+table = db.create_table("words", schema=Words, mode="overwrite")
+table.add(
+    [
+        {"text": "hello world"},
+        {"text": "goodbye world"}
+    ]
+)
+
+query = "greetings"
+actual = table.search(query).limit(1).to_pydantic(Words)[0]
+print(actual.text)
+```
\ No newline at end of file
diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
new file mode 100644
index 00000000..1adff158
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
@@ -0,0 +1,174 @@
+# Sentence transformers
+Allows you to set parameters when registering a `sentence-transformers` object.
+
+!!! info
+    Sentence transformer embeddings are normalized by default. It is recommended to use normalized embeddings for similarity search.
+
+| Parameter | Type | Default Value | Description |
+|---|---|---|---|
+| `name` | `str` | `all-MiniLM-L6-v2` | The name of the model |
+| `device` | `str` | `cpu` | The device to run the model on (can be `cpu` or `gpu`) |
+| `normalize` | `bool` | `True` | Whether to normalize the input text before feeding it to the model |
+| `trust_remote_code` | `bool` | `False` | Whether to trust and execute remote code from the model's Huggingface repository |
+
+
+??? "Check out available sentence-transformer models here!"
+ ```markdown + - sentence-transformers/all-MiniLM-L12-v2 + - sentence-transformers/paraphrase-mpnet-base-v2 + - sentence-transformers/gtr-t5-base + - sentence-transformers/LaBSE + - sentence-transformers/all-MiniLM-L6-v2 + - sentence-transformers/bert-base-nli-max-tokens + - sentence-transformers/bert-base-nli-mean-tokens + - sentence-transformers/bert-base-nli-stsb-mean-tokens + - sentence-transformers/bert-base-wikipedia-sections-mean-tokens + - sentence-transformers/bert-large-nli-cls-token + - sentence-transformers/bert-large-nli-max-tokens + - sentence-transformers/bert-large-nli-mean-tokens + - sentence-transformers/bert-large-nli-stsb-mean-tokens + - sentence-transformers/distilbert-base-nli-max-tokens + - sentence-transformers/distilbert-base-nli-mean-tokens + - sentence-transformers/distilbert-base-nli-stsb-mean-tokens + - sentence-transformers/distilroberta-base-msmarco-v1 + - sentence-transformers/distilroberta-base-msmarco-v2 + - sentence-transformers/nli-bert-base-cls-pooling + - sentence-transformers/nli-bert-base-max-pooling + - sentence-transformers/nli-bert-base + - sentence-transformers/nli-bert-large-cls-pooling + - sentence-transformers/nli-bert-large-max-pooling + - sentence-transformers/nli-bert-large + - sentence-transformers/nli-distilbert-base-max-pooling + - sentence-transformers/nli-distilbert-base + - sentence-transformers/nli-roberta-base + - sentence-transformers/nli-roberta-large + - sentence-transformers/roberta-base-nli-mean-tokens + - sentence-transformers/roberta-base-nli-stsb-mean-tokens + - sentence-transformers/roberta-large-nli-mean-tokens + - sentence-transformers/roberta-large-nli-stsb-mean-tokens + - sentence-transformers/stsb-bert-base + - sentence-transformers/stsb-bert-large + - sentence-transformers/stsb-distilbert-base + - sentence-transformers/stsb-roberta-base + - sentence-transformers/stsb-roberta-large + - sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens + - sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens + - sentence-transformers/xlm-r-base-en-ko-nli-ststb + - sentence-transformers/xlm-r-bert-base-nli-mean-tokens + - sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens + - sentence-transformers/xlm-r-large-en-ko-nli-ststb + - sentence-transformers/bert-base-nli-cls-token + - sentence-transformers/all-distilroberta-v1 + - sentence-transformers/multi-qa-MiniLM-L6-dot-v1 + - sentence-transformers/multi-qa-distilbert-cos-v1 + - sentence-transformers/multi-qa-distilbert-dot-v1 + - sentence-transformers/multi-qa-mpnet-base-cos-v1 + - sentence-transformers/multi-qa-mpnet-base-dot-v1 + - sentence-transformers/nli-distilroberta-base-v2 + - sentence-transformers/all-MiniLM-L6-v1 + - sentence-transformers/all-mpnet-base-v1 + - sentence-transformers/all-mpnet-base-v2 + - sentence-transformers/all-roberta-large-v1 + - sentence-transformers/allenai-specter + - sentence-transformers/average_word_embeddings_glove.6B.300d + - sentence-transformers/average_word_embeddings_glove.840B.300d + - sentence-transformers/average_word_embeddings_komninos + - sentence-transformers/average_word_embeddings_levy_dependency + - sentence-transformers/clip-ViT-B-32-multilingual-v1 + - sentence-transformers/clip-ViT-B-32 + - sentence-transformers/distilbert-base-nli-stsb-quora-ranking + - sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking + - sentence-transformers/distilroberta-base-paraphrase-v1 + - sentence-transformers/distiluse-base-multilingual-cased-v1 + - 
sentence-transformers/distiluse-base-multilingual-cased-v2 + - sentence-transformers/distiluse-base-multilingual-cased + - sentence-transformers/facebook-dpr-ctx_encoder-multiset-base + - sentence-transformers/facebook-dpr-ctx_encoder-single-nq-base + - sentence-transformers/facebook-dpr-question_encoder-multiset-base + - sentence-transformers/facebook-dpr-question_encoder-single-nq-base + - sentence-transformers/gtr-t5-large + - sentence-transformers/gtr-t5-xl + - sentence-transformers/gtr-t5-xxl + - sentence-transformers/msmarco-MiniLM-L-12-v3 + - sentence-transformers/msmarco-MiniLM-L-6-v3 + - sentence-transformers/msmarco-MiniLM-L12-cos-v5 + - sentence-transformers/msmarco-MiniLM-L6-cos-v5 + - sentence-transformers/msmarco-bert-base-dot-v5 + - sentence-transformers/msmarco-bert-co-condensor + - sentence-transformers/msmarco-distilbert-base-dot-prod-v3 + - sentence-transformers/msmarco-distilbert-base-tas-b + - sentence-transformers/msmarco-distilbert-base-v2 + - sentence-transformers/msmarco-distilbert-base-v3 + - sentence-transformers/msmarco-distilbert-base-v4 + - sentence-transformers/msmarco-distilbert-cos-v5 + - sentence-transformers/msmarco-distilbert-dot-v5 + - sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-lng-aligned + - sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-trained-scratch + - sentence-transformers/msmarco-distilroberta-base-v2 + - sentence-transformers/msmarco-roberta-base-ance-firstp + - sentence-transformers/msmarco-roberta-base-v2 + - sentence-transformers/msmarco-roberta-base-v3 + - sentence-transformers/multi-qa-MiniLM-L6-cos-v1 + - sentence-transformers/nli-mpnet-base-v2 + - sentence-transformers/nli-roberta-base-v2 + - sentence-transformers/nq-distilbert-base-v1 + - sentence-transformers/paraphrase-MiniLM-L12-v2 + - sentence-transformers/paraphrase-MiniLM-L3-v2 + - sentence-transformers/paraphrase-MiniLM-L6-v2 + - sentence-transformers/paraphrase-TinyBERT-L6-v2 + - sentence-transformers/paraphrase-albert-base-v2 + - sentence-transformers/paraphrase-albert-small-v2 + - sentence-transformers/paraphrase-distilroberta-base-v1 + - sentence-transformers/paraphrase-distilroberta-base-v2 + - sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 + - sentence-transformers/paraphrase-multilingual-mpnet-base-v2 + - sentence-transformers/paraphrase-xlm-r-multilingual-v1 + - sentence-transformers/quora-distilbert-base + - sentence-transformers/quora-distilbert-multilingual + - sentence-transformers/sentence-t5-base + - sentence-transformers/sentence-t5-large + - sentence-transformers/sentence-t5-xxl + - sentence-transformers/sentence-t5-xl + - sentence-transformers/stsb-distilroberta-base-v2 + - sentence-transformers/stsb-mpnet-base-v2 + - sentence-transformers/stsb-roberta-base-v2 + - sentence-transformers/stsb-xlm-r-multilingual + - sentence-transformers/xlm-r-distilroberta-base-paraphrase-v1 + - sentence-transformers/clip-ViT-L-14 + - sentence-transformers/clip-ViT-B-16 + - sentence-transformers/use-cmlm-multilingual + - sentence-transformers/all-MiniLM-L12-v1 + ``` + +!!! info + You can also load many other model architectures from the library. For example models from sources such as BAAI, nomic, salesforce research, etc. + See this HF hub page for all [supported models](https://huggingface.co/models?library=sentence-transformers). + +!!! 
note "BAAI Embeddings example" + Here is an example that uses BAAI embedding model from the HuggingFace Hub [supported models](https://huggingface.co/models?library=sentence-transformers) + ```python + import lancedb + from lancedb.pydantic import LanceModel, Vector + from lancedb.embeddings import get_registry + + db = lancedb.connect("/tmp/db") + model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") + + class Words(LanceModel): + text: str = model.SourceField() + vector: Vector(model.ndims()) = model.VectorField() + + table = db.create_table("words", schema=Words) + table.add( + [ + {"text": "hello world"}, + {"text": "goodbye world"} + ] + ) + + query = "greetings" + actual = table.search(query).limit(1).to_pydantic(Words)[0] + print(actual.text) + ``` +Visit sentence-transformers [HuggingFace HUB](https://huggingface.co/sentence-transformers) page for more information on the available models. + diff --git a/docs/src/embeddings/default_embedding_functions.md b/docs/src/embeddings/default_embedding_functions.md index 95122437..ced97048 100644 --- a/docs/src/embeddings/default_embedding_functions.md +++ b/docs/src/embeddings/default_embedding_functions.md @@ -6,795 +6,25 @@ Contains the text embedding functions registered by default. * Embedding functions have an inbuilt rate limit handler wrapper for source and query embedding function calls that retry with exponential backoff. * Each `EmbeddingFunction` implementation automatically takes `max_retries` as an argument which has the default value of 7. -### Sentence transformers -Allows you to set parameters when registering a `sentence-transformers` object. +**Available Text Embeddings**: + +- [Sentence Transformers](available_embedding_models/text_embedding_functions/sentence_transformers.md) +- [Huggingface Embedding Models](available_embedding_models/text_embedding_functions/huggingface_embedding.md) +- [Ollama Embeddings](available_embedding_models/text_embedding_functions/ollama_embedding.md) +- [OpenAI Embeddings](available_embedding_models/text_embedding_functions/openai_embedding.md) +- [Instructor Embeddings](available_embedding_models/text_embedding_functions/instructor_embedding.md) +- [Gemini Embeddings](available_embedding_models/text_embedding_functions/gemini_embedding.md) +- [Cohere Embeddings](available_embedding_models/text_embedding_functions/cohere_embedding.md) +- [Jina Embeddings](available_embedding_models/text_embedding_functions/jina_embedding.md) +- [AWS Bedrock Text Embedding Functions](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md) +- [IBM Watsonx.ai Embeddings](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md) -!!! info - Sentence transformer embeddings are normalized by default. It is recommended to use normalized embeddings for similarity search. - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `all-MiniLM-L6-v2` | The name of the model | -| `device` | `str` | `cpu` | The device to run the model on (can be `cpu` or `gpu`) | -| `normalize` | `bool` | `True` | Whether to normalize the input text before feeding it to the model | -| `trust_remote_code` | `bool` | `False` | Whether to trust and execute remote code from the model's Huggingface repository | - - -??? "Check out available sentence-transformer models here!" 
- ```markdown - - sentence-transformers/all-MiniLM-L12-v2 - - sentence-transformers/paraphrase-mpnet-base-v2 - - sentence-transformers/gtr-t5-base - - sentence-transformers/LaBSE - - sentence-transformers/all-MiniLM-L6-v2 - - sentence-transformers/bert-base-nli-max-tokens - - sentence-transformers/bert-base-nli-mean-tokens - - sentence-transformers/bert-base-nli-stsb-mean-tokens - - sentence-transformers/bert-base-wikipedia-sections-mean-tokens - - sentence-transformers/bert-large-nli-cls-token - - sentence-transformers/bert-large-nli-max-tokens - - sentence-transformers/bert-large-nli-mean-tokens - - sentence-transformers/bert-large-nli-stsb-mean-tokens - - sentence-transformers/distilbert-base-nli-max-tokens - - sentence-transformers/distilbert-base-nli-mean-tokens - - sentence-transformers/distilbert-base-nli-stsb-mean-tokens - - sentence-transformers/distilroberta-base-msmarco-v1 - - sentence-transformers/distilroberta-base-msmarco-v2 - - sentence-transformers/nli-bert-base-cls-pooling - - sentence-transformers/nli-bert-base-max-pooling - - sentence-transformers/nli-bert-base - - sentence-transformers/nli-bert-large-cls-pooling - - sentence-transformers/nli-bert-large-max-pooling - - sentence-transformers/nli-bert-large - - sentence-transformers/nli-distilbert-base-max-pooling - - sentence-transformers/nli-distilbert-base - - sentence-transformers/nli-roberta-base - - sentence-transformers/nli-roberta-large - - sentence-transformers/roberta-base-nli-mean-tokens - - sentence-transformers/roberta-base-nli-stsb-mean-tokens - - sentence-transformers/roberta-large-nli-mean-tokens - - sentence-transformers/roberta-large-nli-stsb-mean-tokens - - sentence-transformers/stsb-bert-base - - sentence-transformers/stsb-bert-large - - sentence-transformers/stsb-distilbert-base - - sentence-transformers/stsb-roberta-base - - sentence-transformers/stsb-roberta-large - - sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens - - sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens - - sentence-transformers/xlm-r-base-en-ko-nli-ststb - - sentence-transformers/xlm-r-bert-base-nli-mean-tokens - - sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens - - sentence-transformers/xlm-r-large-en-ko-nli-ststb - - sentence-transformers/bert-base-nli-cls-token - - sentence-transformers/all-distilroberta-v1 - - sentence-transformers/multi-qa-MiniLM-L6-dot-v1 - - sentence-transformers/multi-qa-distilbert-cos-v1 - - sentence-transformers/multi-qa-distilbert-dot-v1 - - sentence-transformers/multi-qa-mpnet-base-cos-v1 - - sentence-transformers/multi-qa-mpnet-base-dot-v1 - - sentence-transformers/nli-distilroberta-base-v2 - - sentence-transformers/all-MiniLM-L6-v1 - - sentence-transformers/all-mpnet-base-v1 - - sentence-transformers/all-mpnet-base-v2 - - sentence-transformers/all-roberta-large-v1 - - sentence-transformers/allenai-specter - - sentence-transformers/average_word_embeddings_glove.6B.300d - - sentence-transformers/average_word_embeddings_glove.840B.300d - - sentence-transformers/average_word_embeddings_komninos - - sentence-transformers/average_word_embeddings_levy_dependency - - sentence-transformers/clip-ViT-B-32-multilingual-v1 - - sentence-transformers/clip-ViT-B-32 - - sentence-transformers/distilbert-base-nli-stsb-quora-ranking - - sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking - - sentence-transformers/distilroberta-base-paraphrase-v1 - - sentence-transformers/distiluse-base-multilingual-cased-v1 - - 
sentence-transformers/distiluse-base-multilingual-cased-v2 - - sentence-transformers/distiluse-base-multilingual-cased - - sentence-transformers/facebook-dpr-ctx_encoder-multiset-base - - sentence-transformers/facebook-dpr-ctx_encoder-single-nq-base - - sentence-transformers/facebook-dpr-question_encoder-multiset-base - - sentence-transformers/facebook-dpr-question_encoder-single-nq-base - - sentence-transformers/gtr-t5-large - - sentence-transformers/gtr-t5-xl - - sentence-transformers/gtr-t5-xxl - - sentence-transformers/msmarco-MiniLM-L-12-v3 - - sentence-transformers/msmarco-MiniLM-L-6-v3 - - sentence-transformers/msmarco-MiniLM-L12-cos-v5 - - sentence-transformers/msmarco-MiniLM-L6-cos-v5 - - sentence-transformers/msmarco-bert-base-dot-v5 - - sentence-transformers/msmarco-bert-co-condensor - - sentence-transformers/msmarco-distilbert-base-dot-prod-v3 - - sentence-transformers/msmarco-distilbert-base-tas-b - - sentence-transformers/msmarco-distilbert-base-v2 - - sentence-transformers/msmarco-distilbert-base-v3 - - sentence-transformers/msmarco-distilbert-base-v4 - - sentence-transformers/msmarco-distilbert-cos-v5 - - sentence-transformers/msmarco-distilbert-dot-v5 - - sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-lng-aligned - - sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-trained-scratch - - sentence-transformers/msmarco-distilroberta-base-v2 - - sentence-transformers/msmarco-roberta-base-ance-firstp - - sentence-transformers/msmarco-roberta-base-v2 - - sentence-transformers/msmarco-roberta-base-v3 - - sentence-transformers/multi-qa-MiniLM-L6-cos-v1 - - sentence-transformers/nli-mpnet-base-v2 - - sentence-transformers/nli-roberta-base-v2 - - sentence-transformers/nq-distilbert-base-v1 - - sentence-transformers/paraphrase-MiniLM-L12-v2 - - sentence-transformers/paraphrase-MiniLM-L3-v2 - - sentence-transformers/paraphrase-MiniLM-L6-v2 - - sentence-transformers/paraphrase-TinyBERT-L6-v2 - - sentence-transformers/paraphrase-albert-base-v2 - - sentence-transformers/paraphrase-albert-small-v2 - - sentence-transformers/paraphrase-distilroberta-base-v1 - - sentence-transformers/paraphrase-distilroberta-base-v2 - - sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 - - sentence-transformers/paraphrase-multilingual-mpnet-base-v2 - - sentence-transformers/paraphrase-xlm-r-multilingual-v1 - - sentence-transformers/quora-distilbert-base - - sentence-transformers/quora-distilbert-multilingual - - sentence-transformers/sentence-t5-base - - sentence-transformers/sentence-t5-large - - sentence-transformers/sentence-t5-xxl - - sentence-transformers/sentence-t5-xl - - sentence-transformers/stsb-distilroberta-base-v2 - - sentence-transformers/stsb-mpnet-base-v2 - - sentence-transformers/stsb-roberta-base-v2 - - sentence-transformers/stsb-xlm-r-multilingual - - sentence-transformers/xlm-r-distilroberta-base-paraphrase-v1 - - sentence-transformers/clip-ViT-L-14 - - sentence-transformers/clip-ViT-B-16 - - sentence-transformers/use-cmlm-multilingual - - sentence-transformers/all-MiniLM-L12-v1 - ``` - -!!! info - You can also load many other model architectures from the library. For example models from sources such as BAAI, nomic, salesforce research, etc. - See this HF hub page for all [supported models](https://huggingface.co/models?library=sentence-transformers). - -!!! 
note "BAAI Embeddings example" - Here is an example that uses BAAI embedding model from the HuggingFace Hub [supported models](https://huggingface.co/models?library=sentence-transformers) - ```python - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import get_registry - - db = lancedb.connect("/tmp/db") - model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") - - class Words(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - - table = db.create_table("words", schema=Words) - table.add( - [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] - ) - - query = "greetings" - actual = table.search(query).limit(1).to_pydantic(Words)[0] - print(actual.text) - ``` -Visit sentence-transformers [HuggingFace HUB](https://huggingface.co/sentence-transformers) page for more information on the available models. - - -### Huggingface embedding models -We offer support for all huggingface models (which can be loaded via [transformers](https://huggingface.co/docs/transformers/en/index) library). The default model is `colbert-ir/colbertv2.0` which also has its own special callout - `registry.get("colbert")` - -Example usage - -```python -import lancedb -import pandas as pd - -from lancedb.embeddings import get_registry -from lancedb.pydantic import LanceModel, Vector - -model = get_registry().get("huggingface").create(name='facebook/bart-base') - -class Words(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - -df = pd.DataFrame({"text": ["hi hello sayonara", "goodbye world"]}) -table = db.create_table("greets", schema=Words) -table.add(df) -query = "old greeting" -actual = table.search(query).limit(1).to_pydantic(Words)[0] -print(actual.text) -``` - - -### Ollama embeddings -Generate embeddings via the [ollama](https://github.com/ollama/ollama-python) python library. More details: - -- [Ollama docs on embeddings](https://github.com/ollama/ollama/blob/main/docs/api.md#generate-embeddings) -- [Ollama blog on embeddings](https://ollama.com/blog/embedding-models) - -| Parameter | Type | Default Value | Description | -|------------------------|----------------------------|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| -| `name` | `str` | `nomic-embed-text` | The name of the model. | -| `host` | `str` | `http://localhost:11434` | The Ollama host to connect to. | -| `options` | `ollama.Options` or `dict` | `None` | Additional model parameters listed in the documentation for the Modelfile such as `temperature`. | -| `keep_alive` | `float` or `str` | `"5m"` | Controls how long the model will stay loaded into memory following the request. | -| `ollama_client_kwargs` | `dict` | `{}` | kwargs that can be past to the `ollama.Client`. 
| - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect("/tmp/db") -func = get_registry().get("ollama").create(name="nomic-embed-text") - -class Words(LanceModel): - text: str = func.SourceField() - vector: Vector(func.ndims()) = func.VectorField() - -table = db.create_table("words", schema=Words, mode="overwrite") -table.add([ - {"text": "hello world"}, - {"text": "goodbye world"} -]) - -query = "greetings" -actual = table.search(query).limit(1).to_pydantic(Words)[0] -print(actual.text) -``` - - -### OpenAI embeddings -LanceDB registers the OpenAI embeddings function in the registry by default, as `openai`. Below are the parameters that you can customize when creating the instances: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"text-embedding-ada-002"` | The name of the model. | -| `dim` | `int` | Model default | For OpenAI's newer text-embedding-3 model, we can specify a dimensionality that is smaller than the 1536 size. This feature supports it | - - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect("/tmp/db") -func = get_registry().get("openai").create(name="text-embedding-ada-002") - -class Words(LanceModel): - text: str = func.SourceField() - vector: Vector(func.ndims()) = func.VectorField() - -table = db.create_table("words", schema=Words, mode="overwrite") -table.add( - [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] - ) - -query = "greetings" -actual = table.search(query).limit(1).to_pydantic(Words)[0] -print(actual.text) -``` - -### Instructor Embeddings -[Instructor](https://instructor-embedding.github.io/) is an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g. classification, retrieval, clustering, text evaluation, etc.) and domains (e.g. science, finance, etc.) by simply providing the task instruction, without any finetuning. - -If you want to calculate customized embeddings for specific sentences, you can follow the unified template to write instructions. - -!!! info - Represent the `domain` `text_type` for `task_objective`: - - * `domain` is optional, and it specifies the domain of the text, e.g. science, finance, medicine, etc. - * `text_type` is required, and it specifies the encoding unit, e.g. sentence, document, paragraph, etc. - * `task_objective` is optional, and it specifies the objective of embedding, e.g. retrieve a document, classify the sentence, etc. - -More information about the model can be found at the [source URL](https://github.com/xlang-ai/instructor-embedding). 
- -| Argument | Type | Default | Description | -|---|---|---|---| -| `name` | `str` | "hkunlp/instructor-base" | The name of the model to use | -| `batch_size` | `int` | `32` | The batch size to use when generating embeddings | -| `device` | `str` | `"cpu"` | The device to use when generating embeddings | -| `show_progress_bar` | `bool` | `True` | Whether to show a progress bar when generating embeddings | -| `normalize_embeddings` | `bool` | `True` | Whether to normalize the embeddings | -| `quantize` | `bool` | `False` | Whether to quantize the model | -| `source_instruction` | `str` | `"represent the docuement for retreival"` | The instruction for the source column | -| `query_instruction` | `str` | `"represent the document for retreiving the most similar documents"` | The instruction for the query | - - - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry, InstuctorEmbeddingFunction - -instructor = get_registry().get("instructor").create( - source_instruction="represent the docuement for retreival", - query_instruction="represent the document for retreiving the most similar documents" - ) - -class Schema(LanceModel): - vector: Vector(instructor.ndims()) = instructor.VectorField() - text: str = instructor.SourceField() - -db = lancedb.connect("~/.lancedb") -tbl = db.create_table("test", schema=Schema, mode="overwrite") - -texts = [{"text": "Capitalism has been dominant in the Western world since the end of feudalism, but most feel[who?] that..."}, - {"text": "The disparate impact theory is especially controversial under the Fair Housing Act because the Act..."}, - {"text": "Disparate impact in United States labor law refers to practices in employment, housing, and other areas that.."}] - -tbl.add(texts) -``` - -### Gemini Embeddings -With Google's Gemini, you can represent text (words, sentences, and blocks of text) in a vectorized form, making it easier to compare and contrast embeddings. For example, two texts that share a similar subject matter or sentiment should have similar embeddings, which can be identified through mathematical comparison techniques such as cosine similarity. For more on how and why you should use embeddings, refer to the Embeddings guide. -The Gemini Embedding Model API supports various task types: - -| Task Type | Description | -|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------| -| "`retrieval_query`" | Specifies the given text is a query in a search/retrieval setting. | -| "`retrieval_document`" | Specifies the given text is a document in a search/retrieval setting. Using this task type requires a title but is automatically proided by Embeddings API | -| "`semantic_similarity`" | Specifies the given text will be used for Semantic Textual Similarity (STS). | -| "`classification`" | Specifies that the embeddings will be used for classification. | -| "`clusering`" | Specifies that the embeddings will be used for clustering. 
| - - -Usage Example: - -```python -import lancedb -import pandas as pd -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - - -model = get_registry().get("gemini-text").create() - -class TextModel(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - -df = pd.DataFrame({"text": ["hello world", "goodbye world"]}) -db = lancedb.connect("~/.lancedb") -tbl = db.create_table("test", schema=TextModel, mode="overwrite") - -tbl.add(df) -rs = tbl.search("hello").limit(1).to_pandas() -``` - -### Cohere Embeddings -Using cohere API requires cohere package, which can be installed using `pip install cohere`. Cohere embeddings are used to generate embeddings for text data. The embeddings can be used for various tasks like semantic search, clustering, and classification. -You also need to set the `COHERE_API_KEY` environment variable to use the Cohere API. - -Supported models are: -``` - * embed-english-v3.0 - * embed-multilingual-v3.0 - * embed-english-light-v3.0 - * embed-multilingual-light-v3.0 - * embed-english-v2.0 - * embed-english-light-v2.0 - * embed-multilingual-v2.0 -``` - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"embed-english-v2.0"` | The model ID of the cohere model to use. Supported base models for Text Embeddings: embed-english-v3.0, embed-multilingual-v3.0, embed-english-light-v3.0, embed-multilingual-light-v3.0, embed-english-v2.0, embed-english-light-v2.0, embed-multilingual-v2.0 | -| `source_input_type` | `str` | `"search_document"` | The type of input data to be used for the source column. | -| `query_input_type` | `str` | `"search_query"` | The type of input data to be used for the query. | - -Cohere supports following input types: - -| Input Type | Description | -|-------------------------|---------------------------------------| -| "`search_document`" | Used for embeddings stored in a vector| -| | database for search use-cases. | -| "`search_query`" | Used for embeddings of search queries | -| | run against a vector DB | -| "`semantic_similarity`" | Specifies the given text will be used | -| | for Semantic Textual Similarity (STS) | -| "`classification`" | Used for embeddings passed through a | -| | text classifier. | -| "`clustering`" | Used for the embeddings run through a | -| | clustering algorithm | - -Usage Example: - - ```python - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import EmbeddingFunctionRegistry - - cohere = EmbeddingFunctionRegistry - .get_instance() - .get("cohere") - .create(name="embed-multilingual-v2.0") - - class TextModel(LanceModel): - text: str = cohere.SourceField() - vector: Vector(cohere.ndims()) = cohere.VectorField() - - data = [ { "text": "hello world" }, - { "text": "goodbye world" }] - - db = lancedb.connect("~/.lancedb") - tbl = db.create_table("test", schema=TextModel, mode="overwrite") - - tbl.add(data) - ``` - -### Jina Embeddings -Jina embeddings are used to generate embeddings for text and image data. -You also need to set the `JINA_API_KEY` environment variable to use the Jina API. 
- -You can find a list of supported models under [https://jina.ai/embeddings/](https://jina.ai/embeddings/) - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"jina-clip-v1"` | The model ID of the jina model to use | - -Usage Example: - -```python - import os - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import EmbeddingFunctionRegistry - - os.environ['JINA_API_KEY'] = 'jina_*' - - jina_embed = EmbeddingFunctionRegistry.get_instance().get("jina").create(name="jina-embeddings-v2-base-en") - - - class TextModel(LanceModel): - text: str = jina_embed.SourceField() - vector: Vector(jina_embed.ndims()) = jina_embed.VectorField() - - - data = [{"text": "hello world"}, - {"text": "goodbye world"}] - - db = lancedb.connect("~/.lancedb-2") - tbl = db.create_table("test", schema=TextModel, mode="overwrite") - - tbl.add(data) -``` - -### AWS Bedrock Text Embedding Functions -AWS Bedrock supports multiple base models for generating text embeddings. You need to setup the AWS credentials to use this embedding function. -You can do so by using `awscli` and also add your session_token: -```shell -aws configure -aws configure set aws_session_token "" -``` -to ensure that the credentials are set up correctly, you can run the following command: -```shell -aws sts get-caller-identity -``` - -Supported Embedding modelIDs are: -* `amazon.titan-embed-text-v1` -* `cohere.embed-english-v3` -* `cohere.embed-multilingual-v3` - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| **name** | str | "amazon.titan-embed-text-v1" | The model ID of the bedrock model to use. Supported base models for Text Embeddings: amazon.titan-embed-text-v1, cohere.embed-english-v3, cohere.embed-multilingual-v3 | -| **region** | str | "us-east-1" | Optional name of the AWS Region in which the service should be called (e.g., "us-east-1"). | -| **profile_name** | str | None | Optional name of the AWS profile to use for calling the Bedrock service. If not specified, the default profile will be used. | -| **assumed_role** | str | None | Optional ARN of an AWS IAM role to assume for calling the Bedrock service. If not specified, the current active credentials will be used. | -| **role_session_name** | str | "lancedb-embeddings" | Optional name of the AWS IAM role session to use for calling the Bedrock service. If not specified, a "lancedb-embeddings" name will be used. | -| **runtime** | bool | True | Optional choice of getting different client to perform operations with the Amazon Bedrock service. | -| **max_retries** | int | 7 | Optional number of retries to perform when a request fails. | - -Usage Example: - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -model = get_registry().get("bedrock-text").create() - -class TextModel(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - -df = pd.DataFrame({"text": ["hello world", "goodbye world"]}) -db = lancedb.connect("tmp_path") -tbl = db.create_table("test", schema=TextModel, mode="overwrite") - -tbl.add(df) -rs = tbl.search("hello").limit(1).to_pandas() -``` - -# IBM watsonx.ai Embeddings - -Generate text embeddings using IBM's watsonx.ai platform. 
- -## Supported Models - -You can find a list of supported models at [IBM watsonx.ai Documentation](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx). The currently supported model names are: - -- `ibm/slate-125m-english-rtrvr` -- `ibm/slate-30m-english-rtrvr` -- `sentence-transformers/all-minilm-l12-v2` -- `intfloat/multilingual-e5-large` - -## Parameters - -The following parameters can be passed to the `create` method: - -| Parameter | Type | Default Value | Description | -|------------|----------|----------------------------------|-----------------------------------------------------------| -| name | str | "ibm/slate-125m-english-rtrvr" | The model ID of the watsonx.ai model to use | -| api_key | str | None | Optional IBM Cloud API key (or set `WATSONX_API_KEY`) | -| project_id | str | None | Optional watsonx project ID (or set `WATSONX_PROJECT_ID`) | -| url | str | None | Optional custom URL for the watsonx.ai instance | -| params | dict | None | Optional additional parameters for the embedding model | - -## Usage Example - -First, the watsonx.ai library is an optional dependency, so must be installed seperately: - -``` -pip install ibm-watsonx-ai -``` - -Optionally set environment variables (if not passing credentials to `create` directly): - -```sh -export WATSONX_API_KEY="YOUR_WATSONX_API_KEY" -export WATSONX_PROJECT_ID="YOUR_WATSONX_PROJECT_ID" -``` - -```python -import os -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import EmbeddingFunctionRegistry - -watsonx_embed = EmbeddingFunctionRegistry - .get_instance() - .get("watsonx") - .create( - name="ibm/slate-125m-english-rtrvr", - # Uncomment and set these if not using environment variables - # api_key="your_api_key_here", - # project_id="your_project_id_here", - # url="your_watsonx_url_here", - # params={...}, - ) - -class TextModel(LanceModel): - text: str = watsonx_embed.SourceField() - vector: Vector(watsonx_embed.ndims()) = watsonx_embed.VectorField() - -data = [ - {"text": "hello world"}, - {"text": "goodbye world"}, -] - -db = lancedb.connect("~/.lancedb") -tbl = db.create_table("watsonx_test", schema=TextModel, mode="overwrite") - -tbl.add(data) - -rs = tbl.search("hello").limit(1).to_pandas() -print(rs) -``` ## Multi-modal embedding functions Multi-modal embedding functions allow you to query your table using both images and text. -### OpenClip embeddings -We support CLIP model embeddings using the open source alternative, [open-clip](https://github.com/mlfoundations/open_clip) which supports various customizations. It is registered as `open-clip` and supports the following customizations: +**Available Multi-modal Embeddings** : -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"ViT-B-32"` | The name of the model. | -| `pretrained` | `str` | `"laion2b_s34b_b79k"` | The name of the pretrained model to load. | -| `device` | `str` | `"cpu"` | The device to run the model on. Can be `"cpu"` or `"gpu"`. | -| `batch_size` | `int` | `64` | The number of images to process in a batch. | -| `normalize` | `bool` | `True` | Whether to normalize the input images before feeding them to the model. | - -This embedding function supports ingesting images as both bytes and urls. You can query them using both test and other images. - -!!! info - LanceDB supports ingesting images directly from accessible links. 
- -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect(tmp_path) -func = get_registry.get("open-clip").create() - -class Images(LanceModel): - label: str - image_uri: str = func.SourceField() # image uri as the source - image_bytes: bytes = func.SourceField() # image bytes as the source - vector: Vector(func.ndims()) = func.VectorField() # vector column - vec_from_bytes: Vector(func.ndims()) = func.VectorField() # Another vector column - -table = db.create_table("images", schema=Images) -labels = ["cat", "cat", "dog", "dog", "horse", "horse"] -uris = [ - "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg", - "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg", - "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg", - "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg", - "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg", - "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg", -] -# get each uri as bytes -image_bytes = [requests.get(uri).content for uri in uris] -table.add( - pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": image_bytes}) -) -``` -Now we can search using text from both the default vector column and the custom vector column -```python - -# text search -actual = table.search("man's best friend").limit(1).to_pydantic(Images)[0] -print(actual.label) # prints "dog" - -frombytes = ( - table.search("man's best friend", vector_column_name="vec_from_bytes") - .limit(1) - .to_pydantic(Images)[0] -) -print(frombytes.label) - -``` - -Because we're using a multi-modal embedding function, we can also search using images - -```python -# image search -query_image_uri = "http://farm1.staticflickr.com/200/467715466_ed4a31801f_z.jpg" -image_bytes = requests.get(query_image_uri).content -query_image = Image.open(io.BytesIO(image_bytes)) -actual = table.search(query_image).limit(1).to_pydantic(Images)[0] -print(actual.label == "dog") - -# image search using a custom vector column -other = ( - table.search(query_image, vector_column_name="vec_from_bytes") - .limit(1) - .to_pydantic(Images)[0] -) -print(actual.label) - -``` - -### Imagebind embeddings -We have support for [imagebind](https://github.com/facebookresearch/ImageBind) model embeddings. You can download our version of the packaged model via - `pip install imagebind-packaged==0.1.2`. - -This function is registered as `imagebind` and supports Audio, Video and Text modalities(extending to Thermal,Depth,IMU data): - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"imagebind_huge"` | Name of the model. | -| `device` | `str` | `"cpu"` | The device to run the model on. Can be `"cpu"` or `"gpu"`. | -| `normalize` | `bool` | `False` | set to `True` to normalize your inputs before model ingestion. 
| - -Below is an example demonstrating how the API works: - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect(tmp_path) -func = get_registry.get("imagebind").create() - -class ImageBindModel(LanceModel): - text: str - image_uri: str = func.SourceField() - audio_path: str - vector: Vector(func.ndims()) = func.VectorField() - -# add locally accessible image paths -text_list=["A dog.", "A car", "A bird"] -image_paths=[".assets/dog_image.jpg", ".assets/car_image.jpg", ".assets/bird_image.jpg"] -audio_paths=[".assets/dog_audio.wav", ".assets/car_audio.wav", ".assets/bird_audio.wav"] - -# Load data -inputs = [ - {"text": a, "audio_path": b, "image_uri": c} - for a, b, c in zip(text_list, audio_paths, image_paths) -] - -#create table and add data -table = db.create_table("img_bind", schema=ImageBindModel) -table.add(inputs) -``` - -Now, we can search using any modality: - -#### image search -```python -query_image = "./assets/dog_image2.jpg" #download an image and enter that path here -actual = table.search(query_image).limit(1).to_pydantic(ImageBindModel)[0] -print(actual.text == "dog") -``` -#### audio search - -```python -query_audio = "./assets/car_audio2.wav" #download an audio clip and enter path here -actual = table.search(query_audio).limit(1).to_pydantic(ImageBindModel)[0] -print(actual.text == "car") -``` -#### Text search -You can add any input query and fetch the result as follows: -```python -query = "an animal which flies and tweets" -actual = table.search(query).limit(1).to_pydantic(ImageBindModel)[0] -print(actual.text == "bird") -``` - -If you have any questions about the embeddings API, supported models, or see a relevant model missing, please raise an issue [on GitHub](https://github.com/lancedb/lancedb/issues). 
- -### Jina Embeddings -Jina embeddings can also be used to embed both text and image data, only some of the models support image data and you can check the list -under [https://jina.ai/embeddings/](https://jina.ai/embeddings/) - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"jina-clip-v1"` | The model ID of the jina model to use | - -Usage Example: - -```python - import os - import requests - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import get_registry - import pandas as pd - - os.environ['JINA_API_KEY'] = 'jina_*' - - db = lancedb.connect("~/.lancedb") - func = get_registry().get("jina").create() - - - class Images(LanceModel): - label: str - image_uri: str = func.SourceField() # image uri as the source - image_bytes: bytes = func.SourceField() # image bytes as the source - vector: Vector(func.ndims()) = func.VectorField() # vector column - vec_from_bytes: Vector(func.ndims()) = func.VectorField() # Another vector column - - - table = db.create_table("images", schema=Images) - labels = ["cat", "cat", "dog", "dog", "horse", "horse"] - uris = [ - "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg", - "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg", - "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg", - "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg", - "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg", - "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg", - ] - # get each uri as bytes - image_bytes = [requests.get(uri).content for uri in uris] - table.add( - pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": image_bytes}) - ) -``` +- [OpenClip Embeddings](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md) +- [Imagebind Embeddings](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md) +- [Jina Embeddings](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md) \ No newline at end of file diff --git a/docs/test/md_testing.py b/docs/test/md_testing.py index 7f2f3a99..1923829d 100755 --- a/docs/test/md_testing.py +++ b/docs/test/md_testing.py @@ -19,6 +19,8 @@ excluded_globs = [ "../src/hybrid_search/hybrid_search.md", "../src/reranking/*.md", "../src/guides/tuning_retrievers/*.md", + "../src/embeddings/available_embedding_models/text_embedding_functions/*.md", + "../src/embeddings/available_embedding_models/multimodal_embedding_functions/*.md" ] python_prefix = "py" From a76186ee83f7fc837535299c1daa32b0a60a8413 Mon Sep 17 00:00:00 2001 From: Gagan Bhullar Date: Tue, 27 Aug 2024 18:03:42 -0600 Subject: [PATCH 25/34] fix(node): read consistency level fix (#1567) PR fixes #1565 --- node/src/test/test.ts | 24 ++++++++++++++++++++++++ rust/ffi/node/src/lib.rs | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/node/src/test/test.ts b/node/src/test/test.ts index 32377a39..a970bd8e 100644 --- a/node/src/test/test.ts +++ b/node/src/test/test.ts @@ -93,6 +93,30 @@ describe("LanceDB client", function () { const con = await lancedb.connect(uri); assert.deepEqual(await con.tableNames(), ["vectors"]); }); + + it("read consistency level", async function () { + const uri = await createTestDB(); + const db1 = await lancedb.connect({ uri }); + const table1 = await db1.openTable("vectors"); + + const db2 = await lancedb.connect({ + uri, + 
readConsistencyInterval: 0 + }) + const table2 = await db2.openTable("vectors"); + + assert.equal(await table2.countRows(), 2); + await table1.add([ + { + id: 3, + name: 'name_2', + price: 10, + is_active: true, + vector: [ 0, 0.1 ] + }, + ]); + assert.equal(await table2.countRows(), 3); + }); }); describe("when querying an existing dataset", function () { diff --git a/rust/ffi/node/src/lib.rs b/rust/ffi/node/src/lib.rs index 54223255..4e4105fa 100644 --- a/rust/ffi/node/src/lib.rs +++ b/rust/ffi/node/src/lib.rs @@ -49,7 +49,7 @@ fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> { fn database_new(mut cx: FunctionContext) -> JsResult { let path = cx.argument::(0)?.value(&mut cx); let read_consistency_interval = cx - .argument_opt(5) + .argument_opt(2) .and_then(|arg| arg.downcast::(&mut cx).ok()) .map(|v| v.value(&mut cx)) .map(std::time::Duration::from_secs_f64); From dd1c16bbafd34a350008fe31f8e43579532d8c28 Mon Sep 17 00:00:00 2001 From: Rithik Kumar <46047011+rithikJha@users.noreply.github.com> Date: Wed, 28 Aug 2024 16:07:57 +0530 Subject: [PATCH 26/34] docs: fix links, convert backslash to forward slash in mkdocs.yml (#1571) Co-authored-by: Ayush Chaurasia --- docs/mkdocs.yml | 52 ++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index e4346bcd..9685303e 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -122,20 +122,20 @@ nav: - Available models: - Overview: embeddings/default_embedding_functions.md - Text Embedding Functions: - - Sentence Transformers: embeddings\available_embedding_models\text_embedding_functions\sentence_transformers.md - - Huggingface Embedding Models: embeddings\available_embedding_models\text_embedding_functions\huggingface_embedding.md - - Ollama Embeddings: embeddings\available_embedding_models\text_embedding_functions\ollama_embedding.md - - OpenAI Embeddings: embeddings\available_embedding_models\text_embedding_functions\openai_embedding.md - - Instructor Embeddings: embeddings\available_embedding_models\text_embedding_functions\instructor_embedding.md - - Gemini Embeddings: embeddings\available_embedding_models\text_embedding_functions\gemini_embedding.md - - Cohere Embeddings: embeddings\available_embedding_models\text_embedding_functions\cohere_embedding.md - - Jina Embeddings: embeddings\available_embedding_models\text_embedding_functions\jina_embedding.md - - AWS Bedrock Text Embedding Functions: embeddings\available_embedding_models\text_embedding_functions\aws_bedrock_embedding.md - - IBM watsonx.ai Embeddings: embeddings\available_embedding_models\text_embedding_functions\ibm_watsonx_ai_embedding.md + - Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md + - Huggingface Embedding Models: embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md + - Ollama Embeddings: embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md + - OpenAI Embeddings: embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md + - Instructor Embeddings: embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md + - Gemini Embeddings: embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md + - Cohere Embeddings: embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md + - Jina Embeddings: 
embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md + - AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md + - IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md - Multimodal Embedding Functions: - - OpenClip embeddings: embeddings\available_embedding_models\multimodal_embedding_functions\openclip_embedding.md - - Imagebind embeddings: embeddings\available_embedding_models\multimodal_embedding_functions\imagebind_embedding.md - - Jina Embeddings: embeddings\available_embedding_models\multimodal_embedding_functions\jina_multimodal_embedding.md + - OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md + - Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md + - Jina Embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md - User-defined embedding functions: embeddings/custom_embedding_function.md - "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb - "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb @@ -234,20 +234,20 @@ nav: - Available models: - Overview: embeddings/default_embedding_functions.md - Text Embedding Functions: - - Sentence Transformers: embeddings\available_embedding_models\text_embedding_functions\sentence_transformers.md - - Huggingface Embedding Models: embeddings\available_embedding_models\text_embedding_functions\huggingface_embedding.md - - Ollama Embeddings: embeddings\available_embedding_models\text_embedding_functions\ollama_embedding.md - - OpenAI Embeddings: embeddings\available_embedding_models\text_embedding_functions\openai_embedding.md - - Instructor Embeddings: embeddings\available_embedding_models\text_embedding_functions\instructor_embedding.md - - Gemini Embeddings: embeddings\available_embedding_models\text_embedding_functions\gemini_embedding.md - - Cohere Embeddings: embeddings\available_embedding_models\text_embedding_functions\cohere_embedding.md - - Jina Embeddings: embeddings\available_embedding_models\text_embedding_functions\jina_embedding.md - - AWS Bedrock Text Embedding Functions: embeddings\available_embedding_models\text_embedding_functions\aws_bedrock_embedding.md - - IBM watsonx.ai Embeddings: embeddings\available_embedding_models\text_embedding_functions\ibm_watsonx_ai_embedding.md + - Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md + - Huggingface Embedding Models: embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md + - Ollama Embeddings: embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md + - OpenAI Embeddings: embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md + - Instructor Embeddings: embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md + - Gemini Embeddings: embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md + - Cohere Embeddings: embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md + - Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md + - AWS Bedrock Text Embedding Functions: 
embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md + - IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md - Multimodal Embedding Functions: - - OpenClip embeddings: embeddings\available_embedding_models\multimodal_embedding_functions\openclip_embedding.md - - Imagebind embeddings: embeddings\available_embedding_models\multimodal_embedding_functions\imagebind_embedding.md - - Jina Embeddings: embeddings\available_embedding_models\multimodal_embedding_functions\jina_multimodal_embedding.md + - OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md + - Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md + - Jina Embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md - User-defined embedding functions: embeddings/custom_embedding_function.md - "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb - "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb From 6f6eb170a903aad913f1ad697fbd2c73b401b02c Mon Sep 17 00:00:00 2001 From: Rithik Kumar <46047011+rithikJha@users.noreply.github.com> Date: Thu, 29 Aug 2024 13:48:10 +0530 Subject: [PATCH 27/34] docs: revamp Python example: Overview page and remove redundant examples and notebooks (#1574) before: ![Screenshot 2024-08-29 131656](https://github.com/user-attachments/assets/81cb5d70-5dff-4e57-8bbe-3461327aed7d) After: ![Screenshot 2024-08-29 131715](https://github.com/user-attachments/assets/62109a37-7f66-4fd4-90ed-906a85472117) --------- Co-authored-by: Ayush Chaurasia --- docs/mkdocs.yml | 6 - docs/src/examples/examples_python.md | 33 +- docs/src/notebooks/code_qa_bot.ipynb | 378 ---------- docs/src/notebooks/multimodal_search.ipynb | 297 -------- .../notebooks/youtube_transcript_search.ipynb | 702 ------------------ 5 files changed, 19 insertions(+), 1397 deletions(-) delete mode 100644 docs/src/notebooks/code_qa_bot.ipynb delete mode 100644 docs/src/notebooks/multimodal_search.ipynb delete mode 100644 docs/src/notebooks/youtube_transcript_search.ipynb diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 9685303e..0b6be3e5 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -168,9 +168,6 @@ nav: - AI Agent: examples/python_examples/aiagent.md - Recommender System: examples/python_examples/recommendersystem.md - Miscellaneous: - - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb - - Multimodal search using CLIP: notebooks/multimodal_search.ipynb - Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md - Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md - πŸ‘Ύ JavaScript: @@ -276,9 +273,6 @@ nav: - AI Agent: examples/python_examples/aiagent.md - Recommender System: examples/python_examples/recommendersystem.md - Miscellaneous: - - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb - - Multimodal search using CLIP: notebooks/multimodal_search.ipynb - Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md - Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md - πŸ‘Ύ JavaScript: diff --git 
a/docs/src/examples/examples_python.md b/docs/src/examples/examples_python.md
index 6c4a056a..2c7d17d6 100644
--- a/docs/src/examples/examples_python.md
+++ b/docs/src/examples/examples_python.md
@@ -1,17 +1,22 @@
-# Examples: Python
+# Overview: Python Examples
 
-To help you get started, we provide some examples, projects and applications that use the LanceDB Python API. You can always find the latest examples in our [VectorDB Recipes](https://github.com/lancedb/vectordb-recipes) repository.
+To help you get started, we provide some examples, projects, and applications that use the LanceDB Python API. These examples are designed to get you right into the code with minimal introduction, enabling you to move from an idea to a proof of concept in minutes.
 
-| Example | Interactive Envs | Scripts |
-|-------- | ---------------- | ------ |
-| | | |
-| [Youtube transcript search bot](https://github.com/lancedb/vectordb-recipes/tree/main/examples/youtube_bot/) | Open In Colab| [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/youtube_bot/main.py)|
-| [Langchain: Code Docs QA bot](https://github.com/lancedb/vectordb-recipes/tree/main/examples/Code-Documentation-QA-Bot/) | Open In Colab| [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/Code-Documentation-QA-Bot/main.py) |
-| [AI Agents: Reducing Hallucination](https://github.com/lancedb/vectordb-recipes/tree/main/examples/reducing_hallucinations_ai_agents/) | Open In Colab| [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/reducing_hallucinations_ai_agents/main.py)|
-| [Multimodal CLIP: DiffusionDB](https://github.com/lancedb/vectordb-recipes/tree/main/examples/multimodal_clip/) | Open In Colab| [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/multimodal_clip/main.py) |
-| [Multimodal CLIP: Youtube videos](https://github.com/lancedb/vectordb-recipes/tree/main/examples/multimodal_video_search/) | Open In Colab| [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/multimodal_video_search/main.py) |
-| [Movie Recommender](https://github.com/lancedb/vectordb-recipes/tree/main/examples/movie-recommender/) | Open In Colab | [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/movie-recommender/main.py) |
-| [Audio Search](https://github.com/lancedb/vectordb-recipes/tree/main/examples/audio_search/) | Open In Colab | [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/audio_search/main.py) |
-| [Multimodal Image + Text Search](https://github.com/lancedb/vectordb-recipes/tree/main/examples/multimodal_search/) | Open In Colab | [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/multimodal_search/main.py) |
-| [Evaluating Prompts with Prompttools](https://github.com/lancedb/vectordb-recipes/tree/main/examples/prompttools-eval-prompts/) | Open In Colab | |
+
+You can find the latest examples in our [VectorDB Recipes](https://github.com/lancedb/vectordb-recipes) repository.
+
+**Introduction**
+
+Explore applied examples available as Colab notebooks or Python scripts to integrate into your applications. You can also check out our blog posts related to each example for a deeper understanding.
+
+| Explore | Description |
+|----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [Build from Scratch with LanceDB πŸ› οΈπŸš€](python_examples/build_from_scratch.md) | Start building your GenAI applications from the ground up using LanceDB's efficient vector-based document retrieval capabilities! Get started quickly with a solid foundation. |
+| [Multimodal Search with LanceDB πŸ€Ήβ€β™‚οΈπŸ”](python_examples/multimodal.md) | Combine text and image queries to find the most relevant results using LanceDB’s multimodal capabilities. Leverage the efficient vector-based similarity search. |
+| [RAG: Revolutionize Information Retrieval with LanceDB πŸ”“πŸ§](python_examples/rag.md) | Build RAG (Retrieval-Augmented Generation) with LanceDB for efficient vector-based information retrieval and more accurate responses from AI. |
+| [Vector Search: Unlock Efficient Document Retrieval πŸ”“πŸ‘€](python_examples/vector_search.md) | Use LanceDB's vector search capabilities to perform efficient and accurate similarity searches, enabling rapid discovery and retrieval of relevant documents in large datasets. |
+| [Chatbot Application with LanceDB πŸ€–](python_examples/chatbot.md) | Create chatbots that retrieve relevant context for coherent and context-aware replies, enhancing user experience through advanced conversational AI. |
+| [Evaluation: Assessing Text Performance with Precision πŸ“ŠπŸ’‘](python_examples/evaluations.md) | Develop evaluation applications that allow you to input reference and candidate texts to measure their performance across various metrics. |
+| [AI Agents: Intelligent Collaboration πŸ€–](python_examples/aiagent.md) | Enable AI agents to communicate and collaborate efficiently through dense vector representations, achieving shared goals seamlessly. |
+| [Recommender Systems: Personalized Discovery πŸΏπŸ“Ί](python_examples/recommendersystem.md) | Deliver personalized experiences by efficiently storing and querying item embeddings with LanceDB's powerful vector database capabilities. |
+| **Miscellaneous Examples 🌟** | Find other unique examples and creative solutions using LanceDB, showcasing the flexibility and broad applicability of the platform. |
diff --git a/docs/src/notebooks/code_qa_bot.ipynb b/docs/src/notebooks/code_qa_bot.ipynb
deleted file mode 100644
index 5ff18a0c..00000000
--- a/docs/src/notebooks/code_qa_bot.ipynb
+++ /dev/null
@@ -1,378 +0,0 @@
-{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "id": "13cb272e",
- "metadata": {},
- "source": [
- "# Code documentation Q&A bot example with LangChain\n",
- "\n",
- "This Q&A bot will allow you to query your own documentation easily using questions. We'll also demonstrate the use of LangChain and LanceDB using the OpenAI API. 
\n", - "\n", - "In this example we'll use Pandas 2.0 documentation, but, this could be replaced for your own docs as well\n", - "\n", - "\"Open\n", - "\n", - "Scripts - [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](./examples/Code-Documentation-QA-Bot/main.py) [![JavaScript](https://img.shields.io/badge/javascript-%23323330.svg?style=for-the-badge&logo=javascript&logoColor=%23F7DF1E)](./examples/Code-Documentation-QA-Bot/index.js)" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "66638d6c", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install --quiet openai langchain\n", - "!pip install --quiet -U lancedb" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "d1cdcac3", - "metadata": {}, - "source": [ - "First, let's get some setup out of the way. As we're using the OpenAI API, ensure that you've set your key (and organization if needed):" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "58ee1868", - "metadata": {}, - "outputs": [], - "source": [ - "from openai import OpenAI\n", - "import os\n", - "\n", - "# Configuring the environment variable OPENAI_API_KEY\n", - "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", - "client = OpenAI()\n", - "assert len(client.models.list().data) > 0" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "34f524d3", - "metadata": {}, - "source": [ - "# Loading in our code documentation, generating embeddings and storing our documents in LanceDB\n", - "\n", - "We're going to use the power of LangChain to help us create our Q&A bot. It comes with several APIs that can make our development much easier as well as a LanceDB integration for vectorstore." - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "b55d22f1", - "metadata": {}, - "outputs": [], - "source": [ - "import lancedb\n", - "import re\n", - "import pickle\n", - "import requests\n", - "import zipfile\n", - "from pathlib import Path\n", - "\n", - "from langchain.document_loaders import UnstructuredHTMLLoader\n", - "from langchain.embeddings import OpenAIEmbeddings\n", - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", - "from langchain.vectorstores import LanceDB\n", - "from langchain.llms import OpenAI\n", - "from langchain.chains import RetrievalQA" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "56cc6d50", - "metadata": {}, - "source": [ - "To make this easier, we've downloaded Pandas documentation and stored the raw HTML files for you to download. We'll download them and then use LangChain's HTML document readers to parse them and store them in LanceDB as a vector store, along with relevant metadata." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7da77e75", - "metadata": {}, - "outputs": [], - "source": [ - "pandas_docs = requests.get(\"https://eto-public.s3.us-west-2.amazonaws.com/datasets/pandas_docs/pandas.documentation.zip\")\n", - "with open('/tmp/pandas.documentation.zip', 'wb') as f:\n", - " f.write(pandas_docs.content)\n", - "\n", - "file = zipfile.ZipFile(\"/tmp/pandas.documentation.zip\")\n", - "file.extractall(path=\"/tmp/pandas_docs\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "ae42496c", - "metadata": {}, - "source": [ - "We'll create a simple helper function that can help to extract metadata, so we can use this downstream when we're wanting to query with filters. 
In this case, we want to keep the lineage of the uri or path for each document that we process:" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "d171d062", - "metadata": {}, - "outputs": [], - "source": [ - "def get_document_title(document):\n", - " m = str(document.metadata[\"source\"])\n", - " title = re.findall(\"pandas.documentation(.*).html\", m)\n", - " if title[0] is not None:\n", - " return(title[0])\n", - " return ''" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "130162ad", - "metadata": {}, - "source": [ - "# Pre-processing and loading the documentation\n", - "\n", - "Next, let's pre-process and load the documentation. To make sure we don't need to do this repeatedly if we were updating code, we're caching it using pickle so we can retrieve it again (this could take a few minutes to run the first time you do it). We'll also add some more metadata to the docs here such as the title and version of the code:" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "33bfe7d8", - "metadata": {}, - "outputs": [], - "source": [ - "docs_path = Path(\"docs.pkl\")\n", - "docs = []\n", - "\n", - "if not docs_path.exists():\n", - " for p in Path(\"/tmp/pandas_docs/pandas.documentation\").rglob(\"*.html\"):\n", - " print(p)\n", - " if p.is_dir():\n", - " continue\n", - " loader = UnstructuredHTMLLoader(p)\n", - " raw_document = loader.load()\n", - " \n", - " m = {}\n", - " m[\"title\"] = get_document_title(raw_document[0])\n", - " m[\"version\"] = \"2.0rc0\"\n", - " raw_document[0].metadata = raw_document[0].metadata | m\n", - " raw_document[0].metadata[\"source\"] = str(raw_document[0].metadata[\"source\"])\n", - " docs = docs + raw_document\n", - "\n", - " with docs_path.open(\"wb\") as fh:\n", - " pickle.dump(docs, fh)\n", - "else:\n", - " with docs_path.open(\"rb\") as fh:\n", - " docs = pickle.load(fh)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c3852dd3", - "metadata": {}, - "source": [ - "# Generating embeddings from our docs\n", - "\n", - "Now that we have our raw documents loaded, we need to pre-process them to generate embeddings:" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "82230563", - "metadata": {}, - "outputs": [], - "source": [ - "text_splitter = RecursiveCharacterTextSplitter(\n", - " chunk_size=1000,\n", - " chunk_overlap=200,\n", - ")\n", - "documents = text_splitter.split_documents(docs)\n", - "embeddings = OpenAIEmbeddings()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "43e68215", - "metadata": {}, - "source": [ - "# Storing and querying with LanceDB\n", - "\n", - "Let's connect to LanceDB so we can store our documents. 
We'll create a Table to store them in:" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "74780a58", - "metadata": {}, - "outputs": [], - "source": [ - "db = lancedb.connect('/tmp/lancedb')\n", - "table = db.create_table(\"pandas_docs\", data=[\n", - " {\"vector\": embeddings.embed_query(\"Hello World\"), \"text\": \"Hello World\", \"id\": \"1\"}\n", - "], mode=\"overwrite\")\n", - "docsearch = LanceDB.from_documents(documents, embeddings, connection=table)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3cb1dc5d", - "metadata": {}, - "source": [ - "Now let's create our RetrievalQA chain using the LanceDB vector store:" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "6a5891ad", - "metadata": {}, - "outputs": [], - "source": [ - "qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type=\"stuff\", retriever=docsearch.as_retriever())" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "28d93b85", - "metadata": {}, - "source": [ - "And that's it! We're all set up. The next step is to run some queries, let's try a few:" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "70d88316", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "' The major differences in pandas 2.0 include installing optional dependencies with pip extras, the ability to use any numpy numeric dtype in an Index, and enhancements, notable bug fixes, backwards incompatible API changes, deprecations, and performance improvements.'" - ] - }, - "execution_count": 50, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query = \"What are the major differences in pandas 2.0?\"\n", - "qa.run(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "85a0397c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "' 2.0.0rc0'" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query = \"What's the current version of pandas?\"\n", - "qa.run(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "923f86c6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "' Optional dependencies can be installed with pip install \"pandas[all]\" or \"pandas[performance]\". This will install all recommended performance dependencies such as numexpr, bottleneck and numba.'" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query = \"How do I make use of installing optional dependencies?\"\n", - "qa.run(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "02082f83", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\" \\n\\nPandas 2.0 includes a number of API breaking changes, such as increased minimum versions for dependencies, the use of os.linesep for DataFrame.to_csv's line_terminator, and reorganization of the library. 
See the release notes for a full list of changes.\"" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query = \"What are the backwards incompatible API changes in Pandas 2.0?\"\n", - "qa.run(query)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75cea547", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/src/notebooks/multimodal_search.ipynb b/docs/src/notebooks/multimodal_search.ipynb deleted file mode 100644 index ddbda8d7..00000000 --- a/docs/src/notebooks/multimodal_search.ipynb +++ /dev/null @@ -1,297 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![example](https://github.com/lancedb/vectordb-recipes/assets/15766192/799f94a1-a01d-4a5b-a627-2a733bbb4227)\n", - "\n", - " \"Open| [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](./examples/multimodal_clip/main.py) |" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.2\u001B[0m\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n", - "\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.2\u001B[0m\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n" - ] - } - ], - "source": [ - "!pip install --quiet -U lancedb\n", - "!pip install --quiet gradio transformers torch torchvision" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import io\n", - "\n", - "import PIL\n", - "import duckdb\n", - "import lancedb" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## First run setup: Download data and pre-process" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "### Get dataset\n", - "\n", - "!wget https://eto-public.s3.us-west-2.amazonaws.com/datasets/diffusiondb_lance.tar.gz\n", - "!tar -xvf diffusiondb_lance.tar.gz\n", - "!mv diffusiondb_test rawdata.lance\n" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# remove null prompts\n", - "import lance\n", - "import pyarrow.compute as pc\n", - "\n", - "# download s3://eto-public/datasets/diffusiondb/small_10k.lance to this uri\n", - "data = 
lance.dataset(\"~/datasets/rawdata.lance\").to_table()\n", - "\n", - "# First data processing and full-text-search index\n", - "db = lancedb.connect(\"~/datasets/demo\")\n", - "tbl = db.create_table(\"diffusiondb\", data.filter(~pc.field(\"prompt\").is_null()))\n", - "tbl = tbl.create_fts_index([\"prompt\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create / Open LanceDB Table" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "db = lancedb.connect(\"~/datasets/demo\")\n", - "tbl = db.open_table(\"diffusiondb\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create CLIP embedding function for the text" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import CLIPModel, CLIPProcessor, CLIPTokenizerFast\n", - "\n", - "MODEL_ID = \"openai/clip-vit-base-patch32\"\n", - "\n", - "tokenizer = CLIPTokenizerFast.from_pretrained(MODEL_ID)\n", - "model = CLIPModel.from_pretrained(MODEL_ID)\n", - "processor = CLIPProcessor.from_pretrained(MODEL_ID)\n", - "\n", - "def embed_func(query):\n", - " inputs = tokenizer([query], padding=True, return_tensors=\"pt\")\n", - " text_features = model.get_text_features(**inputs)\n", - " return text_features.detach().numpy()[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Search functions for Gradio" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def find_image_vectors(query):\n", - " emb = embed_func(query)\n", - " code = (\n", - " \"import lancedb\\n\"\n", - " \"db = lancedb.connect('~/datasets/demo')\\n\"\n", - " \"tbl = db.open_table('diffusiondb')\\n\\n\"\n", - " f\"embedding = embed_func('{query}')\\n\"\n", - " \"tbl.search(embedding).limit(9).to_pandas()\"\n", - " )\n", - " return (_extract(tbl.search(emb).limit(9).to_pandas()), code)\n", - "\n", - "def find_image_keywords(query):\n", - " code = (\n", - " \"import lancedb\\n\"\n", - " \"db = lancedb.connect('~/datasets/demo')\\n\"\n", - " \"tbl = db.open_table('diffusiondb')\\n\\n\"\n", - " f\"tbl.search('{query}').limit(9).to_pandas()\"\n", - " )\n", - " return (_extract(tbl.search(query).limit(9).to_pandas()), code)\n", - "\n", - "def find_image_sql(query):\n", - " code = (\n", - " \"import lancedb\\n\"\n", - " \"import duckdb\\n\"\n", - " \"db = lancedb.connect('~/datasets/demo')\\n\"\n", - " \"tbl = db.open_table('diffusiondb')\\n\\n\"\n", - " \"diffusiondb = tbl.to_lance()\\n\"\n", - " f\"duckdb.sql('{query}').to_df()\"\n", - " ) \n", - " diffusiondb = tbl.to_lance()\n", - " return (_extract(duckdb.sql(query).to_df()), code)\n", - "\n", - "def _extract(df):\n", - " image_col = \"image\"\n", - " return [(PIL.Image.open(io.BytesIO(row[image_col])), row[\"prompt\"]) for _, row in df.iterrows()]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup Gradio interface" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Running on local URL: http://127.0.0.1:7881\n", - "\n", - "To create a public link, set `share=True` in `launch()`.\n" - ] - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import gradio as gr\n", - "\n", - "\n", - "with gr.Blocks() as demo:\n", - " with gr.Row():\n", - " with gr.Tab(\"Embeddings\"):\n", - " vector_query = gr.Textbox(value=\"portraits of a person\", show_label=False)\n", - " b1 = gr.Button(\"Submit\")\n", - " with gr.Tab(\"Keywords\"):\n", - " keyword_query = gr.Textbox(value=\"ninja turtle\", show_label=False)\n", - " b2 = gr.Button(\"Submit\")\n", - " with gr.Tab(\"SQL\"):\n", - " sql_query = gr.Textbox(value=\"SELECT * from diffusiondb WHERE image_nsfw >= 2 LIMIT 9\", show_label=False)\n", - " b3 = gr.Button(\"Submit\")\n", - " with gr.Row():\n", - " code = gr.Code(label=\"Code\", language=\"python\")\n", - " with gr.Row():\n", - " gallery = gr.Gallery(\n", - " label=\"Found images\", show_label=False, elem_id=\"gallery\"\n", - " ).style(columns=[3], rows=[3], object_fit=\"contain\", height=\"auto\") \n", - " \n", - " b1.click(find_image_vectors, inputs=vector_query, outputs=[gallery, code])\n", - " b2.click(find_image_keywords, inputs=keyword_query, outputs=[gallery, code])\n", - " b3.click(find_image_sql, inputs=sql_query, outputs=[gallery, code])\n", - " \n", - "demo.launch()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.11.4 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - }, - "vscode": { - "interpreter": { - "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" - } - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/docs/src/notebooks/youtube_transcript_search.ipynb b/docs/src/notebooks/youtube_transcript_search.ipynb deleted file mode 100644 index bd6f2bdc..00000000 --- a/docs/src/notebooks/youtube_transcript_search.ipynb +++ /dev/null @@ -1,702 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "id": "42bf01fb", - "metadata": {}, - "source": [ - "# Youtube Transcript Search QA Bot\n", - "\n", - "This Q&A bot will allow you to search through youtube transcripts using natural language! 
By going through this notebook, we'll introduce how you can use LanceDB to store and manage your data easily.\n", - "\n", - "\n", - "\"Open\n", - "\n", - "Scripts - [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](./examples/youtube_bot/main.py) [![JavaScript](https://img.shields.io/badge/javascript-%23323330.svg?style=for-the-badge&logo=javascript&logoColor=%23F7DF1E)](./examples/youtube_bot/index.js)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "48547ddb", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" - ] - } - ], - "source": [ - "!pip install --quiet openai datasets\n", - "!pip install --quiet -U lancedb" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "22e570f4", - "metadata": {}, - "source": [ - "## Download the data\n", - "\n", - "For this dataset we're using the HuggingFace dataset `jamescalam/youtube-transcriptions`.\n", - "\n", - "From the [website](https://huggingface.co/datasets/jamescalam/youtube-transcriptions):\n", - "\n", - "```\n", - "The YouTube transcriptions dataset contains technical tutorials (currently from James Briggs, Daniel Bourke, and AI Coffee Break) transcribed using OpenAI's Whisper (large). Each row represents roughly a sentence-length chunk of text alongside the video URL and timestamp.\n", - "```\n", - "\n", - "We'll use the training split with 700 videos and 208619 sentences" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a8987fcb", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset json (/Users/changshe/.cache/huggingface/datasets/jamescalam___json/jamescalam--youtube-transcriptions-08d889f6a5386b9b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n" - ] - }, - { - "data": { - "text/plain": [ - "Dataset({\n", - " features: ['title', 'published', 'url', 'video_id', 'channel_id', 'id', 'text', 'start', 'end'],\n", - " num_rows: 208619\n", - "})" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from datasets import load_dataset\n", - "\n", - "data = load_dataset('jamescalam/youtube-transcriptions', split='train')\n", - "data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "5ac2b6a3", - "metadata": {}, - "source": [ - "## Prepare context\n", - "\n", - "Each item in the dataset contains just a short chunk of text. We'll need to merge a bunch of these chunks together on a rolling basis. For this demo, we'll merge 20 rows and step over 4 rows at a time. LanceDB offers chaining support so you can write declarative, readable and parameterized queries. 
Here we serialize to Pandas as well:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "121a7087", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - " title published \\\n", - "177622 $5 MILLION AI for FREE 2022-08-12 15:18:07 \n", - "\n", - " url video_id channel_id \\\n", - "177622 https://youtu.be/3EjtHs_lXnk 3EjtHs_lXnk UCfzlCWGWYyIQ0aLC5w48gBQ \n", - "\n", - " id text \\\n", - "177622 3EjtHs_lXnk-t0.0 Imagine an AI where all in the same model you ... \n", - "\n", - " start end \n", - "177622 0.0 24.0 " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from lancedb.context import contextualize\n", - "\n", - "df = (contextualize(data.to_pandas())\n", - " .groupby(\"title\").text_col(\"text\")\n", - " .window(20).stride(4)\n", - " .to_pandas())\n", - "df.head(1)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3044e0b0", - "metadata": {}, - "source": [ - "## Create embedding function\n", - "To create embeddings out of the text, we'll call the OpenAI embeddings API to get embeddings.\n", - "Make sure you have an API key setup and that your account has available credits." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "c8104467", - "metadata": {}, - "outputs": [], - "source": [ - "from openai import OpenAI\n", - "import os\n", - "\n", - "# Configuring the environment variable OPENAI_API_KEY\n", - "if \"OPENAI_API_KEY\" not in os.environ:\n", - " # OR set the key here as a variable\n", - " os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", - "\n", - "client = OpenAI()\n", - "assert len(client.models.list().data) > 0" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "db586267", - "metadata": {}, - "source": [ - "We'll use the ada2 text embeddings model" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "8eefc159", - "metadata": {}, - "outputs": [], - "source": [ - "def embed_func(c):\n", - " rs = client.embeddings.create(input=c, model=\"text-embedding-ada-002\")\n", - " return [\n", - " data.embedding\n", - " for data in rs.data\n", - " ]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "2106b5bb", - "metadata": {}, - "source": [ - "## Create the LanceDB Table\n", - "OpenAI API often fails or times out. So LanceDB's API provides retry and throttling features behind the scenes to make it easier to call these APIs. In LanceDB the primary abstraction you'll use to work with your data is a Table. A Table is designed to store large numbers of columns and huge quantities of data! For those interested, a LanceDB is columnar-based, and uses Lance, an open data format to store data." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "13f15068", - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c6f1c76d9567421d88911923388d2530", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/49 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "" - ], - "text/plain": [ - " title published url \\\n", - "0 $5 MILLION AI for FREE 2022-08-12 15:18:07 https://youtu.be/3EjtHs_lXnk \n", - "\n", - " video_id channel_id id \\\n", - "0 3EjtHs_lXnk UCfzlCWGWYyIQ0aLC5w48gBQ 3EjtHs_lXnk-t0.0 \n", - "\n", - " text start end \\\n", - "0 Imagine an AI where all in the same model you ... 0.0 24.0 \n", - "\n", - " vector \n", - "0 [-0.02439424, -0.0007703846, 0.016625028, -0.0... " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import lancedb\n", - "from lancedb.embeddings import with_embeddings\n", - "\n", - "data = with_embeddings(embed_func, df, show_progress=True)\n", - "data.to_pandas().head(1)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "53e4bff1", - "metadata": {}, - "source": [ - "Now we're ready to save the data and create a new LanceDB table" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "92d53abd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "48935" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "!rm -rf /tmp/lancedb\n", - "\n", - "db = lancedb.connect(\"/tmp/lancedb\")\n", - "tbl = db.create_table(\"chatbot\", data)\n", - "len(tbl)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "8ef34fca", - "metadata": {}, - "source": [ - "The table is backed by a Lance dataset so it's easy to integrate into other tools (e.g., pandas)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "22892cfd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - " title published url \\\n", - "0 $5 MILLION AI for FREE 2022-08-12 15:18:07 https://youtu.be/3EjtHs_lXnk \n", - "\n", - " video_id channel_id id \\\n", - "0 3EjtHs_lXnk UCfzlCWGWYyIQ0aLC5w48gBQ 3EjtHs_lXnk-t0.0 \n", - "\n", - " text start end \\\n", - "0 Imagine an AI where all in the same model you ... 0.0 24.0 \n", - "\n", - " vector \n", - "0 [-0.02439424, -0.0007703846, 0.016625028, -0.0... " - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tbl.to_pandas().head(1)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "23afc2f9", - "metadata": {}, - "source": [ - "## Create and answer the prompt\n", - "\n", - "For a given context (bunch of text), we can ask the OpenAI Completion API to answer an arbitrary question using the following prompt:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "06d8b867", - "metadata": {}, - "outputs": [], - "source": [ - "def create_prompt(query, context):\n", - " limit = 3750\n", - "\n", - " prompt_start = (\n", - " \"Answer the question based on the context below.\\n\\n\"+\n", - " \"Context:\\n\"\n", - " )\n", - " prompt_end = (\n", - " f\"\\n\\nQuestion: {query}\\nAnswer:\"\n", - " )\n", - " # append contexts until hitting limit\n", - " for i in range(1, len(context)):\n", - " if len(\"\\n\\n---\\n\\n\".join(context.text[:i])) >= limit:\n", - " prompt = (\n", - " prompt_start +\n", - " \"\\n\\n---\\n\\n\".join(context.text[:i-1]) +\n", - " prompt_end\n", - " )\n", - " break\n", - " elif i == len(context)-1:\n", - " prompt = (\n", - " prompt_start +\n", - " \"\\n\\n---\\n\\n\".join(context.text) +\n", - " prompt_end\n", - " )\n", - " return prompt" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "e09c5142", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'The 12th person on the moon was Harrison Schmitt, and he landed on December 11, 1972.'" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def complete(prompt):\n", - " res = client.completions.create(\n", - " model='text-davinci-003',\n", - " prompt=prompt,\n", - " temperature=0,\n", - " max_tokens=400,\n", - " top_p=1,\n", - " frequency_penalty=0,\n", - " presence_penalty=0,\n", - " stop=None\n", - " )\n", - " return res.choices[0].text\n", - "\n", - "# check that it works\n", - "query = \"who was the 12th person on the moon and when did they land?\"\n", - "complete(query)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "28705959", - "metadata": {}, - "source": [ - "## Let's put it all together now" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "c71f5b31", - "metadata": {}, - "outputs": [], - "source": [ - "query = (\"Which training method should I use for sentence transformers \"\n", - " \"when I only have pairs of related sentences?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "603ba92c", - "metadata": {}, - "outputs": [], - "source": [ - "# Embed the question\n", - "emb = embed_func(query)[0]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "559a095b", - "metadata": {}, - "source": [ - "\n", - "Again we'll use LanceDB's chaining query API. This time, we'll perform similarity search to find similar embeddings to our query. We can easily tweak the parameters in the query to produce the best result." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "80db5c15", - "metadata": {}, - "outputs": [], - "source": [ - "# Use LanceDB to get top 3 most relevant context\n", - "context = tbl.search(emb).limit(3).to_pandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "8fcef773", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'NLI with multiple negative ranking loss.'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Get the answer from completion API\n", - "prompt = create_prompt(query, context)\n", - "complete(prompt)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "25714299", - "metadata": {}, - "outputs": [ - { - "data": { - "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAUDBAoKDQgNCgoICAoKCAgICAoICAgICAgKCAgICAgICAgIChALCAgOCggIDRUNDhERExMTCA0WGBYSGBASExIBBQUFCAcIDwkJDxUQDxAVFRISEhUVFRIVEhUVFRUVFRUSFhUSEhIVEhYSFRUWFRUWFRUVFRUVFRUVFRUVFRcVFf/AABEIAWgB4AMBIgACEQEDEQH/xAAdAAEAAAcBAQAAAAAAAAAAAAAAAgMEBQYHCAEJ/8QAXBAAAgIBAgMDCQMFCAwKCgMAAQIDBAAFEQYSIQcTMQgUGCJBUVSU1TJhcRUjgZGxJEJVlaHB0dQWMzQ1RFJTdZO00/AXJTZDVmKCkrPhRXJzdHaFsrXS8aKjpf/EABsBAQADAQEBAQAAAAAAAAAAAAABAgMEBQYH/8QAQhEAAgIBAgQDBgMEBwYHAAAAAAECEQMEIQUSMUEGUWETIjJxgaEUkfAVscHRQlJTYnKS0hYjgpPh8QdDc6OywtP/2gAMAwEAAhEDEQA/AOMsYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMZ0z6FXEPxnD/AM1qP07HoVcQ/GcP/Naj9OwDmbGdM+hVxD8Zw/8ANaj9Ox6FXEPxnD/zWo/TsA5mxnTPoVcQ/GcP/Naj9Ox6FXEPxnD/AM1qP07AOZsZ0r6GHEHxmgfNaj9Pzz0MOIPjNA+a1H6fk8rItHNeM6U9DDiD4zQPmtR+n49DDiD4zQPmtR+n45WLRzXjOl4/It4hP+GaB81qP07IvQq4h+M4f+a1H6dkEnM2M6Z9CriH4zh/5rUfp2PQq4h+M4f+a1H6dgHM2M6Z9CriH4zh/wCa1H6dj0KuIfjOH/mtR+nYBzNjOmfQq4h+M4f+a1H6dj0KuIfjOH/mtR+nYBzNjOmfQq4h+M4f+a1H6dj0KuIfjOH/AJrUfp2AczYzpn0KuIfjOH/mtR+nY9CriH4zh/5rUfp2AczYzpn0KuIfjOH/AJrUfp2PQq4h+M4f+a1H6dgHM2M6Z9CriH4zh/5rUfp2PQq4h+M4f+a1H6dgHM2M6Z9CriH4zh/5rUfp2PQq4h+M4f8AmtR+nYBzNjOmfQq4h+M4f+a1H6dj0KuIfjOH/mtR+nYBzNjOmfQq4h+M4f8AmtR+nY9CriH4zh/5rUfp2AczYzpn0KuIfjOH/mtR+nY9CriH4zh/5rUfp2AczYzpn0KuIfjOH/mtR+nY9CriH4zh/wCa1H6dgHM2M6Z9CriH4zh/5rUfp2PQq4h+M4f+a1H6dgHM2M6Z9CriH4zh/wCa1H6dnjeRXxCP8M4f+a1H6dgHM+M6W9C/iD4zQPmtR+nZEfIr4hH+G8P/ADWo/Tsmgcz4zpb0L+IPjNA+a1H6dnq+RbxCf8M4f+a1H6dihZzRjOmfQq4h+M4f+a1H6dj0KuIfjOH/AJrUfp2QDmbGdM+hVxD8Zw/81qP07HoVcQ/GcP8AzWo/TsA5mxnTPoVcQ/GcP/Naj9Ox6FXEPxnD/wA1qP07AOZsZ0z6FXEPxnD/AM1qP07HoVcQ/GcP/Naj9OwDmbGdM+hVxD8Zw/8ANaj9Ox6FXEPxnD/zWo/TsA5mxnTPoVcQ/GcP/Naj9OzxvIr4hH+G8P8AzWo/TsA5nxnS3oX8Q/GaB81qP07PV8i3iE/4ZoHzWo/TsmhZzRjOmfQq4h+M4f8AmtR+nY9CriH4zh/5rUfp2QDv/GMYAxjGAMYxgEmeP2j9OSMrckTx7dR+n7svGXYrJHsYU+zr7cj7oe79uUynbKtG3yJbBbnioB4ZFjGVLDGMxXtV4w/JFOe35u9xYShkijkWJ+RnVGkDMpGycwYj3An2ZrgwzzZI44K5Sailsrb2XWl1IbpWZVjNVdiPbVX1+S3ElaWlLXjimVJZo5e+jdnR2QoBtyMEB3/yq5tRjtm2u0OfRZng1EeWaq1t3Vrpa6ERkpK0e4zQlDylILF5KVbTrE/eX2pRWBaiWORVlaNrQTkJEPIjSe08ozfSnfNdfwvVaFxWphyOS5optNteezdfWhGal0PcYxnnlhjGMAYzVHaT296Vo9mWrai1GSWOKOVjWggki5ZFLKA0lhDzbA79M2pDIGCkeBAI/TnZqNBqNPjhlywcY5FcG+kltuvzX5lVJN0uxHjGM4ywxjGAMYxgDGMxjtS4vTR6Ny68TWFrRq5iRxG0nNIkfKHIIU+vv4ezLQg5yUY7t7IGT4zlH006n8D2/nYf9lj006n8D2/nYf8AZZ6n7D1v9m/zX8yntI+Z1djMM7GOPo9eow3Y4XqrLJOndSSLIy9zM8W5dVAO/Jv4e3MzzzMmOWOThJU06a9UXTsYxjKAZLnPh+OTMlT+zJXUhkvBOaO7a/KJr6BcFSShPcY1orPeRWI4lHevKnJyOhO47rfff99mw+yHtAq67Tht1hyhi0U8LMGkqzx7d5BIV6EgFWB2G6yKdhvnbk0eaGNZZRfK+jKKSboy9UJycke2YX2xdpFXQKcluyGl
ulqq1bMjNIWllJXndnJMrszvzMzE3/iXheC6Ky2U7xa12rqMAWR05bNKTva8h5COdVcb8p3B9oOZBkieb2D9J/oyybKtI1m3Yro43QRXUqGTvX01NV1JdGdu873Z9LFjzcxc/rd1y937OXbpmXaVw3Xhs37USFbN9aaW37xysi6fHJFVCxk8kQVZpPsgb79d8u6KT4ZVxRgfznLNpFUrNdah2J6PK1jZL9eCy7yXKNPVdSp6XaaT+2mbT61hYdn/AHyqqhuu4O53yuvwjSjmpTxwLDJRpS6fSWHeKCvVmMBeBK6bRhf3NCB06BNht1y+4zM0LVT4erx2bVtEIs2oKtew/O5V46ZnNdRGTyJymxL1ABPN132GQaRw1Vr2NRtRIVsai1V7zmSRhK1OuKtcrGzFY9olC+qBvtudzl4xgGK6l2eaZOmrRT1VsRatMljUY5XlZZpY4YYI5EPNvXdUgh5WiKlWjDAhuuUPDvZZp9WaGwW1G9PX5vM21XVdQ1NaXOvIxqRXJ3jhfl9XnA5gNwCATvnGMAxjT+ANNioLpYrLJpohav5tO8kymNpDKVaSRi5IduYNvuCAQRsMtugdl2nVJobAOo2pK3OaS6jq2palDRMiGJmqQ3bDpE5QlOfYsASARud85yVM2SkCWTk6LfbrkpRvlRkyIRDIwHjlMzb5HYU+Ps/ZkUMXtP6slUlZD3Z5DF7T+gZPIxjKt2SkUbDbJ6qGHh+rpnllfb+g4rg/oyzdqyq6hoPcf15OAxjK2WoYxjIJGMYwBjGMAYxjAGMYwBjIJJAPxyJTvgHuMYwBjGMAYxjAGMYwBjGal8oXtjh0OLuoO7sanOhNeEnmSsh3UXLYU7iMEHlTcGQqQNgGZenSaTLqsqxYlcn+rfkl3ZEpKKtm2sZpbyb+2pNZRat1ki1WKPc/ZSPUY0HrWYFGwWYDq8Q8PtKOXcJunLa3RZdJleHMqkvv6rzTEZKStDGMZyEjGMYAxjGAQ8gz1jsM9zxhvgFOMqQMgWPY5HlmyEhjGMqSMYxgDJE8O/Ufp/pyfjJTohqyGNAP5/vyLGMgkYxjAGMYwBjGMA8Y5Tk5UON8pyMtEqwDg4GTkUfjlm6CI8lWvD9Iybkm34D8f5jlI9SZdCnVj06nx95yuyhj8R+I/blVNLt95/38ctJFYsikcDxyIZQM2/jlRUJ/R7D/ADZDjSJUrZPxjGVLDGMYAxjITIPDfrgEWMYwBjGeMdsA9OSJJvd+v+jIJHJ/oyHNFEo5HmVFcH9HsxCg94JybkSkSkMYxlCwxjGAMYxgDGM1T5QXbDBoUXdxd3Y1OeMmrXJJSBDuouXOUgrACCFTcNIykDYB3Tp0ejy6rKsOFc0pdF/F+SXdkSkoq2S/KG7Zq+gxCKIxT6pYjJqwMw5IFO6+eWgCCIQQeVOhkZSBsA7rw3q+pTWpZp7EslieeQyzyynmeRzsNyR0AAAAUABQoAAAAF5PHmql53OoXXaeVppxLL3sMkj7BnNWUGuDsqqAEACqqgAAAS14pc83fU9GtFiSWk0qtVkO/j+d0kVpCSdzzFt9z45+w8F4J+y8dKKlN/FO936JOKSivnv1fpwZMnOyzUrUkLxyRSSQyxSLLDLExSSKRDukkbr1Vgfbnbfk49tUetIK1wpDqsUe7AbJHqEaD1rNdfBZQOrxDw+0vq/Z49Op6c4Ak0ySE9OZtO1axF+PJFqkVwfoLj8R45FxHpr6dPXlqWJHhkC3tIvRfmpHjVyob1T+ZuwSK0Usfirodxysu9eM8MxcSgseSLhPfkk67dVs3t6Pera6MY8jiz6T4zTXk59tMetIK9sxwarFHu6jZI78aD1rNYexx4vEPs77jdT03Ln5DrNHl0mV4sqqS+/qvNM74yUlaGMYzlJGMYwBjGMAYxjAGMYwBjGMAYxjAGUhmO5I/V7MnWn2H4/7nKeJNz+3LxW1lJPeirjbcA+GRZCzgeJA/wB/dniSg+GVosR4xjIJGMhkfbCODgEWeMu+e4wCQ6bZCDtlTkt4vdllLzKtEzJNzwH4/wAxyarA5JueA/H+bIXUmXQpwc8xnqNtmpkToYPf+r+nKkZTLZPtA/Zk2KYH35m7NFRMxjGVLDGMYAygc7k/ecq7DbA/q/XlHl4FJlTUHQ/yZPyWpCgb+7JT2Pd/LkVbJukVOeMN8lVpN99/HJ2Q9iU7KMjPMnWF9uQRNsf5Dml7FKIMjEhHtye0YPs/myBoPcf15HMiaZMiJI65FnijbPczLjGMYAxjGAar8oLtgg0GEJEEsanPGTUrsSUhXcr55c5SCtcMCFQENKylVIAd0411rjeS7I0t+npmoTyMrT2JIrlS1MVVUBeXTbcCk8ihRupUbDpsAM3p5TnZDXge3qnNrE8c8hlvGBq100iQqrJ3FkxyGkAAP7ftEAFAVAOXn86TQYAxaqiMf3moaZfqkfi2n+eoD+nb78/W/C2l0GPSLJi5pTl8U1GVp94rlvlS+e/XyrhzSk5UyI3NKkI56Oo1Bt1NPVYZ1/0F6iXP+nGFoaY/Nyahcrf4q3tI5h+HfadcnZvx7pfwweEpmKiCxpNwtvsK+r0EkO3XYV70sM7HbfoEPhlfHwKyGvHctQabbtkeZVZ4bErSB5nrQy2bFdWjpQyTo6Kx59wpYhV2Y/RTy4Y/Dka9E+d/lJTl83WxkUun8Gy2CFp29M1CVg5jr1rM8NuXu1Z3WGrqVavJK4VGIVASdugPTf3hDUopY3o3JFiqzyd9Usyb8ul3ioRLLHxFGZQsU6/4vJINjF1sAMsEg2L154JtwVIWWCevJuCrLvyyJIniPArmW8awJfi/KddERmlSHXa8S8q1bs2/d340H2KN0hmHsSYSJueZcpnTtRm7jL4Zd4y7em/Z1V+675kgjGrIs6bYAcy6fdqzcyHm7uaGWJioeJvB13B2Zd1YH98D17W8nPtoi1uMV7Rjg1SGPeRBssd2NehtVhv0PhzxeKk7jdSNuNdP4v1GEBY799YwOVYntTTVgBsABWmZodgAB9nN7eS7wXd1OVL9yGjHThkL17CaZSo37c6Hbeta0+KGQQIwIeVi3Md0G/rlPnfE+lxZNI56jlTj8M03d+SVbp+XNt17GuFtS2Ot8Yxn5OdwxjGAMYxgDGMYAxjIecb7e3AIsYxgDGMYBRTPuT+oZCDk+aD2j9X9GSAds1T2Mn1CqT4An8Mn14yDv4eORwzA+PQ/yH8Mm5VyZZRQzxjtnuU0r7/hlUrLN0Qu2+RQKd/d7/6MhjXfKpdh7svJ1sVS7nuM85h92e5mXGMYwCm8PuyYr79D/wCWSycAZo0VsmNAv4fgcgat7j+sZUDGU5mTyopDA33H8P8AzyOqhBO426f7/syoyVNJt0Hj+zJtvYiktybjJUUu/j4/tybkNUWTGMYyAU9tvAfp/m/pynydaQ77+z9mSc1j0MpdQcjSMn2YicDxG/7cq0cHwyJOiUrIIYdvb/Rk3GMo3ZdKjx13BykOVmQGMb7/AP6yYuiGrELbj+TI8YypYYxjAGMZgfbD2jRaRCwjalLqMkZ
alUt6hSoLJ63J38slyeMCBW36A8zleVfaV30+nyajIseNXJ9P+r6JebeyIbSVst/bz2tV9Bg2HJY1CdG8yq79PavnVnlO6VVYH3FyOVf3zLzf2O+UDeoWpm1Sea/StzGW2X9aWnI2wNiqg6LAFChq6gLyqCgBBD4PxPw7rt2aWzbq3Lk1h+eWzGI7FYeAXms1natUrIuwHM6oir4gAnLfHcr0P7nMN68D/dZQSUKLD+D4pRtdshuvnUq92uw7tGPLNn63w3w3ocOkeCSWbJNe9JNbPtyy35Yp9+r8n8K4Z5pOVrY+jdaaKxGjKUmhmiV1OwaOWKVNwdj0ZGVvb7DnHHlL9iB0syXdOjZtNZi1iFQWOmMx+0vtNEk9D/zZ6H1duWy9gvbZZ0aZktvPd0+xM0loSO89ivLK28lyBnJaRmYlpIyTz9WHrb83bmm3q9yFJIXhtVrEXMjqVlhmjkGx+5lI3BB+8HPkJ4tZ4Z1akvexy/yzXk/KS7fxTaN7jlj6nzd0PhW7cVnrVZbCK/dFlMSh5OUP3MIldTZn5WU91EHfZ16esN6jRuKJKy1l81oTyU2c0JrUFg2KJaZrBREisRwzolh5ZVSzHMEeVyAASM395RPZNS0+GpYSPURp1Oew/daakEhrLbnFho3eeVGpQifotpBLyKyqU9RHOitU4prWprM1nSaRexPLYc07uqVJOeaRpHJL2Ja5O7eKwqCdzt16fe6LiUeI4vaRjzQt7Kk4vpTuaTbi96pJOt7OWUXF0zGJHLFmYszMzO7MSzMzEszMx6sxJJJ+/Lpwtr8tKSR0WKZJoJaluvYDtWt1512kgmVGVttwrqysGV41IPTY1AbSXB3XWqjHw5ZNP1WNfv5WjosR93N+nNr9hfYVDrBitPbmk0tJWWVXpSULNx4yOavEwnkQQ826vNG5I5WRSG3aPbX6/T6fC5ahOMelNN36Jq1fluIxbdIqewPsjqa8fOZ9Puafp8TgKyao0sOpSRuBJBHDPTNjzcbOrzLYGx9Vd2DmPr9FhqxADuatevDsPsQwV4YE/QkUKIv3ABc9rxQ1olVRDWrwRBEVQkMEEUS7KoA2WOJVAHsAAzkfypePtWvvLUhoarT0qGTaSSWhci/KTxnfvZJGj5VphgCse/UgM/XlVPzHmz8d1fLfJjXS5N8q+ruUn+qR2bYl6ld269rljUniTRNV06rTgkSbm/KA0y9bniYMpc6mkMS1kdQVQOyvsGYkEKN4dh3Hc+p11F6Ba16JQJjDLBYqWh4CzWnqyyRbNtuY+bdT4brsx+e8cynwZW/Bgf2ZWaRdlrSwz13avYhcSQzRbLLGy+BVtuo8QVO4YEgggkZ9brPC2GemWDHUXHeMnH3r/vNNWn32ddvIwjld2fUDGak8nntjh12Lup+7r6nBGDYhU8sdlBspt1Ax3Me5HMnUxlgDuCrNtvPzLVaXJpsjxZVUl+rXp5HZGSatDGMZzkjGM85x7x+vAIZgdumUQy4ZS2Y9uvsP7cvFlJLuT4X3H3+3I8oon2P7crQciSomLsYxjKlhkuWIH7j78mYwCjEJ32/l9mVgGMZLdkJUSrJ8P5cp8rCN8ppE2/my0WVkiDGRRke0b5U92PcMs5UQlZSZWL7M87se7IspJ2XSoYxjKkkiX7sihXICpyeo2yz6EI9xjGVJGSpot/Dx/bkbSAe3ITMPvyVZDopzk2KX2H9f9OQyuD7Ml5pVlOhW4yRA/syfmbVF07GS5IQfu/D+jJmMiyaKOSIj7/vGSxlwyAxDffbLqZRwPYwdhv4+3IsYyhcYxjAGMYwBjGa57c+1atoMG7cti7MrCjU5tjIR0M85HWOqh8W8SfVXqenRpdLl1WWOHDHmlLZJfrp5vsiJSSVsqu2ztC/IlVpUrzXbMnMlWGOKV4g4HWa1LGpENdNwTuQW8F9pXgHiLiGfUJ57NqfzqzO3NLISPZ0WNEHSOFR6qoOgAy56lxHqlua3eae+87EPbs1msxJCAD3aNJXIWtAqnZUJAA/SSbjfUmXlltvbX3ajDV1QHrv1/KcMxIz9q8P+H/2VjfKoTyS+KVtV/dXuy2/K+r7Jeflyub9DHI6wZlCoGdjyqFTmdiegVQo3Yn3DI2UgkEFSpKsCCGVlOzKwPVWB6bHM44a4qVhcj207RrM1ZYaupU6ktJom85gkmgmOnhvN4p4o3QzRRAr4H1Hcii4w1GvNLpyzWZtQNeotfUb1YOZrZW1ZlVa7XlRpzDXkhgWeZQW7rwKom/srVZHk5JQaXnu+130qr2q+a+1GdFk4e0SxckMdePnZUMsrsyxQVol6vYt2JCI61dQDu7kD2DckA7i7He12tw4wqq9vVqck3PenR+6qV5G2DyaRTkh76aIdeZ5Hj70gssa9ObVvEfE5mTzetEun6crh0qROXM7p9mzqNlgHv2/DZn2RNgERAOtv4Z0lrk9aujLG08nJzuGKRqqtJLKyr6zBI0duUdTy7Dqc4ddpIazDJatVjq+Xuq/pSa7ryWy7uXa0W09j6T6ZerXoEkheG3Vsw7oy8skM8UgIIIPRlI3UqfvBHiM5I7efJ+ai81rT5asWmsQzxW53h8wd227tZmQx+aFiOVpGXl5gpJ9UmR2A8dSaWJH04atq+l+dV4tUovp4Nykbcdh4dXpLTmmTum81lWSNivVU3JLqw7AqWILkIZeSxXsREbOm6SRyAq8ckUg3HQsrI4BHUEAgjPy+T1Ph/VN43zY5bbqrXWpJ9JpPZ+TtWmzsXLlW/U4w7CuwexqkqTXu6j0uM8zSVbtS1+USrlTWrz0JpFjjDKRJJzBl+yoDEtH2cBWowf4PSqVYP+pBWqwQJ9+yRRIi/cABkKirQg/5ilUqwf8AUgr1oIU/QscSqPwG2cZ9u3bzb1OYR6bPZoafBITE0Mklazfcbjv7HKQyVtt+SufEHmkBYqsV3+N8R6n+rjh9IxX3ub+v5Ie7iXqSPKK7bH1tmrUmaLSUceIKyakyHdZ7CnqlYEApCevQO45uVY8Yg1Ce/XaWvNYg1bTq6m01exLBNq2mV1CJaLQsGlv0kCo5PrPBytuxiIywWeLrUu/filbJRk5rOl6ZJMOYbcy2hWFhXHiCH8ctejalNVlgnruYp4JFlhkHXlZf8YeDowJVlPRlZlPQnPvsHDYYMCxY4qPLut+a335nyq777eTXwquWUm3bLu3HGpMvLLbe4vu1GGpqoPt6/lOGYkZ4/Ecb8vfaZo8oB6mGG5p0h9+35MtwxA/jGR92TdSs6RO0kgi1PS2kcv3NVKWpUoSx3ZK8c0tSVId9+VC7coIG5AyS+iU25RDq1Tc/vb1LU6RH/ar17MIP/rSAfflqxL+g4P0i0/q4bfcFVpGu0IZoZ4a+rabNA4lgl0/Va8/dyLvsVguUA7AgspVpyGUlSCGOd29mPHCamh2SYMkcDpO0RWrfjmiV1s1Zl3jJ351eDmLxPGysNtmbgleDrTty13068dt/3Hq2mSufwrPYWz+uP24r6TqumyR2Fq6pp80Enfw2PMrMSo6jlMiyvF3UiFd1O/
MrKSp3UkHw+L8Jwa+KUciU0trfM/rb5v8Av0NITcWfSjGam8nftnr8QRGNzFBqdeMNbro35uZAQvnlTckmAsQGQktGzBSSCjvtnPzPVaXJpsjxZVUl2/l6eR2Jpq0MoZT1P4nK7ITGPcP1ZjF0RJWUIz0sfef15VmFfd/KcppdvZ+n/wAsunZRqiDKio/s/SMpxlXBFt+P7MiXQmPUm4xjMzQYxjAGMYwBnjrvnuMAo2XbJsD+z9WTJU3/AB9mI02y7laKpbkeMp3mP4f7+3J6NvlWqJTPcYxkEjGMYAxjGAU0/jkAU+45WbYy/MV5SlMZ/DIMmzv7PdkEa75ZPbcqyZXT2/q/pyfgDGZt2XSoYxjIJGM85h4e3PcAYxjAGMYwBjGa+7bu1KtoNfnfae3MGWjUDbNMw6GWUjrHWQkcz/eANyQM6NLpcupyxw4YuUpOkl+unm+iW7IcklbIe2/tUq6BAGcCxcmDClUDcrSkdDLKw3MVZSRu+3U9BuTnCHF/EVnUrFi1bk72edyzkbhI1H9rghViTHAi+qq7nYDqSSSb7xbxvFqkstjUKLy2peUSTUdUsVeifYSOC9FcjijA3HKoAG5223y2NDpL7cs+r1Pf39KlqEY/GWC3XcAe8RMfuz9p8OcDx8JxXOEnll8U6UvpFRcmo/RNvd9kvPy5HN+hkXC3EcUa6KRqc2mpp7SG5Sjivu15jfmtvNXFaM1bL2IJYqzrbeIKK4G7JsBj0k2kSGQ9zq9AszFEhmoanBEGO4VY3ipvyLvsF5ydgPW9uVmg8Hw3J4Ia+q0H72RU/OVtUgsgeLmOvLUEM0oUMRGs+7Ebbjxyi1GhSkryWKDXlWCzVrTx6g1V2kF2K3JWmgkqogRv3DMGhYMRzKQ7ANt6kY4I5G4uabq9nHl5pNq04q7k2lzc303vM8OkUWA7rVYkY7epqGm6hVI39hegtyPf/tbffkX9iE7ECCfSrpPgKur6eJD9wr25op2P4R5j2eEb/fnU8eRdJ3/iSf8A8VD94L5qHCGpQ7mXT9RRR/znmVhoD96zohiYfeGOWihfaKSN4ZTFNDIskbxvyyRSRsGRl26hgwH6si021JXJavJLVY7btWlkrudvDd4WU+0/ry+f2c6mV5ZLkttPArqMdfVFI9xGpxTbr93hmM/bVTUZfnH7VP8AeSU2rcTWZ1jQmCCOOUzhKNSrp8bTlShtSJSjQPY5CVDH7IJChQzb9J+Qxrluc67HYs2rMcSaY8K2bEs4iaY6gspjMrEoGEMW4HT1BnObcSo+wm0zRZgCOYx1rOnSH3nfSbUEYO3vQj7s6O8iG3Vkk17zapJSYRaV3oN57kTgvqXdmNZYVkiIPPvzO++67bbHf5bxRBLhmVezquXdctL34+qlv8jXD8aL95auvvWp6fGIq9mG1ddLENoWDFIIYWmjJ81nik3V1BGz9CAfEAjlP8pac/KH0uWJiVXm07VrEe5YgAJDqUFwncnovNudwN86m8tinXlraSLFsUQLsxjdqs9pHbzZgUcVt5IxsSeYK3h4ZzSNcracOXS2axcI/O6vLC8LQ8w2aLRqsw56fQlTbkAmbduQQgjOPwuo/s+CjGTk3Lo5Rj16tqovavN+hbN8RVcS8K6bWEAlu6hRsv1lo2KFe/ZpqVDRvekq2ohWdgR+Z5DKAwLIg8bIOHoXJ7nVdJkA8BZOoadIfxN2msAP4SnLCSSSfWZiST4u7sx3J9rO5JPvJJy/61whPXSwzTUJXq8pv1q1rvbdAGRYGNlOQRsEnkjicwvKI3kUNt459DyyxpRnldvptGuvyut0lcvLezIQ8Fag4JhgW6o366dbo6mTt7k06xK5P3bb/dlp1bTZ639017NPrt+6681Y7+7adF65RPGreKq34gH9uXfTOJL1cAV72oVlGwCQXbMMew8FMUcgQr08CNstL2q7xfpTj97l+4ks/Rh7GH6CMrNKvz1tzWnsVCTuTUnlrEn3kwMp36D9WVus8SWrQ2sPDP1B7xqOnrZGx3A89jrizy+9e82PtBy1xkArzAsu4LANyll3HMoYqeUkbjfY7e4+GUduNTS+V2vul+4GVab2k6xA8Ukd+ZpYiWimsxVb08ZKlCVmvwyyAlWZSd+oYj252T5Pva/Br0PJJ3dfU4EBt1lJCSqCF88qcxJauxK7ruWiZgrEgo78UH8ksv8A6bqP/wDLtVQf6gT0yv0SKKtNWn0/W4K9mGQSQvbpalSkjYAggtXr2oCrKWVleTkZWZTuCQfneLcKwavHSjyTXSSj9pOKaa+u3VeusJuLPo5jMA7G+0eHVoVV5tN/KEab2oKN+C0jBSqm1AiuZo6zM6jaVVZGblPMOV2z/PzLNhnhm4TVNHWne6KWebfoOg9vvOSgMqbEW/UeP7cihi2/H/foMrzJIpTbPIYtvx/Zk3GMo2aJUMYxgDGMYAxjGAMYxgDGQT+BylyyjZVuidYT2/ryGF9vwOQ8x95yHL1tRWytxkuBtx+GTMyNBjGMAYyBpNsiVt8A9zxvuz3GAUbDbxyohTb8cjIz3LOVlUqGMYOVLAnJEk3u/wDPJbuTng+/+TNFEo2eoDv08cq8lxuvs6f7/fkzKyZZDGMZUkYxmBdtPafV0Gv3ku01mUMtGorBZLDqBuznr3VZN1LyEdNwACzKp302my6nLHFii5Sk6SX66eb6JbshtJWyLti7UKWgwo9jmmnlJWrViK9/PsRzybMQEhQEEuxA6gDcsAeKeLtVratYsWrOq3Unlbf93aT+ajUb8kET6fdnKQJuQB3Q8SepLE2DjLiW1qdia1clM08p6nqI40G/dwQJue6gQEgL95JJZmY2fP27w/4YhwzFzcz9tJe9JV/ljzRlt69X18kvPy5XN+hfhw1zDeLUNFn9y+fmg52Ox9XWYau36TkcvBGpgcwo2p08eeki6jHt48xk09pUC/eTmPZ4iAEMAFYHcMvRwR4EMOoOfQOGZdJqvWNv81KK+xkTpjJXdebvasyMkic/PXnjdG5o5E5uV43VlBDDYgr08MuWvcTXLgRbM5lRHeVEWKvXj7yQbPO8dWJFlnI6GVwz7Ejfqcn0+M9TiGyahqHLtt3ctqaxB8vYZ4j4f4uRLxXId++qaPbJO5abSaleQ7+O82mLXlJPvLE/fmE1NyUpY4tx6O918rht/mJIOHOHGsK000qUKMb8kt2dGdGkAJ82pQLs+oXdgT3Mf2R1doxsc94l1itIscNKpHVrRMWEthIZ9VuPtt31y4F3iXbwrQFYl3P2zs2UWv61PbdWnYERp3VeKJFhrVIgdxXqV02SvCOnRRuT1YsxLGPhHT4rNqjDNIYYp7cEEsgKqyrJIqkK0nqI7b8oZt1UsCQQCMylFr/e5n8NtRXRfxlL1f0Se7koaboroZEM0YYGSMSGEyKD6yCUKxjJHTm2O3uy9h9IcHdNbpsd9uSXTtUjX3HleKk5H3c2/wB/ty+WOGlnrzySUYeGpYbkEEHn0+p1691ZY7LT1mbVJJCbsBgiYyR8ke0xDKpKHLHDwVffmMEMV0Dfrp17T
9SJ290dGxJJvt7Cu/Xwzmepw5N3Jwa23kl6+bg+vr5egDaNQYDutXhViRsuoabqNQjf2M9JLkYP3ltvvzo3yItGNeTXj5xQtK8WlBWo3I7PKUbUiRJGNpYftrt3iLv1232O3Luq6VZrf3TWt0+u37rqz1uvu/PovXOjvIEYGTiLYgjutH6g7j7eqe7PC8URl+y8rU3Je715f68ejio/xNcPxr9djI/Looyy1dJMUU0wjvTNIYonkEamq4DSFAeRd+m5zkIZ155dF+WGto7RTTVyb1hXaGaSElfNHJVmjYEp06g9M5to8NRVkjn1Qy143USVKEJCanqCHfkk2cEabQYj+6JVLMFPdI+4Yc3hbN7PhsObu5cqXV+89q/SS3dLcnN8Zi6OVIKkqykMrDbdWUgqw36bggH9GZdJx1yNamqU46N65zeeW0sPYjImmSzaWnSmj5KizTRozB2m2Usi8obMb1u8s8jOsFemhCrHBVRliiRBso5nYyTSbdWlkZmYkknwAoT7dvd0z6DJhhlSc19L+zp016O0ZGQrxXIebvquj2y3i02k068h9+82mJXlY/eWJ+/IW1TT2XZ9LMTe1tP1a3X9vsTU47w2/Tmatw/FsUNKrDo/5ME6a9JDZMgsNp4ljme6khEs51I9wdORCwVigjDLz5hh4X5hvFqGh2Pco1IUJD/2daiq7H8c8/HlwS6JxW3RtL5PlaSfnF7ruXPGg0l+UJPq9Tfx7+lS1CMfjLBbruB94iY/dno4fruSINV0twBvtaXUtPk/XYp+bj9ExyKfgbVFHN5hcmXx56cf5QjA95loGVAPvJzH7amJikoaGQeMcqmKQfijgMP1ZeNT+DJfycZfwv7kF/i4KvycxghiugE9dOvafqJO3uio2ZJd9v3pUHr4ZatV0mzW/uqtbp9dv3XVnq9fd+fReuUDxq3iA34gH9uXXSuIbtYAVrt+qo6Ba12zBGAPZyRSBSv3bbZEvaLun9Gvvcv3EkrhvXJ6c0FmnM0FiB+8hljIJU7bEEH1XjZSVZG3DBiCCDneHYL2tV9eg68lfUIEHntUMdiNwvnVbmO71WJHvKE8rb+qz8Ry8Z3JCDY8yvbb/wB3aVplp+vj+6HrecDwHhIPDK/hjjhadiKzHp1OCzCweGXT7Wp0XU7EMGjktT1mjZTsU7nlYEggg7Z4XGOF/jce8amvhknf0d8u37u3rpjnys+jOM1x2HdrdXX4n5VFS7CN7VNpO8ZFJ2WeCQqvf1zuBzcoKsdmA3UtsfPzXPgnhm8eRVJdUdaae6GMYzIkYxjAGMYwBjGMAYyntSeAH4n+bIq8hPj7Pbk1tZF70TsEYyCcEjp/+8hEskysPYB+O2Qxpvnsab5Uqu2aN0USsIu3hnuMZmXGMZDIehwCSx33ydGOmSMiZyf/ACy7RVMmlxkWUyrvlQuVaolHuMYyCRjGMApZl2P7MIm+Tp13H4f7nJMTbH9uaJ7FGtzxkI9mTKw8cn4yrkTyjGMZUsM01xx5O+n6pZntXNR12WaU7dLGniOGNSe7rwIaB7uBNzsvvLEkszMch7bO1qpw/HXM0U1qxZ73zavCVTmWHk7yWaZ+kMQMka7gMxL9FIDFdMHyuZf4Ei/jd/p+fW8B4LxucPxOgg0pWue4RtXvXM06tdtrXoY5MmPpIyz0T9F+N1z5jTvp+eeidovxuu/Mad9PzFPS5l/gWL+N3/qGPS5l/gWL+N3/AKhn0P7L8Xf3v+Zi/wBRlz4f0mZX6J2i/G678xp30/HonaL8brvzGm/T8h7JvKNk1fUKNFtLSqLRsDvl1FpzH3FOxa/tRqJzb9xy/aG3Nv122y49u/bvLoNyOqmnx3Q9GG4ZHuNXI72azD3YRa77geb777/vvDpnBKPiaOrWjcpe1lHnUefH8NtXd8vVPa7LXirm7dCg9E7Rfjdd+Y076fj0TtG+N135jTvp+bi7O+IDqNLT7ZjEBt1IbJiD94I+9UNyByo5tt/HYZiPbF20adoYZGbzy+V3jpQOOcbjdWty7FakR3HVgWIO6o2xzy8HFOOZ9R+GxTnLIm04qnTTp2+lLu7r1LuONK2tjC/RO0X43XfmNO+n4Pkm6KfG7rh9h/dGndf/APPzHNE8qLULcscNXQFtTyHaOKDUJpJG8AWIFL1Ixv1dtlUdSQM39purXlpz2L9WtVsR15bHmte01pU7uJpAktkxIpkJGx5FKj2M2dfEsviDh7itTkcXLoufG2/pGTdetV6lYezl0X2NV2/Ja0qXk73UuIpii8kZmu0pTGn+Tj7yieROg9UbDplNJ5JWiN9q5rbfjPpp/bp+VnYN29za9cNV9Pipr5lNcEiW3nJ7qSvHyFGhQDfv999/3ub2zk4jr+NcNzew1GSUJJJ0nF7P/DaLQjjmrSNJaV5N9KsAK2tcV1VA2C1tVrwRge7u4qYXb7tszns77O4dKktypb1C7LaiqwyveamWCVHsvFympVh5m3ty7s/MT6o3G2Zpmie3jt6m0G4tWOhFcBpQ2zI9t4CO9ksR8gRYHB27jfff99nJpHxLjGR6bG/aSkrabjG0mn1dd67ky5IbvY2lxxwXU1MV/OVbnqvJNTkUQua07pyCykVmOSCSZBuV7xHUHrtuARqq/wCS1pMzySzajxBPLK5klllt0JJZXPizu9Asx8B19gA9mX/tZ7Zk0ejplg1xYt6jBHLWrd6Y40HcxSzySTchbuozNGuwXdjIo6DcjW3AflVs88aarTgr15HCGzTeY+bcx2DzQScxkiB+0ysCBuQreGejwrhPHXpnn0ilyK1s426bvlTdtXfTq7q2VnPHzU+pldDyVtCRgzz6vZUb7xTW66Rtv4EtVqxygjx6OPv3yXP5KWiFmK2taiBJIRLNJkQexVMtJnIHvZifeTm96lhJFR43SSN0WSN42V45EcBkdHUkMhBBBHQg5bON9ZNGnqNoIJjToXLojLcglNWvJOIy+x5A3d7b7HbfwOeTDjfEp5FFZZczailfe/y6l/ZwXY0r6JWh77+d61v7+/07fw28fyfv4ZH6J+i/Ga58xp/0/Lr2B9t8uv2bNd6EdIQ1DaDpcawXImii5CrQJyj84TvufDLD2leU/XpWJ69GidR83lkgnsS2vNYDLE3JItdVhkedFcMpc8gJQ8vMpDH3Y6XxFPVS0i5nkilKS5oUk+lyvl38rsz5sVc3YqIvJN0RWDLc1xXU7q62NOV1PvVxp+4P4ZkFLsFhjGy67xWyf5OfUqlqD5e1ReL/APjmrfS3s/wPW/jKX+qY9Lez/A9b+Mpf6pnbPw34nn8Ub+c8T/8AsV9rh/SZteDyfdGPN5wr3eYksZKmj03O/jvNpOn1pSfbzc2/Xxy16p5LvD8n9r/KVPrv+57pk/R+7o5uma89Lez/AAPW/jKX+qZnXaL2+zadT4ctJp8U7axSmtvG1t4lrd0lJ+RXEDGXfzvbchfsffnHk4F4hwZYY3alkbUUskKbUXJ9Jcq91PrRKyYmr8vQl1/JS0RWUtZ1qVQwLRvapqkgB3KM0NJZApHQlGVuvQg9
crdc8l7QJ2DRHUqAC7GOpbR43O/2z59DMytt09VgPu365rz0t7P8D1v4yl/qmPS3s/wPW/jKX+qZ1vw14nbun/zMf7ueiPa4f0mbF4X8mzTqE8FmpqOvQWIH543Wzp/4NG6nT9pImXdWQ7gg5u3OTfS3s/wPW/jKX+qZmHY35Qs+s361J9OhqrMlhzKl2SZl7iB5gBG1dQdygHj7c83iXhXjixyz6mFxhFtvnxukt30lb+ReGbHdL+J0FjGM+NNxjGMAYxjAGeM22e5T228B+k/zZKVkN0SGO/6TlXGAo6kfflHnoG/35o1ZmnRVGcfjk3KMQt7tvxysGUkl2LpvuAMYxlSwxkLuBkMUm/8ANk0RZMyCYZHjIJKbIk29uTWQHJbRn8cvdlaJoz3JMI6/tydlWWGMYyAMYxgDJQhGTcZNihjGMgDGMYByZ5ecLecaE2x5TW1BA3sLLLUYrv79mB/TmCdnHGXDFapBFqegy6hcRp++tR9wwmV7EskO/eWEYFInSPbb/mvHOqu32xoS1I118b1ZbKxwMsVp5Y7AjlkR4XpqZYX7uOX1hsCN1O4Yg81z6D2fkkjWOIUBO4Va0pVfuBk0hmI/Ek5+v+G+I49RwnHpcuHUVjbqeGM6lvJ/FDfbmpp7bJ/Lhyxqbaa38ys/4ROCf+i9n/u1f65j/hE4J/6L2f8Au1f65lu/IHAP8NcR/Kn6NkX9j3AH8N8R/Kt9Gz1PYaP+prv/AH/5leaXnH7GweyLj3gxrlXzbS30a73hjpz2IYxGZLCNX7sTQWJBGzrKybyBV9cDfc5gvlu/33g/zLT/ANc1LNOcVx1lntLpz2ZqolZaMlgBbcqDYRu6oibSM+5ACqdivQHcZuPy3f77wf5lp/67qWb6bhGPR8Z0+aE8kva4sm2STlKPLyOt918W6fR2Vc3KDXk10KG52+XIdN03TtNVqPm9CCtausVNl3jjCyLUAJWvH0P507ud+gjIBMfZJ5P+pasVsXTJptORu+aWdS+oXOc8zPFDL1Tn3J76bx5gwWQHfJsvk+27Om6ZqGmSedPYoV7NmjKUSUO8YaRqc3RHXc9IpNiNjs7EhcsPZh2w6vw+5gIksVon5JtN1DvYmrkbbxwPIploOP8AE5SnUnk3O+XfLPS5Y8BeNZuaXteb47t3vLo7+FyuHk0O69pddjtHs94C07R4u6oV0h5tu+lb85ZsEfvrFhvXk6k7LvyrvsoUdMu3E1Zpa9yNBu8tWxEg8N2khdFG/s6kZiXZV2taXragVpu6tBeaWlZKx20A+0yKGK2Ih09eMsBuN+U9Mz7PxHXR1WLUt6pS9qncue7b83fVevfsehHla93ofOfse44n0C35yldJ5FrTUpoLDPAV53iLgsFLRypJAoIKnwYbA9RuT0trf8EVf4wm/q2b17RuCdAKXL2o6ZRm7iCW1am81QzukEZZ2cx7NM4RNhzbnoBmlP7Mezv+Cx/Fdj+nP1D9q8O44/xGTh+XNNJRk4W0vT3Zx+6s4+SePZSSKX0trf8ABFX+MJv6tmmu2Tj2XXbTW5YI6rCpFVWKKRpVCQtM4YyOqlmLTP7B02Hs3O7/AOzHs7/gsfxXY/pzSHbRqWlWLcj6NB5pRFWFFTuTBzSqJGmkEbEkA8yDc7b8h6bbE+/4c0mix6vmw6HLp5cr9+fNVbbbzkrfyM8rlW8kzaHla/3NwV/mef8A8HScsOodi002j6ZqmniSwz1Wk1GoN3l9SaVTaqDxccqjmiHXpuvtXMg8rlCtfgwMCCukWFYHxBEWkgg/fvnQHky/3k0T/wB2f/WZs+fnxnPwvgek1GB/+bkUk+ko8+W0/wCfZ7mqxqeSSfkv3I5p8nHtufR2SrdZ5tKdvUYAySaazncywhd2kqEklohuRuWQb8yv1V2uWo5tF16SJ0lik4f1SSKSNleORH06dkkjdSQ6MpBBHQg5pjylewUy99qGjxfnvWlv0Il/uj2vapIOgs+JeEf2zqy+vusuluz3tVs0KWradLzWaF7TtRqwoW9fT7NqrNEkkHN4V2lcd5F7CS67NzrJGo4TpePcnE+HVHJGUXmxbJ3abf8Ai730mv71plN47hL6Mz3yGf746j/mlv8AW62aS4Ft047FSTUoJ71Mc7W4IJWinn5oJO75ZRIhH59onY84JCt477HdvkM/3x1H/NLf63WzSvAUdEWai6r50lEd4t3zYEWk2ryiMIu24IsdzzDbfYNn12Cv2lxC+Z/7vDtD4/gyfBVPm8qfWjB/DH6/wNq/2Z8Ef9GtS+ek+o4/sz4I/wCjWpfPSfUci7ns/wD8txF/3Jf9jjuez/8Ay3EX/cl/2OeXy4f7LiH+bL/+hpv5x+xD/ZnwR/0a1L56T6jnR2lcDaHrmn6E8mnk1YaCNpkMtm0r1ILMcBMTPDODI3LDCCWZvsePU7859z2f/wCW4i/7kv8Asc617LhUGn6X5iZWpijX8zM/9uMHdr3Rk6D1+XbfpnyXizM9PjxZMH4rHNSdSyyyLrFr3W5unXWu1m2FW3dP5UYn6P8Aw1/Bi/O6j/Wc5I8pPh6rpup369KIVq8UFV4ow8knK0lSORzzzMzHdiT1Ptz6D5wZ5YX9+tU/92pf6jFm3/h7xPV6niUoZss5x9nJ1KcpK+aO9NtWRqopQ2R1GOwDhr+DF+d1H+s5duEuyHRNOnjs0qK17EYkWOQWbkhUSoY3HJNOyHdWI6j25nK+z8BnufBZONa/JFwnnyOLVNOcmmn2abpo6VCK7IYxjPMLDIJW2yPPGUHJQJay+/JoOSHj2yEHbJq+hFlTkEsYb8ffiN98jyOhPUoXQjx/8jkyGbboeo/lypZd/HKaWAjw6j+XLJp9SjTXQqgd8ZLgj2H3nxyZlC4yVJN7sim8DlLloqyrZ74/fk6KL2nJSNtkxZ/eP1ZaVkKifjGMzLjGMYAyGR9hvkWSbQJ22G48TkrqQ+hGkoPt/X0yPLfkSSEeB/oyzgVUyuxlOlj3j9X9ByejA+GVaaLJ2e4xjIJGMYwBjGc3P2jaqlW2HtP5zBwnxXq6SmGqDM0f5CtaRcVFj5QsHn96oAQA7U3LBujYBtjtn7NodfgrwTzz1VhtLbV66xs5ZYZoOUiVSOXaZj4ewZqd/JNoD/0nqX+ip/7LNjU9RvRHRxJ+WKxs68as8WsNoks8lddG1KyBG2jNJCsBmhjPVhJzQnfZSObCeKOM9QVIiLepo/ccSGI0INMcNZr8Rppmli4b0JjjqqJY4i5KKFLM7AAuPc0HiPiOixex0+VxgrdJLv16pmcsUZO2i3L5J1D+E9S/0VP/AGeeHyT6H8J6n/oqf+yzOdU40sRaxp9VpoViMVSpdro8XrW79TVLRnUOvfcsL0dNiUAgEasSw9UZlvZlfmswWHmfvGXWeIqqsQqkQ0tf1KnVj2QAbJBBEm/ieTc7kk52/wC2fF/7eX5R/wBJT2EPI1pwP5NGl0rEE8tm9fMEiTQwz9xHB3sTB43lWGMNKFYKwXmCkjqGHTL52w9hVXXbSWprluq6VIqgSukDIVilnlDkyoT
zb2GH6BmUdrd25XrwtQYiz59W5E5UbzpULzPSPOpCLOIu55x1XvdwdxmuafGmoXG0vzeTVrsNp+OJgNG/IcU8sGm8SUqekzF9ZaOE1Y6djkHI3OxlRiG2Zhxy8S8SlqFqXml7RJxT22T6pKq377FlhjVUbh4I0BNOqUqkbvKlStFWR5AA7rEoUMwXpzbD2Zjvap2U6Zri/uqLu7Kryw3a/LHbiA8FL8pE0QJP5uQMvUkbHrlLrEVsarpMK6lqMdexpup3ZoANOKmTTJtAgiTmNMuEkF2yZAG3LSeqVAAF54w4lKV9YEK24Z6unXp45pKU8dcPDA7I0NiaLuZyG5SApbcKehAOebi1+oxZ/wARjm45Lb5k6dvr+fddC7imq7Gn6/koUkZHTVtVjkRg8bxrVjkjdTurxukYZHB8CCCM3lwdpNipEsVi7NqZTYJPZiijslR+9laABJSOnrFQ3Q8xYnfNVVeNbtetrws2L1OWOpowofliPS3u1J9blsadXvGTSualNpzWljKKzNIrVrPecqGMCtPEN+3R0iWtZv2UT8oVNVn0dNKbUpp9OkfT/wAoxwXk7mekLEE0rxV152M1fkVk50br4hxzW6+KjqZ89dLUbXyaSf3IjjjHobM4z0Nb9W9Vd2iS3UnqO6AF0WxG0TOoboWAbfrmg/RJpfwrqH+gqf8A45m1PX7lq/AsE2rXqTaVw9cWfShodeh+75dQMty3Hqcgt+byxwxPyV+8ZURgBzEc0zgjW7zWKzy3bFiK3r3FeltVljpivBFpl/VVoyV2irrOsiRaekZ55GDCViQSFInhvHtdw+LhpcjgpO2kk7f1TEscZdUYL6JNL+FdQ/0FT/8AHLhw75KumQyxSWLl69GjBzXkWvFDNyncJMY052iPtVSu/hvtuDuHtHNzzSx5g3LaLVxHyvWSZozZhFqOo9wGut5q3frCZh3fetHz+rvmubfEF1k0yOCbiCxIdQ1KC3Xhh0GrrUPc1u/Snba+60ZBD3kbCWJt5EMRBk3Zn75+MeMTi4vPKntsop/mopr6FfYQ8i89tvY9BxA1FpbVin5mllEFeOJ+cWTAW5u8B227hdtvecy7s64YTS6dOnHI86Vo2jWSQKruDI8m7BegPr7dPdmt+L+Lr9O20PnE6wnStDiRbEdV7Md+1avSGSWSuhi55amnXUfbePnSIJtzdcm0pLX5Y1CFtRvyVoNM03Uo6zCj3Peahd12CWJmSoJjAiUq3IOfmHId2bc54+TiepyaaGllO8UG3GO2zd32vu+/cuoJO+5sLNKdr3k7UdWmezXnbSrUp5rDRwLYrWG36zSVueMicjxdHXmPVgx3JouFtb1ixosdgz6zUu26PD4guagvD8tRptTnpxy26FegXkWH8+W5LSIeWRBy8wbluHDvFt3U5EQWbNBLGvmuyxJUNqnFDwnS1GbTg8kUiKyai83O2zNujKG5SNnD+J6nQZfa6abhLpa7ryadpr0aYlBSVMyDsT7IanD6zmKWW3ZsCNZ7EyonqRlikUMSdIouZmY7lmJI3JAUDFO0DyaNM1CxNYhsW9OeeRpZ44VhlrtI5LSSxxyrzRMzEsQG5dz0Ay96hb1LzunWFnV9QVNIoPYsaOvD9WN5pLVqCa/cj1KQFUdYVPJV59uR9lHqg7Xcbg9SNwRuPEfeN/bnXj8Q8Qx6mWqjmksk9pS23XZNVy0u223Yq8Uaqtjmv0SKn8LXvlq2PRIqfwte+WrZl092/BDxe/5U1OdtOuLp9Lmj0xnjWTStCvGZQlJRJbEt2wF5902cAqdgcp7HF10Ub+2oTVDY4hTSNIu6rHp8d2BIREdU7+KGMQBo2qawsRljDFYkZg68rv6H+2vGf7d/5Yf6SvsIeRjPokVP4WvfLVs35wPoK6fUo1EdpVqVYaqyOArSCFAgZlHQEgezNM8fdp15a3n1WV69abgx7wj7uu3mV+7T1C1RuSNIpbnjsUY6nd7spe6u6nl3GbahqN2HU/3RYuw0pbtaDT2rxULGlOr0UV9P1H1fPqmovcM7rN0i281QNzO8b+dxLj+u4hBQ1WRzUXaVRVPp2SLxxxj0RsnNJ9q3k81dauWLkt+3WadIY2ihigZFEMKwjZnHNuQu/wCnPdG4h1FdM1G6Z9XNhNGvTwS3otIbTROOYxTVoqqiw5QoCqzbAqTzAnbJuv6zerO9bz7VLCJxDWq97XgoS6o9WbhybUXroBVEUu1mPn37vn5d1B6DOXh/E9ToMntdNPkk1y2qezp1un5ImUFJUzcwGM0O3E+uSVqskNhpe/0O7ZMnLp/eRVbNuZ9N1yXu17iTUIqEdYvDD+aaSzJshULtkXDOq37N6RWOsyV4jpm71W0JNMi7zS6lqVLi2WGoOzySMT3CMNpVAI2O3CWNrYzRvCPEurWHupXtWHmGma7P/wAbxUIdNgsLqNipoclV68S2JIQ1ewsm/eAJEC3Kzx95nXZRqEzi3FZl1Q2IWrmWtrEWnC9VEsRAcWdK/clyrM0UkivHvyt3q7jlEcYGcZKaX3ZNyWYslUQyUTg5PCgZIJyyZDRNgHjkzPEG22e5VlhjGeMdsgHuMpWkO++T45N/xyXGiEyPKSRdj+zKvJdhen4ZMXTDRBEgI+8e7DQe45DAev7cqcN0yErGMYypYYxjAGMZJnm26AdfvyUrIbojlC+3b+fKR9vZvt9+eMxPj1yKOMn+n2ZdKijdkGVFMeP6P9/5cp8q6o6fiT/R/NiXQR6k3GMZmaDGM8ZtvHAPTmHz8CaY/NzU4TzaZLorEmQM2nzuJJaRYPuYS439467EbnMuEg94yTOvtH6ctH1Kssf9hdJoXhZbLxvKk+8mo6jJYiljGyS17b2DYqyDrs0Toep953mycGUDH3RqxGLzGfTTGeYq1S0VazA4LeuJGRSzH1idzvuTvdoH2/A5U5DVEp2WCbheoRKGgDCa5U1CYs8rNJaoeZinZZy/N3kf5Pp7ddj3A333O8rR+FalaWSaBJonklszuouXWrmW5K89mQVHnNdXeWSRyQg6uSNt8yTIGjH4ZKYop7VOObu+8Tm7qVJ49yRyyR78jjY9SNz0PTrmP/8AB1pgWuiV2rrW8+8380t3abRDUrCW7yK9WZG7uWeONyhPKCi7AbDMqRds9ypJbpdMrq8E7qokq1rFeGaR2JhgnNd7Cs7t1VjTrks25/NA7+O8N6zUsQyrJLXlrzQOkv55O7khlSRX3dW+wyJL1B8Eb3HJ2vUTPDZiDBDNXmhDFecIZY2QMU3HOBzb7bjfbxGWDVuEjYBZ2gSVqWpVj3MTpEJLrN3E5Xn9eSFJrac52LG5MRyd4y5vihja9+Vf9v4vb0IZJ0rQ9Fh5O7NUvI0FlJJbzWZ5vMO9NZu/nmaWaCAmcqhYoh5yADuch1PQtEtchdqxZ3t2o5IL715m73l8/Mc9WdJPNpPNV72NW7tu43dTy55W4MkXvfzwPe1rEDfnLeyGefUZgWUzEXNvPwN5tzvEWHKW6VNHhN45ObvlkjaZZZTKskk791LbkhQyM/XZbKRlm39SuB
t6241liwK6m35bdf1+vMi2RT8MaX3yOAIJooqlYLWvWaarFUbepBJWqzpG8aG2AqMpG1gDwYAx8L8O6Wsj2qawyO8lp+9jsyWo0kuzec3TArStFWaaVg790F5iRvv0yjHAoXzMpKENc7ElXkaWM21tPCzu5blLRVvEnrXT2DbL7wnpj1YIonbnMaKnN3s8vMFRV35rLs43IPq77D2ZnkhiUbjK3fSq233/AHEqyDVRRvRPFK9azCzAsFnXYPXlidXSSNw0U0U3cMHUhkfuyCDynLA+g6J3dePvYkWOzP3Lrq1iO09qRQtnvLqWhYs2WV1Dd47MQV39mVWlcHlBVEjxMK9wWESJJVjESVEiSspkleQwi1DBZVHZwncRINxEhFRofDckKxK0kREVtZ0WJJBGiJVNfuo+9kd0TmZnVCzCNSI19VQcvLHgV1Jv9P069PT1ItlNPomjTqwfzWwAtCKRpLhmk/4mmazSEkzTFzJDLKz8zHmJkPMTvl8oQVHmsTxGGSwY4qNqSOQO6pVksSxV5QrEIyPbsHYgH86d/ZmNycC95Gkcrhu6htJCxksycskywLFMFklIjVO5P5pfU9foBl/4e0l4GslnQrI/NGiLJsn5yaRmDTOzKG70DulPIpQlQOdgIyY8KjcZNvyr1/luTbLHpPDmjVYzFG6LBHJUrdzLqlmaGvJUlSWnXjisWWWs6PDHsihT+bA2IG2QXNA0LksqTUgB1SW/YeG81SaLUp45Ip5xZrzpLVtSRrOjBGUsplBBBcGbqfCM0sckfe1lBltFG83fvO6tx6nG4kfvfWdBqW6jbYmF/DvfzcK8C7yOzzuyPamsOqtPHIFmbWC0MciS/mk21RPsAHeJ+p5lCX9lp63m/wAv+hFs8v8AD2irybtDU8ygr0AK2pT6etaGPmetVlSpYjUIN3Kq49+2ZTc1WvEHMs8ESoVVzJNGioXHMiuWYBSR1APiMxeXgpz3oWx3Kl5mgaBXjmi75tTl351cdVfUVAA2BWA7/b2WfV4RZPPSJhvbhjidBEohhFYlKggC7PyrA3I3OzblFI5R0yHiwf1/t8vTyv8AcLZPsQ6UV1FXkp8lmSG5qQNpAHZ4KteCeY95+aVoq1RVPQEIu2+/WdX0XTu/aVI67WRbe8xWTmdbRoQ6fJZ7sPss3mUkERYAepMv+PuWpaNPJJNIrwLuaLQhlkPWnNJKRKQw6N3rgFfDYHr4Zb+GODPMpe9WUOpqrXdRFyF3SDTq4sMQx9cx6fGp8eixjf1etVjw8rfNvWy626W3pvYtlRqHAulzxWIJakUsFiLuZ4pGkKSR+dy3gmxf1QLM0kg5duUkbbAACqk4QotY86MH7o7xZiRLOIXmSJa6WpKgk83kuLCqxidkMiqoUMAAMumejMOUWU0eg1RXNXuY2qNA9Zq7jvInhlVlkidZN+dGVmBB33Byi0rg6jXEYjhO8dvz9ZJp7Fmw1nzZqQsS2bEjzTyCsxhHeM2yKqjYKoF4DnI45N8q0TZaanCtKNWRIFVGgnrFQ8hHc2ZpJ5oVBb1IzJK5AG3KCAuwAAkpwbSWUTrHNHKDAT3V29FE5rRpFD3teOcQzcscaL66ncKAd8yDGQSWJuDqBChqsLqK9+pyuGdGr6nKk1+vIrkiWGaSNGZG3Hqjwyfw1w5WoiQV0dTKweV5rFi3PIVXlQPYtyPKyIvqqhblUdFAGXbGAMYxgHjjfJSxncZOxkpihjGMgDGeMdslmcffkpEWQSxbeHh+zJYOTTP92SmP6PwzRX3KOioik3/H9uTMosqoG3H8mUlGiyZGBjGMqWGM4A9NXiH4Ph/5XUfqOPTV4h+D4f8AldR+o4B3/jOAPTV4h+D4f+V1H6jj01eIfg+H/ldR+o4B3/kqzHv4eIzgX01eIfg+H/ldR+o49NXiH4Ph/wCV1H6jhOiGrO+I6/v6/d/v45MlOwP4ZwH6avEPwfD/AMrqP1HIZPLT4hPTzPQPldR+o5Niq6HeOV0Q2A/DPn56ZvEHwegfK6j9Qyd6avEPwfD/AMrqP1HJk7IiqO/8ZwB6avEPwfD/AMrqP1HHpq8Q/B8P/K6j9RypY7/yXOpO23vzgT01eIfg+H/ldR+o49NXiH4Ph/5XUfqOEwd6FD7j+rPNs4M9NXiH4Ph/5XUfqOQP5aXEJ/wPQPldR+oZdSK8p3rlVCegz5/jyz+IPg9A+V1H6jkz01eIfg+H/ldR+o5EnYSo7/xnAHpq8Q/B8P8Ayuo/UcemrxD8Hw/8rqP1HKljv/GcAemrxD8Hw/8AK6j9Rx6avEPwfD/yuo/UcA7/AMZwB6avEPwfD/yuo/UcemrxD8Hw/wDK6j9RwDv/ABnAHpq8Q/B8P/K6j9Rx6avEPwfD/wArqP1HAO/8ZwB6avEPwfD/AMrqP1HHpq8Q/B8P/K6j9RwDv/GcAemrxD8Hw/8AK6j9Rx6avEPwfD/yuo/UcA7/AMZwB6avEPwfD/yuo/UcemrxD8Hw/wDK6j9RwDv/ABnAHpq8Q/B8P/K6j9Rx6avEPwfD/wArqP1HAO/8ZwB6avEPwfD/AMrqP1HHpq8Q/B8P/K6j9RwDv/GcAemrxD8Hw/8AK6j9Rx6avEPwfD/yuo/UcA7+IyS5HszgV/LT4hP+B6B8rqP1HPPTQ4h+D0D5XUfqOWRDO+cii8c4E9NDiD4PQPldR+o56vlpcQj/AAPQPldR+o5LaISPoDjOAPTV4h+D4f8AldR+o49NXiH4Ph/5XUfqOULHf+M4A9NXiH4Ph/5XUfqOPTV4h+D4f+V1H6jgHf8AjOAPTV4h+D4f+V1H6jj01eIfg+H/AJXUfqOAd/4zgD01eIfg+H/ldR+o49NXiH4Ph/5XUfqOAd/4zgD01eIfg+H/AJXUfqOPTV4h+D4f+V1H6jgHfzjofwOUoU+4/qzgv01eIfg+H/ldR+o49NXiH4Ph/wCV1H6jkp0Q1Z3sIj+H45C67ZwX6avEPwfD/wArqP1HJZ8tDiD4PQPldR+oZZSIcTvdRvlWq7Z8/k8tHiEf4HoHyuo/Ucj9NXiH4Ph/5XUfqORJ2SlR3/jOAPTV4h+D4f8AldR+o49NXiH4Ph/5XUfqOVJOZsYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAMYxgDGMYAxjGAf/9k=", - "text/html": [ - "\n", - " \n", - " " - ], - 
"text/plain": [ - "" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import YouTubeVideo\n", - "\n", - "top_match = context.iloc[0]\n", - "YouTubeVideo(top_match[\"url\"].split(\"/\")[-1], start=int(top_match[\"start\"]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78b7eb11", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From bfe8fccfab0038c56b634b878ff18d5f961dffc2 Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Thu, 29 Aug 2024 15:16:27 +0530 Subject: [PATCH 28/34] docs: add hnsw docs (#1570) --- docs/mkdocs.yml | 8 ++- docs/src/concepts/index_hnsw.md | 87 +++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 2 deletions(-) create mode 100644 docs/src/concepts/index_hnsw.md diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 0b6be3e5..77dcb19d 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -84,7 +84,9 @@ nav: - πŸƒπŸΌβ€β™‚οΈ Quick start: basic.md - πŸ“š Concepts: - Vector search: concepts/vector_search.md - - Indexing: concepts/index_ivfpq.md + - Indexing: + - IVFPQ: concepts/index_ivfpq.md + - HNSW: concepts/index_hnsw.md - Storage: concepts/storage.md - Data management: concepts/data_management.md - πŸ”¨ Guides: @@ -193,7 +195,9 @@ nav: - Quick start: basic.md - Concepts: - Vector search: concepts/vector_search.md - - Indexing: concepts/index_ivfpq.md + - Indexing: + - IVFPQ: concepts/index_ivfpq.md + - HNSW: concepts/index_hnsw.md - Storage: concepts/storage.md - Data management: concepts/data_management.md - Guides: diff --git a/docs/src/concepts/index_hnsw.md b/docs/src/concepts/index_hnsw.md new file mode 100644 index 00000000..9e8dc948 --- /dev/null +++ b/docs/src/concepts/index_hnsw.md @@ -0,0 +1,87 @@ + +# Understanding HNSW index + +Approximate Nearest Neighbor (ANN) search is a method for finding data points near a given point in a dataset, though not always the exact nearest one. HNSW is one of the most accurate and fastest Approximate Nearest Neighbour search algorithms, It’s beneficial in high-dimensional spaces where finding the same nearest neighbor would be too slow and costly + +[Jump to usage](#usage) +There are three main types of ANN search algorithms: + +* **Tree-based search algorithms**: Use a tree structure to organize and store data points. +* * **Hash-based search algorithms**: Use a specialized geometric hash table to store and manage data points. These algorithms typically focus on theoretical guarantees, and don't usually perform as well as the other approaches in practice. +* **Graph-based search algorithms**: Use a graph structure to store data points, which can be a bit complex. + +HNSW is a graph-based algorithm. All graph-based search algorithms rely on the idea of a k-nearest neighbor (or k-approximate nearest neighbor) graph, which we outline below. +HNSW also combines this with the ideas behind a classic 1-dimensional search data structure: the skip list. 
+
+## k-Nearest Neighbor Graphs and k-Approximate Nearest Neighbor Graphs
+
+The k-nearest neighbor graph actually predates its use for ANN search. Its construction is quite simple:
+* Each vector in the dataset is given an associated vertex.
+* Each vertex has outgoing edges to its k nearest neighbors, that is, the k closest other vertices by Euclidean distance between the corresponding vectors. This can be thought of as a "friend list" for the vertex.
+* For some applications (including nearest-neighbor search), the incoming edges are also added.
+
+Eventually, it was realized that the following greedy search method over such a graph typically results in good approximate nearest neighbors:
+* Given a query vector, start at some fixed "entry point" vertex (e.g. the approximate center node).
+* Look at that vertex's neighbors. If any of them are closer to the query vector than the current vertex, then move to the closest such neighbor.
+* Repeat until a local optimum is found.
+
+The above algorithm also generalizes to e.g. the top 10 approximate nearest neighbors.
+
+Computing a k-nearest neighbor graph is actually quite slow, taking quadratic time in the dataset size. It was quickly realized that near-identical performance can be achieved using a k-approximate nearest neighbor graph. That is, instead of obtaining the exact k nearest neighbors for each vertex, an approximate nearest neighbor search data structure is used to build the graph much faster.
+In fact, no separate data structure is needed: the graph can be built "incrementally".
+That is, if you start with a k-ANN graph for n-1 vertices, you can extend it to a k-ANN graph for n vertices by using the graph itself to obtain the k approximate nearest neighbors of the new vertex.
+
+One downside of k-NN and k-ANN graphs alone is that one must typically build them with a large value of k to get decent results, resulting in a large index.
+
+## HNSW: Hierarchical Navigable Small Worlds
+
+HNSW builds on k-ANN graphs in two main ways:
+* Instead of getting the k-approximate nearest neighbors for a large value of k, it sparsifies the k-ANN graph using a carefully chosen "edge pruning" heuristic, allowing the number of edges per vertex to be limited to a relatively small constant.
+* The "entry point" vertex is chosen dynamically using a recursively constructed data structure on a subset of the data, similarly to a skip list.
+
+This recursive structure can be thought of as separating into layers:
+* At the bottom-most layer, a k-ANN graph on the whole dataset is present.
+* At the second layer, a k-ANN graph on a fraction of the dataset (e.g. 10%) is present.
+* At the Lth layer, a k-ANN graph is present over a (constant) fraction (e.g. 10%) of the vectors present in the (L-1)th layer.
+
+Then the greedy search routine operates as follows:
+* At the top layer (using an arbitrary vertex as an entry point), use the greedy local search routine on the k-ANN graph to get an approximate nearest neighbor at that layer.
+* Using the approximate nearest neighbor found in the previous layer as an entry point, find an approximate nearest neighbor in the next layer with the same method.
+* Repeat until the bottom-most layer is reached. Then use the entry point to find multiple nearest neighbors (e.g. the top 10).
+
+## Usage
+
+We can combine the above concepts to understand how to build and query an HNSW index in LanceDB.
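+
+Before turning to the LanceDB API, here is a minimal, illustrative sketch of the greedy layered descent described above. This is not LanceDB's implementation: `layers` is a hypothetical list of adjacency dicts (top, sparsest layer first; every upper-layer vertex also appears in the layers below it) and `vectors` is a dict from vertex id to its vector.
+
+```python
+import numpy as np
+
+def hnsw_greedy_search(layers, vectors, query, entry):
+    # Descend layer by layer; each layer's local optimum seeds the next layer.
+    current = entry
+    for graph in layers:
+        improved = True
+        while improved:
+            improved = False
+            for neighbor in graph[current]:
+                # Greedily move to any neighbor that is closer to the query.
+                if np.linalg.norm(vectors[neighbor] - query) < np.linalg.norm(
+                    vectors[current] - query
+                ):
+                    current, improved = neighbor, True
+    return current  # approximate nearest neighbor at the bottom layer
+```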
+
+### Construct index
+
+```python
+import lancedb
+import numpy as np
+
+uri = "/tmp/lancedb"
+db = lancedb.connect(uri)
+
+# Create 10,000 sample vectors
+data = [
+    {"vector": row, "item": f"item {i}"}
+    for i, row in enumerate(np.random.random((10_000, 1536)).astype('float32'))
+]
+
+# Add the vectors to a table
+tbl = db.create_table("my_vectors", data=data)
+
+# Create and train the HNSW index for a 1536-dimensional vector.
+# Make sure you have enough data in the table for an effective training step.
+tbl.create_index(index_type="IVF_HNSW_SQ")
+```
+
+### Query the index
+
+```python
+# Search using a random 1536-dimensional embedding
+tbl.search(np.random.random(1536)) \
+    .limit(2) \
+    .to_pandas()
+```

From 15214351935682e16f570dc14f88b66d15a3d476 Mon Sep 17 00:00:00 2001
From: BubbleCal
Date: Thu, 29 Aug 2024 23:43:46 +0800
Subject: [PATCH 29/34] fix: specify column to search for FTS (#1572)

Before this we ignored the `fts_columns` parameter, and for now we
support searching on only one column; it could lead to an error if we
have multiple indexed columns for FTS.

---------

Signed-off-by: BubbleCal

---
 python/python/lancedb/query.py        |  36 ++++++---
 python/python/lancedb/remote/table.py |   4 +-
 python/python/lancedb/table.py        |   5 +-
 python/python/tests/test_fts.py       | 105 ++++++++++++++++++++++++--
 python/src/index.rs                   |   7 ++
 5 files changed, 136 insertions(+), 21 deletions(-)

diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py
index 53bcb434..9c33740a 100644
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -132,8 +132,8 @@ class LanceQueryBuilder(ABC):
         query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
         query_type: str,
         vector_column_name: str,
-        ordering_field_name: str = None,
-        fts_columns: Union[str, List[str]] = None,
+        ordering_field_name: Optional[str] = None,
+        fts_columns: Union[str, List[str]] = [],
     ) -> LanceQueryBuilder:
         """
         Create a query builder based on the given query and query type.
@@ -156,7 +156,9 @@
 
         if query_type == "hybrid":
             # hybrid fts and vector query
-            return LanceHybridQueryBuilder(table, query, vector_column_name)
+            return LanceHybridQueryBuilder(
+                table, query, vector_column_name, fts_columns=fts_columns
+            )
 
         # remember the string query for reranking purpose
         str_query = query if isinstance(query, str) else None
@@ -168,7 +170,9 @@
         )
 
         if query_type == "hybrid":
-            return LanceHybridQueryBuilder(table, query, vector_column_name)
+            return LanceHybridQueryBuilder(
+                table, query, vector_column_name, fts_columns=fts_columns
+            )
 
         if isinstance(query, str):
             # fts
             return LanceFtsQueryBuilder(
                 table,
                 query,
                 ordering_field_name=ordering_field_name,
+                fts_columns=fts_columns,
             )
 
         if isinstance(query, list):
@@ -693,8 +698,8 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
         self,
         table: "Table",
         query: str,
-        ordering_field_name: str = None,
-        fts_columns: Union[str, List[str]] = None,
+        ordering_field_name: Optional[str] = None,
+        fts_columns: Union[str, List[str]] = [],
     ):
         super().__init__(table)
         self._query = query
@@ -887,10 +892,18 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
     in the `rerank` method to convert the scores to ranks and then normalize them.
""" - def __init__(self, table: "Table", query: str, vector_column: str): + def __init__( + self, + table: "Table", + query: str, + vector_column: str, + fts_columns: Union[str, List[str]] = [], + ): super().__init__(table) vector_query, fts_query = self._validate_query(query) - self._fts_query = LanceFtsQueryBuilder(table, fts_query) + self._fts_query = LanceFtsQueryBuilder( + table, fts_query, fts_columns=fts_columns + ) vector_query = self._query_to_vector(table, vector_query, vector_column) self._vector_query = LanceVectorQueryBuilder(table, vector_query, vector_column) self._norm = "score" @@ -1386,7 +1399,7 @@ class AsyncQuery(AsyncQueryBase): ) def nearest_to_text( - self, query: str, columns: Union[str, List[str]] = None + self, query: str, columns: Union[str, List[str]] = [] ) -> AsyncQuery: """ Find the documents that are most relevant to the given text query. @@ -1410,9 +1423,8 @@ class AsyncQuery(AsyncQueryBase): """ if isinstance(columns, str): columns = [columns] - return AsyncQuery( - self._inner.nearest_to_text({"query": query, "columns": columns}) - ) + self._inner.nearest_to_text({"query": query, "columns": columns}) + return self class AsyncVectorQuery(AsyncQueryBase): diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py index 596e7b81..4f5f6a0c 100644 --- a/python/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -15,7 +15,7 @@ import logging import uuid from concurrent.futures import Future from functools import cached_property -from typing import Dict, Iterable, Optional, Union, Literal +from typing import Dict, Iterable, List, Optional, Union, Literal import pyarrow as pa from lance import json_to_schema @@ -268,6 +268,7 @@ class RemoteTable(Table): query: Union[VEC, str], vector_column_name: Optional[str] = None, query_type="auto", + fts_columns: Optional[Union[str, List[str]]] = None, ) -> LanceVectorQueryBuilder: """Create a search query to find the nearest neighbors of the given query vector. We currently support [vector search][search] @@ -338,6 +339,7 @@ class RemoteTable(Table): query, query_type, vector_column_name=vector_column_name, + fts_columns=fts_columns, ) def _execute_query( diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 26ab53a1..46df91c2 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -545,7 +545,7 @@ class Table(ABC): vector_column_name: Optional[str] = None, query_type: str = "auto", ordering_field_name: Optional[str] = None, - fts_columns: Union[str, List[str]] = None, + fts_columns: Optional[Union[str, List[str]]] = None, ) -> LanceQueryBuilder: """Create a search query to find the nearest neighbors of the given query vector. We currently support [vector search][search] @@ -1425,7 +1425,7 @@ class LanceTable(Table): vector_column_name: Optional[str] = None, query_type: str = "auto", ordering_field_name: Optional[str] = None, - fts_columns: Union[str, List[str]] = None, + fts_columns: Optional[Union[str, List[str]]] = None, ) -> LanceQueryBuilder: """Create a search query to find the nearest neighbors of the given query vector. 
         We currently support [vector search][search]
@@ -338,6 +339,7 @@ class RemoteTable(Table):
             query,
             query_type,
             vector_column_name=vector_column_name,
+            fts_columns=fts_columns,
         )

     def _execute_query(

diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py
index 26ab53a1..46df91c2 100644
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -545,7 +545,7 @@ class Table(ABC):
         vector_column_name: Optional[str] = None,
         query_type: str = "auto",
         ordering_field_name: Optional[str] = None,
-        fts_columns: Union[str, List[str]] = None,
+        fts_columns: Optional[Union[str, List[str]]] = None,
     ) -> LanceQueryBuilder:
         """Create a search query to find the nearest neighbors of the given query vector.
         We currently support [vector search][search]
@@ -1425,7 +1425,7 @@ class LanceTable(Table):
         vector_column_name: Optional[str] = None,
         query_type: str = "auto",
         ordering_field_name: Optional[str] = None,
-        fts_columns: Union[str, List[str]] = None,
+        fts_columns: Optional[Union[str, List[str]]] = None,
     ) -> LanceQueryBuilder:
         """Create a search query to find the nearest neighbors of the given query vector.
         We currently support [vector search][search]
@@ -1505,6 +1505,7 @@ class LanceTable(Table):
             query_type,
             vector_column_name=vector_column_name,
             ordering_field_name=ordering_field_name,
+            fts_columns=fts_columns,
         )

     @classmethod

diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py
index 9cfda85a..54ba9cf4 100644
--- a/python/python/tests/test_fts.py
+++ b/python/python/tests/test_fts.py
@@ -29,14 +29,26 @@ def table(tmp_path) -> ldb.table.LanceTable:
     db = ldb.connect(tmp_path)
     vectors = [np.random.randn(128) for _ in range(100)]

-    nouns = ("puppy", "car", "rabbit", "girl", "monkey")
+    text_nouns = ("puppy", "car")
+    text2_nouns = ("rabbit", "girl", "monkey")
     verbs = ("runs", "hits", "jumps", "drives", "barfs")
     adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
     adj = ("adorable", "clueless", "dirty", "odd", "stupid")
     text = [
         " ".join(
             [
-                nouns[random.randrange(0, 5)],
+                text_nouns[random.randrange(0, len(text_nouns))],
                 verbs[random.randrange(0, 5)],
                 adv[random.randrange(0, 5)],
                 adj[random.randrange(0, 5)],
             ]
         )
         for _ in range(100)
     ]
+    text2 = [
+        " ".join(
+            [
+                text2_nouns[random.randrange(0, len(text2_nouns))],
+                verbs[random.randrange(0, 5)],
+                adv[random.randrange(0, 5)],
+                adj[random.randrange(0, 5)],
+            ]
+        )
+        for _ in range(100)
+    ]
     count = [random.randint(1, 10000) for _ in range(100)]
     table = db.create_table(
         "test",
         data={
             "vector": vectors,
             "id": [i % 2 for i in range(100)],
             "text": text,
-            "text2": text,
+            "text2": text2,
             "nested": [{"text": t} for t in text],
             "count": count,
         },
@@ -66,14 +78,26 @@ async def async_table(tmp_path) -> ldb.table.AsyncTable:
     db = await ldb.connect_async(tmp_path)
     vectors = [np.random.randn(128) for _ in range(100)]

-    nouns = ("puppy", "car", "rabbit", "girl", "monkey")
+    text_nouns = ("puppy", "car")
+    text2_nouns = ("rabbit", "girl", "monkey")
     verbs = ("runs", "hits", "jumps", "drives", "barfs")
     adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
     adj = ("adorable", "clueless", "dirty", "odd", "stupid")
     text = [
         " ".join(
             [
-                nouns[random.randrange(0, 5)],
+                text_nouns[random.randrange(0, len(text_nouns))],
                 verbs[random.randrange(0, 5)],
                 adv[random.randrange(0, 5)],
                 adj[random.randrange(0, 5)],
             ]
         )
         for _ in range(100)
     ]
+    text2 = [
+        " ".join(
+            [
+                text2_nouns[random.randrange(0, len(text2_nouns))],
+                verbs[random.randrange(0, 5)],
+                adv[random.randrange(0, 5)],
+                adj[random.randrange(0, 5)],
+            ]
+        )
+        for _ in range(100)
+    ]
     count = [random.randint(1, 10000) for _ in range(100)]
     table = await db.create_table(
         "test",
         data={
             "vector": vectors,
             "id": [i % 2 for i in range(100)],
             "text": text,
-            "text2": text,
+            "text2": text2,
             "nested": [{"text": t} for t in text],
             "count": count,
         },
@@ -142,12 +166,81 @@ def test_search_fts(table, use_tantivy):
     assert len(results) == 5


+def test_search_fts_specify_column(table):
+    table.create_fts_index("text", use_tantivy=False)
+    table.create_fts_index("text2", use_tantivy=False)
+
+    results = table.search("puppy", fts_columns="text").limit(5).to_list()
+    assert len(results) == 5
+
+    results = table.search("rabbit", fts_columns="text2").limit(5).to_list()
+    assert len(results) == 5
+
+    try:
+        # we can only specify one column for now
+        table.search("puppy", fts_columns=["text", "text2"]).limit(5).to_list()
+        assert False
+    except Exception:
+        pass
+
+    try:
+        # have to specify a column because we have two fts indices
+        table.search("puppy").limit(5).to_list()
+        assert False
+    except Exception:
+        pass
+
+
+@pytest.mark.asyncio
 async def test_search_fts_async(async_table):
+    async_table = await async_table
     await async_table.create_index("text", config=FTS())
     results = await async_table.query().nearest_to_text("puppy").limit(5).to_list()
     assert len(results) == 5


+@pytest.mark.asyncio
+async def test_search_fts_specify_column_async(async_table):
+    async_table = await async_table
+    await async_table.create_index("text", config=FTS())
+    await async_table.create_index("text2", config=FTS())
+
+    results = (
+        await async_table.query()
+        .nearest_to_text("puppy", columns="text")
+        .limit(5)
+        .to_list()
+    )
+    assert len(results) == 5
+
+    results = (
+        await async_table.query()
+        .nearest_to_text("rabbit", columns="text2")
+        .limit(5)
+        .to_list()
+    )
+    assert len(results) == 5
+
+    try:
+        # we can only specify one column for now
+        await (
+            async_table.query()
+            .nearest_to_text("rabbit", columns="text2")
+            .limit(5)
+            .to_list()
+        )
+        assert False
+    except Exception:
+        pass
+
+    try:
+        # have to specify a column because we have two fts indices
+        await async_table.query().nearest_to_text("puppy").limit(5).to_list()
+        assert False
+    except Exception:
+        pass
+
+
 def test_search_ordering_field_index_table(tmp_path, table):
     table.create_fts_index("text", ordering_field_names=["count"], use_tantivy=True)
     rows = (

diff --git a/python/src/index.rs b/python/src/index.rs
index 884b2987..5a857561 100644
--- a/python/src/index.rs
+++ b/python/src/index.rs
@@ -98,6 +98,13 @@ impl Index {
             inner: Mutex::new(Some(LanceDbIndex::LabelList(Default::default()))),
         })
     }
+
+    #[staticmethod]
+    pub fn fts() -> PyResult<Self> {
+        Ok(Self {
+            inner: Mutex::new(Some(LanceDbIndex::FTS(Default::default()))),
+        })
+    }
 }

 #[pyclass(get_all)]

From dc72ece847bfced72cd7bf664cde595b5abee865 Mon Sep 17 00:00:00 2001
From: Ayush Chaurasia
Date: Fri, 30 Aug 2024 17:37:58 +0530
Subject: [PATCH 30/34] feat!: better api for manual hybrid queries (#1575)

Currently, the only documented way of performing hybrid search is by
using the embedding API and passing string queries that get automatically
embedded. There are use cases where users might like to pass vectors and
text manually instead. This ticket contains more information and
historical context - https://github.com/lancedb/lancedb/issues/937

This breaks an undocumented pathway that allowed passing (vector, text)
tuple queries, which was intended to be temporary, so this is marked as a
breaking change. For all practical purposes, this should not really
impact most users.

### usage
```
results = (
    table.search(query_type="hybrid")
    .vector(vector_query)
    .text(text_query)
    .limit(5)
    .to_pandas()
)
```

---
 docs/src/hybrid_search/hybrid_search.md |  13 ++
 python/python/lancedb/query.py          | 214 +++++++++++-------------
 python/python/tests/test_rerankers.py   |  20 ++-
 3 files changed, 131 insertions(+), 116 deletions(-)

diff --git a/docs/src/hybrid_search/hybrid_search.md b/docs/src/hybrid_search/hybrid_search.md
index 244e8740..1503a07b 100644
--- a/docs/src/hybrid_search/hybrid_search.md
+++ b/docs/src/hybrid_search/hybrid_search.md
@@ -43,6 +43,19 @@ table.create_fts_index("text")
 # hybrid search with default re-ranker
 results = table.search("flower moon", query_type="hybrid").to_pandas()
 ```
+!!! Note
+    You can also pass the vector and text query manually. This is useful if you're not using the embedding API or if you're using a separate embedder service.
+
+### Explicitly passing the vector and text query
+```python
+vector_query = [0.1, 0.2, 0.3, 0.4, 0.5]
+text_query = "flower moon"
+results = (
+    table.search(query_type="hybrid")
+    .vector(vector_query)
+    .text(text_query)
+    .limit(5)
+    .to_pandas()
+)
+```

 By default, LanceDB uses `LinearCombinationReranker(weight=0.7)` to combine and rerank the results of semantic and full-text search. You can customize the hyperparameters as needed or write your own custom reranker. Here's how you can use any of the available rerankers:

diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py
index 9c33740a..7e7538b0 100644
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -34,7 +34,6 @@ import pydantic

 from . import __version__
 from .arrow import AsyncRecordBatchReader
-from .common import VEC
 from .rerankers.base import Reranker
 from .rerankers.linear_combination import LinearCombinationReranker
 from .util import safe_import_pandas
@@ -43,6 +42,7 @@ if TYPE_CHECKING:
     import PIL
     import polars as pl

+    from .common import VEC
     from ._lancedb import Query as LanceQuery
     from ._lancedb import VectorQuery as LanceVectorQuery
     from .pydantic import LanceModel
@@ -151,15 +151,16 @@ class LanceQueryBuilder(ABC):
         vector_column_name: str
             The name of the vector column to use for vector search.
         """
-        if query is None:
-            return LanceEmptyQueryBuilder(table)
-
+        # Check hybrid search first as it supports empty query pattern
         if query_type == "hybrid":
             # hybrid fts and vector query
             return LanceHybridQueryBuilder(
                 table, query, vector_column_name, fts_columns=fts_columns
             )

+        if query is None:
+            return LanceEmptyQueryBuilder(table)
+
         # remember the string query for reranking purpose
         str_query = query if isinstance(query, str) else None
@@ -206,8 +207,6 @@
         elif query_type == "auto":
             if isinstance(query, (list, np.ndarray)):
                 return query, "vector"
-            if isinstance(query, tuple):
-                return query, "hybrid"
             else:
                 conf = table.embedding_functions.get(vector_column_name)
                 if conf is not None:
@@ -238,6 +237,8 @@
         self._where = None
         self._prefilter = False
         self._with_row_id = False
+        self._vector = None
+        self._text = None

     @deprecation.deprecated(
         deprecated_in="0.3.1",
@@ -465,6 +466,36 @@
             },
         ).explain_plan(verbose)

+    def vector(self, vector: Union[np.ndarray, list]) -> LanceQueryBuilder:
+        """Set the vector to search for.
+
+        Parameters
+        ----------
+        vector: np.ndarray or list
+            The vector to search for.
+
+        Returns
+        -------
+        LanceQueryBuilder
+            The LanceQueryBuilder object.
+        """
+        raise NotImplementedError
+
+    def text(self, text: str) -> LanceQueryBuilder:
+        """Set the text to search for.
+
+        Parameters
+        ----------
+        text: str
+            The text to search for.
+
+        Returns
+        -------
+        LanceQueryBuilder
+            The LanceQueryBuilder object.
+        """
+        raise NotImplementedError
+
     @abstractmethod
     def rerank(self, reranker: Reranker) -> LanceQueryBuilder:
         """Rerank the results using the specified reranker.
@@ -895,40 +926,70 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
     def __init__(
         self,
         table: "Table",
-        query: str,
-        vector_column: str,
+        query: str = None,
+        vector_column: str = None,
         fts_columns: Union[str, List[str]] = [],
     ):
         super().__init__(table)
-        vector_query, fts_query = self._validate_query(query)
-        self._fts_query = LanceFtsQueryBuilder(
-            table, fts_query, fts_columns=fts_columns
-        )
-        vector_query = self._query_to_vector(table, vector_query, vector_column)
-        self._vector_query = LanceVectorQueryBuilder(table, vector_query, vector_column)
+        self._query = query
+        self._vector_column = vector_column
+        self._fts_columns = fts_columns
         self._norm = "score"
         self._reranker = LinearCombinationReranker(weight=0.7, fill=1.0)
+        self._nprobes = None
+        self._refine_factor = None

-    def _validate_query(self, query):
-        # Temp hack to support vectorized queries for hybrid search
-        if isinstance(query, str):
-            return query, query
-        elif isinstance(query, tuple):
-            if len(query) != 2:
-                raise ValueError(
-                    "The query must be a tuple of (vector_query, fts_query)."
-                )
-            if not isinstance(query[0], (list, np.ndarray, pa.Array, pa.ChunkedArray)):
-                raise ValueError(f"The vector query must be one of {VEC}.")
-            if not isinstance(query[1], str):
-                raise ValueError("The fts query must be a string.")
-            return query[0], query[1]
-        else:
+    def _validate_query(self, query, vector=None, text=None):
+        if query is not None and (vector is not None or text is not None):
             raise ValueError(
-                "The query must be either a string or a tuple of (vector, string)."
+                "You can either provide a string query in search() method"
+                "or set `vector()` and `text()` explicitly for hybrid search."
+                "But not both."
             )

+        vector_query = vector if vector is not None else query
+        if not isinstance(vector_query, (str, list, np.ndarray)):
+            raise ValueError("Vector query must be either a string or a vector")
+
+        text_query = text or query
+        if text_query is None:
+            raise ValueError("Text query must be provided for hybrid search.")
+        if not isinstance(text_query, str):
+            raise ValueError("Text query must be a string")
+
+        return vector_query, text_query
+
+    def to_arrow(self) -> pa.Table:
+        vector_query, fts_query = self._validate_query(
+            self._query, self._vector, self._text
+        )
+        self._fts_query = LanceFtsQueryBuilder(
+            self._table, fts_query, fts_columns=self._fts_columns
+        )
+        vector_query = self._query_to_vector(
+            self._table, vector_query, self._vector_column
+        )
+        self._vector_query = LanceVectorQueryBuilder(
+            self._table, vector_query, self._vector_column
+        )
+
+        if self._limit:
+            self._vector_query.limit(self._limit)
+            self._fts_query.limit(self._limit)
+        if self._columns:
+            self._vector_query.select(self._columns)
+            self._fts_query.select(self._columns)
+        if self._where:
+            self._vector_query.where(self._where, self._prefilter)
+            self._fts_query.where(self._where, self._prefilter)
+        if self._with_row_id:
+            self._vector_query.with_row_id(True)
+            self._fts_query.with_row_id(True)
+        if self._nprobes:
+            self._vector_query.nprobes(self._nprobes)
+        if self._refine_factor:
+            self._vector_query.refine_factor(self._refine_factor)
+
         with ThreadPoolExecutor() as executor:
             fts_future = executor.submit(self._fts_query.with_row_id(True).to_arrow)
             vector_future = executor.submit(
@@ -1034,87 +1095,6 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):

         return self

-    def limit(self, limit: int) -> LanceHybridQueryBuilder:
-        """
-        Set the maximum number of results to return for both vector and fts search
-        components.
-
-        Parameters
-        ----------
-        limit: int
-            The maximum number of results to return.
-
-        Returns
-        -------
-        LanceHybridQueryBuilder
-            The LanceHybridQueryBuilder object.
-        """
-        self._vector_query.limit(limit)
-        self._fts_query.limit(limit)
-        self._limit = limit
-
-        return self
-
-    def select(self, columns: list) -> LanceHybridQueryBuilder:
-        """
-        Set the columns to return for both vector and fts search.
-
-        Parameters
-        ----------
-        columns: list
-            The columns to return.
-
-        Returns
-        -------
-        LanceHybridQueryBuilder
-            The LanceHybridQueryBuilder object.
-        """
-        self._vector_query.select(columns)
-        self._fts_query.select(columns)
-        return self
-
-    def where(self, where: str, prefilter: bool = False) -> LanceHybridQueryBuilder:
-        """
-        Set the where clause for both vector and fts search.
-
-        Parameters
-        ----------
-        where: str
-            The where clause which is a valid SQL where clause. See
-            `Lance filter pushdown `_
-            for valid SQL expressions.
-
-        prefilter: bool, default False
-            If True, apply the filter before vector search, otherwise the
-            filter is applied on the result of vector search.
-
-        Returns
-        -------
-        LanceHybridQueryBuilder
-            The LanceHybridQueryBuilder object.
-        """
-
-        self._vector_query.where(where, prefilter=prefilter)
-        self._fts_query.where(where)
-        return self
-
-    def metric(self, metric: Literal["L2", "cosine"]) -> LanceHybridQueryBuilder:
-        """
-        Set the distance metric to use for vector search.
-
-        Parameters
-        ----------
-        metric: "L2" or "cosine"
-            The distance metric to use. By default "L2" is used.
-
-        Returns
-        -------
-        LanceHybridQueryBuilder
-            The LanceHybridQueryBuilder object.
-        """
-        self._vector_query.metric(metric)
-        return self
-
     def nprobes(self, nprobes: int) -> LanceHybridQueryBuilder:
         """
         Set the number of probes to use for vector search.

         Higher values will yield better recall (more likely to find vectors if
         they exist) at the expense of latency.

         Returns
         -------
         LanceHybridQueryBuilder
             The LanceHybridQueryBuilder object.
         """
-        self._vector_query.nprobes(nprobes)
+        self._nprobes = nprobes
         return self

     def refine_factor(self, refine_factor: int) -> LanceHybridQueryBuilder:
         """
         Refine the vector search results by reading extra elements and
         re-ranking them in memory.

         Returns
         -------
         LanceHybridQueryBuilder
             The LanceHybridQueryBuilder object.
""" - self._vector_query.refine_factor(refine_factor) + self._refine_factor = refine_factor + return self + + def vector(self, vector: Union[np.ndarray, list]) -> LanceHybridQueryBuilder: + self._vector = vector + return self + + def text(self, text: str) -> LanceHybridQueryBuilder: + self._text = text return self diff --git a/python/python/tests/test_rerankers.py b/python/python/tests/test_rerankers.py index fca0850c..23132066 100644 --- a/python/python/tests/test_rerankers.py +++ b/python/python/tests/test_rerankers.py @@ -111,7 +111,9 @@ def _run_test_reranker(reranker, table, query, query_vector, schema): query_vector = table.to_pandas()["vector"][0] result = ( - table.search((query_vector, query), vector_column_name="vector") + table.search(query_type="hybrid", vector_column_name="vector") + .vector(query_vector) + .text(query) .limit(30) .rerank(reranker=reranker) .to_arrow() @@ -207,14 +209,26 @@ def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy): query = "Our father who art in heaven" query_vector = table.to_pandas()["vector"][0] result = ( - table.search((query_vector, query), vector_column_name="vector") + table.search(query_type="hybrid", vector_column_name="vector") + .vector(query_vector) + .text(query) .limit(30) .rerank(normalize="score") .to_arrow() ) - assert len(result) == 30 + # Fail if both query and (vector or text) are provided + with pytest.raises(ValueError): + table.search(query, query_type="hybrid", vector_column_name="vector").vector( + query_vector + ).to_arrow() + + with pytest.raises(ValueError): + table.search(query, query_type="hybrid", vector_column_name="vector").text( + query + ).to_arrow() + assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), ( "The _relevance_score column of the results returned by the reranker " "represents the relevance of the result to the query & should " From 38015ffa7cd533954f11e1cd6fcf7b3d820c5b4e Mon Sep 17 00:00:00 2001 From: Rithik Kumar <46047011+rithikJha@users.noreply.github.com> Date: Sat, 31 Aug 2024 03:48:11 +0530 Subject: [PATCH 31/34] docs: improve overall language on all example pages (#1582) Refine and improve the language clarity and quality across all example pages in the documentation to ensure better understanding and readability. 
--------- Co-authored-by: Ayush Chaurasia --- docs/src/examples/examples_python.md | 18 ++++++------ docs/src/examples/python_examples/aiagent.md | 10 +++---- .../python_examples/build_from_scratch.md | 4 +-- docs/src/examples/python_examples/chatbot.md | 16 +++++------ .../examples/python_examples/evaluations.md | 8 ++---- .../examples/python_examples/multimodal.md | 10 +++---- docs/src/examples/python_examples/rag.md | 11 ++++---- .../python_examples/recommendersystem.md | 6 ++-- .../examples/python_examples/vector_search.md | 28 +++++++++---------- 9 files changed, 54 insertions(+), 57 deletions(-) diff --git a/docs/src/examples/examples_python.md b/docs/src/examples/examples_python.md index 2c7d17d6..6ffe972f 100644 --- a/docs/src/examples/examples_python.md +++ b/docs/src/examples/examples_python.md @@ -10,13 +10,13 @@ Explore applied examples available as Colab notebooks or Python scripts to integ | Explore | Description | |----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [Build from Scratch with LanceDB πŸ› οΈπŸš€](python_examples/build_from_scratch.md) | Start building your GenAI applications from the ground up using LanceDB's efficient vector-based document retrieval capabilities! Get started quickly with a solid foundation. | -| [Multimodal Search with LanceDB πŸ€Ήβ€β™‚οΈπŸ”](python_examples/multimodal.md) | Combine text and image queries to find the most relevant results using LanceDB’s multimodal capabilities. Leverage the efficient vector-based similarity search. | -| [RAG: Revolutionize Information Retrieval with LanceDB πŸ”“πŸ§](python_examples/rag.md) | Build RAG (Retrieval-Augmented Generation) with LanceDB for efficient vector-based information retrieval and more accurate responses from AI. | -| [Vector Search: Unlock Efficient Document Retrieval πŸ”“πŸ‘€](python_examples/vector_search.md) | Use LanceDB's vector search capabilities to perform efficient and accurate similarity searches, enabling rapid discovery and retrieval of relevant documents in Large datasets. | -| [Chatbot Application with LanceDB πŸ€–](python_examples/chatbot.md) | Create chatbots that retrieves relevant context for coherent and context-aware replies, enhancing user experience through advanced conversational AI. | -| [Evaluation: Assessing Text Performance with Precision πŸ“ŠπŸ’‘](python_examples/evaluations.md) | Develop evaluation applications that allows you to input reference and candidate texts to measure their performance across various metrics. | -| [AI Agents: Intelligent Collaboration πŸ€–](python_examples/aiagent.md) | Enable AI agents to communicate and collaborate efficiently through dense vector representations, achieving shared goals seamlessly. | -| [Recommender Systems: Personalized Discovery πŸΏπŸ“Ί](python_examples/recommendersystem.md) | Deliver personalized experiences by efficiently storing and querying item embeddings with LanceDB's powerful vector database capabilities. | -| **Miscellaneous Examples🌟** | Find other unique examples and creative solutions using LanceDB, showcasing the flexibility and broad applicability of the platform. | +| [**Build from Scratch with LanceDB** πŸ› οΈπŸš€](python_examples/build_from_scratch.md) | Start building your **GenAI applications** from the **ground up** using **LanceDB's** efficient vector-based document retrieval capabilities! 
Get started quickly with a solid foundation. |
+| [**Multimodal Search with LanceDB** πŸ€Ήβ€β™‚οΈπŸ”](python_examples/multimodal.md) | Combine **text** and **image queries** to find the most relevant results using **LanceDB’s multimodal** capabilities. Leverage the efficient vector-based similarity search. |
+| [**RAG (Retrieval-Augmented Generation) with LanceDB** πŸ”“πŸ§](python_examples/rag.md) | Build RAG (Retrieval-Augmented Generation) with **LanceDB** for efficient **vector-based information retrieval** and more accurate responses from AI. |
+| [**Vector Search: Efficient Retrieval** πŸ”“πŸ‘€](python_examples/vector_search.md) | Use **LanceDB's** vector search capabilities to perform efficient and accurate **similarity searches**, enabling rapid discovery and retrieval of relevant documents in large datasets. |
+| [**Chatbot applications with LanceDB** πŸ€–](python_examples/chatbot.md) | Create **chatbots** that retrieve relevant context for **coherent and context-aware replies**, enhancing user experience through advanced conversational AI. |
+| [**Evaluation: Assessing Text Performance with Precision** πŸ“ŠπŸ’‘](python_examples/evaluations.md) | Develop **evaluation** applications that allow you to input reference and candidate texts to **measure** their performance across various metrics. |
+| [**AI Agents: Intelligent Collaboration** πŸ€–](python_examples/aiagent.md) | Enable **AI agents** to communicate and collaborate efficiently through dense vector representations, achieving shared goals seamlessly. |
+| [**Recommender Systems: Personalized Discovery** πŸΏπŸ“Ί](python_examples/recommendersystem.md) | Deliver **personalized experiences** by efficiently storing and querying item embeddings with **LanceDB's** powerful vector database capabilities. |
+| **Miscellaneous Examples🌟** | Find other **unique examples** and **creative solutions** using **LanceDB**, showcasing the flexibility and broad applicability of the platform. |
 
 
diff --git a/docs/src/examples/python_examples/aiagent.md b/docs/src/examples/python_examples/aiagent.md
index 12b624ae..bcb2eb20 100644
--- a/docs/src/examples/python_examples/aiagent.md
+++ b/docs/src/examples/python_examples/aiagent.md
@@ -1,15 +1,15 @@
 # AI Agents: Intelligent CollaborationπŸ€–
 
-Think of a platformπŸ’» where AI AgentsπŸ€– can seamlessly exchange information, coordinate over tasks, and achieve shared targets with great efficiencyπŸ“ˆπŸš€.
+Think of a platform where AI Agents can seamlessly exchange information, coordinate over tasks, and achieve shared targets with great efficiencyπŸ’»πŸ“ˆ.
 
 ## Vector-Based Coordination: The Technical Advantage
 
-Leveraging LanceDB's vector-based capabilities, our coordination application enables AI agents to communicate and collaborate through dense vector representations πŸ€–. AI agents can exchange information, coordinate on a task or work towards a common goal, just by giving queriesπŸ“.
+Leveraging LanceDB's vector-based capabilities, we can enable **AI agents πŸ€–** to communicate and collaborate through dense vector representations. AI agents can exchange information, coordinate on a task or work towards a common goal, just by giving queriesπŸ“.
 
 | **AI Agents** | **Description** | **Links** |
 |:--------------|:----------------|:----------|
-| **AI Agents: Reducing HallucinationtπŸ“Š** | πŸ€–πŸ’‘ Reduce AI hallucinations using Critique-Based Contexting!
Learn by Simplifying and Automating tedious workflows by going through fitness trainer agent example.πŸ’ͺ | [![Github](../../assets/github.svg)][hullucination_github]
[![Open In Collab](../../assets/colab.svg)][hullucination_colab]
[![Python](../../assets/python.svg)][hullucination_python]
[![Ghost](../../assets/ghost.svg)][hullucination_ghost] | -| **AI Trends Searcher: CrewAIπŸ”οΈ** | πŸ”οΈ Learn about CrewAI Agents ! Utilize the features of CrewAI - Role-based Agents, Task Management, and Inter-agent Delegation ! Make AI agents work together to do tricky stuff 😺| [![Github](../../assets/github.svg)][trend_github]
[![Open In Collab](../../assets/colab.svg)][trend_colab]
[![Ghost](../../assets/ghost.svg)][trend_ghost] | -| **SuperAgent AutogenπŸ€–** | πŸ’» AI interactions with the Super Agent! Integrating Autogen, LanceDB, LangChain, LiteLLM, and Ollama to create AI agent that excels in understanding and processing complex queries.πŸ€– | [![Github](../../assets/github.svg)][superagent_github]
[![Open In Collab](../../assets/colab.svg)][superagent_colab] |
+| **AI Agents: Reducing HallucinationπŸ“Š** | πŸ€–πŸ’‘ **Reduce AI hallucinations** using Critique-Based Contexting! Learn by simplifying and automating tedious workflows through a fitness trainer agent example. πŸ’ͺ | [![Github](../../assets/github.svg)][hullucination_github]
[![Open In Collab](../../assets/colab.svg)][hullucination_colab]
[![Python](../../assets/python.svg)][hullucination_python]
[![Ghost](../../assets/ghost.svg)][hullucination_ghost] |
+| **AI Trends Searcher: CrewAIπŸ”οΈ** | πŸ”οΈ Learn about **CrewAI Agents**! Utilize the features of CrewAI - Role-based Agents, Task Management, and Inter-agent Delegation! Make AI agents work together to do tricky stuff 😺 | [![Github](../../assets/github.svg)][trend_github]
[![Open In Collab](../../assets/colab.svg)][trend_colab]
[![Ghost](../../assets/ghost.svg)][trend_ghost] |
+| **SuperAgent AutogenπŸ€–** | πŸ’» AI interactions with the Super Agent! Integrating **Autogen**, **LanceDB**, **LangChain**, **LiteLLM**, and **Ollama** to create an AI agent that excels in understanding and processing complex queries. πŸ€– | [![Github](../../assets/github.svg)][superagent_github]
[![Open In Collab](../../assets/colab.svg)][superagent_colab] |


[hullucination_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/reducing_hallucinations_ai_agents
diff --git a/docs/src/examples/python_examples/build_from_scratch.md b/docs/src/examples/python_examples/build_from_scratch.md
index 65e21af4..7019a810 100644
--- a/docs/src/examples/python_examples/build_from_scratch.md
+++ b/docs/src/examples/python_examples/build_from_scratch.md
@@ -1,10 +1,10 @@
 # **Build from Scratch with LanceDB πŸ› οΈπŸš€**
 
-Start building your GenAI applications from the ground up using LanceDB's efficient vector-based document retrieval capabilities! πŸ“‘
+Start building your GenAI applications from the ground up using **LanceDB's** efficient vector-based document retrieval capabilities! πŸ“‘
 
 **Get Started in Minutes ⏱️**
 
-These examples provide a solid foundation for building your own GenAI applications using LanceDB. Jump from idea to proof of concept quickly with applied examples. Get started and see what you can create! πŸ’»
+These examples provide a solid foundation for building your own GenAI applications using LanceDB. Jump from idea to **proof of concept** quickly with applied examples. Get started and see what you can create! πŸ’»
 
 | **Build From Scratch** | **Description** | **Links** |
 |:-------------------------------------------|:-------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
diff --git a/docs/src/examples/python_examples/chatbot.md b/docs/src/examples/python_examples/chatbot.md
index a16848a6..6d1e59cd 100644
--- a/docs/src/examples/python_examples/chatbot.md
+++ b/docs/src/examples/python_examples/chatbot.md
@@ -1,7 +1,7 @@
-**Chatbot Application with LanceDB πŸ€–**
+**Chatbot applications with LanceDB πŸ€–**
 ====================================================================
 
- Create an innovative chatbot application that utilizes LanceDB for efficient vector-based response generation! 🌐✨
+ Create innovative chatbot applications that utilize LanceDB for efficient vector-based response generation! 🌐✨
 
 **Introduction πŸ‘‹βœ¨**
 
@@ -10,12 +10,12 @@
 
 | **Chatbot** | **Description** | **Links** |
 |:----------------|:-----------------|:-----------|
-| **Databricks DBRX Website Bot ⚑️** | Unlock magical conversations with the Hogwarts chatbot, powered by Open-source RAG, DBRX, LanceDB, LLama-index, and Hugging Face Embeddings, delivering enchanting user experiences and spellbinding interactions ✨ | [![GitHub](../../assets/github.svg)][databricks_github]
[![Python](../../assets/python.svg)][databricks_python] | -| **CLI SDK Manual Chatbot Locally πŸ’»** | CLI chatbot for SDK/hardware documents, powered by Local RAG, LLama3, Ollama, LanceDB, and Openhermes Embeddings, built with Phidata Assistant and Knowledge Base for instant technical support πŸ€– | [![GitHub](../../assets/github.svg)][clisdk_github]
[![Python](../../assets/python.svg)][clisdk_python] | -| **Youtube Transcript Search QA Bot πŸ“Ή** | Unlock the power of YouTube transcripts with a Q&A bot, leveraging natural language search and LanceDB for effortless data management and instant answers πŸ’¬ | [![GitHub](../../assets/github.svg)][youtube_github]
[![Open In Collab](../../assets/colab.svg)][youtube_colab]
[![Python](../../assets/python.svg)][youtube_python] | -| **Code Documentation Q&A Bot with LangChain πŸ€–** | Revolutionize code documentation with a Q&A bot, powered by LangChain and LanceDB, allowing effortless querying of documentation using natural language, demonstrated with Numpy 1.26 docs πŸ“š | [![GitHub](../../assets/github.svg)][docs_github]
[![Open In Collab](../../assets/colab.svg)][docs_colab]
[![Python](../../assets/python.svg)][docs_python] | -| **Context-aware Chatbot using Llama 2 & LanceDB πŸ€–** | Experience the future of conversational AI with a context-aware chatbot, powered by Llama 2, LanceDB, and LangChain, enabling intuitive and meaningful conversations with your data πŸ“šπŸ’¬ | [![GitHub](../../assets/github.svg)][aware_github]
[![Open In Collab](../../assets/colab.svg)][aware_colab]
[![Ghost](../../assets/ghost.svg)][aware_ghost] | -| **Chat with csv using Hybrid Search πŸ“Š** | Revolutionize data interaction with a chat application that harnesses LanceDB's hybrid search capabilities to converse with CSV and Excel files, enabling efficient and scalable data exploration and analysis πŸš€ | [![GitHub](../../assets/github.svg)][csv_github]
[![Open In Collab](../../assets/colab.svg)][csv_colab]
[![Ghost](../../assets/ghost.svg)][csv_ghost] |
+| **Databricks DBRX Website Bot ⚑️** | Engage with the **Hogwarts chatbot**, which uses Open-source RAG with **DBRX**, **LanceDB** and **LLama-index with Hugging Face Embeddings** to provide interactive and engaging user experiences. ✨ | [![GitHub](../../assets/github.svg)][databricks_github]
[![Python](../../assets/python.svg)][databricks_python] | +| **CLI SDK Manual Chatbot Locally πŸ’»** | CLI chatbot for SDK/hardware documents using **Local RAG** with **LLama3**, **Ollama**, **LanceDB**, and **Openhermes Embeddings**, built with **Phidata** Assistant and Knowledge Base πŸ€– | [![GitHub](../../assets/github.svg)][clisdk_github]
[![Python](../../assets/python.svg)][clisdk_python] | +| **Youtube Transcript Search QA Bot πŸ“Ή** | Search through **youtube transcripts** using natural language with a Q&A bot, leveraging **LanceDB** for effortless data storage and management πŸ’¬ | [![GitHub](../../assets/github.svg)][youtube_github]
[![Open In Collab](../../assets/colab.svg)][youtube_colab]
[![Python](../../assets/python.svg)][youtube_python] | +| **Code Documentation Q&A Bot with LangChain πŸ€–** | Query your own documentation easily using questions in natural language with a Q&A bot, powered by **LangChain** and **LanceDB**, demonstrated with **Numpy 1.26 docs** πŸ“š | [![GitHub](../../assets/github.svg)][docs_github]
[![Open In Collab](../../assets/colab.svg)][docs_colab]
[![Python](../../assets/python.svg)][docs_python] | +| **Context-aware Chatbot using Llama 2 & LanceDB πŸ€–** | Build **conversational AI** with a **context-aware chatbot**, powered by **Llama 2**, **LanceDB**, and **LangChain**, that enables intuitive and meaningful conversations with your data πŸ“šπŸ’¬ | [![GitHub](../../assets/github.svg)][aware_github]
[![Open In Collab](../../assets/colab.svg)][aware_colab]
[![Ghost](../../assets/ghost.svg)][aware_ghost] | +| **Chat with csv using Hybrid Search πŸ“Š** | **Chat** application that interacts with **CSV** and **Excel files** using **LanceDB’s** hybrid search capabilities, performing direct operations on large-scale columnar data efficiently πŸš€ | [![GitHub](../../assets/github.svg)][csv_github]
[![Open In Collab](../../assets/colab.svg)][csv_colab]
[![Ghost](../../assets/ghost.svg)][csv_ghost] |


[databricks_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/databricks_DBRX_website_bot
diff --git a/docs/src/examples/python_examples/evaluations.md b/docs/src/examples/python_examples/evaluations.md
index 9ee1a10a..18d0ccba 100644
--- a/docs/src/examples/python_examples/evaluations.md
+++ b/docs/src/examples/python_examples/evaluations.md
@@ -1,18 +1,16 @@
 **Evaluation: Assessing Text Performance with Precision πŸ“ŠπŸ’‘**
 ====================================================================
 
-**Evaluation Fundamentals πŸ“Š**
-
 Evaluation is a comprehensive tool designed to measure the performance of text-based inputs, enabling data-driven optimization and improvement πŸ“ˆ.
 
 **Text Evaluation 101 πŸ“š**
 
-By leveraging cutting-edge technologies, this provides a robust framework for evaluating reference and candidate texts across various metrics πŸ“Š, ensuring high-quality text outputs that meet specific requirements and standards πŸ“.
+Using a robust framework for assessing reference and candidate texts across various metrics πŸ“Š, ensure that the text outputs are high-quality and meet specific requirements and standards πŸ“.
 
 
 | **Evaluation** | **Description** | **Links** |
 | -------------- | --------------- | --------- |
-| **Evaluating Prompts with Prompttools πŸ€–** | Compare, visualize & evaluate embedding functions (incl. OpenAI) across metrics like latency & custom evaluation πŸ“ˆπŸ“Š | [![Github](../../assets/github.svg)][prompttools_github]
[![Open In Collab](../../assets/colab.svg)][prompttools_colab] | -| **Evaluating RAG with RAGAs and GPT-4o πŸ“Š** | Evaluate RAG pipelines with cutting-edge metrics and tools, integrate with CI/CD for continuous performance checks, and generate responses with GPT-4o πŸ€–πŸ“ˆ | [![Github](../../assets/github.svg)][RAGAs_github]
[![Open In Collab](../../assets/colab.svg)][RAGAs_colab] | +| **Evaluating Prompts with Prompttools πŸ€–** | Compare, visualize & evaluate **embedding functions** (incl. OpenAI) across metrics like latency & custom evaluation πŸ“ˆπŸ“Š | [![Github](../../assets/github.svg)][prompttools_github]
[![Open In Collab](../../assets/colab.svg)][prompttools_colab] | +| **Evaluating RAG with RAGAs and GPT-4o πŸ“Š** | Evaluate **RAG pipelines** with cutting-edge metrics and tools, integrate with CI/CD for continuous performance checks, and generate responses with GPT-4o πŸ€–πŸ“ˆ | [![Github](../../assets/github.svg)][RAGAs_github]
[![Open In Collab](../../assets/colab.svg)][RAGAs_colab] | diff --git a/docs/src/examples/python_examples/multimodal.md b/docs/src/examples/python_examples/multimodal.md index 28ddce00..08c561c6 100644 --- a/docs/src/examples/python_examples/multimodal.md +++ b/docs/src/examples/python_examples/multimodal.md @@ -1,6 +1,6 @@ # **Multimodal Search with LanceDB πŸ€Ήβ€β™‚οΈπŸ”** -Experience the future of search with LanceDB's multimodal capabilities. Combine text and image queries to find the most relevant results in your corpus ! πŸ”“πŸ’‘ +Using LanceDB's multimodal capabilities, combine text and image queries to find the most relevant results in your corpus ! πŸ”“πŸ’‘ **Explore the Future of Search πŸš€** @@ -10,10 +10,10 @@ LanceDB supports multimodal search by indexing and querying vector representatio | **Multimodal** | **Description** | **Links** | |:----------------|:-----------------|:-----------| -| **Multimodal CLIP: DiffusionDB 🌐πŸ’₯** | Revolutionize search with Multimodal CLIP and DiffusionDB, combining text and image understanding for a new dimension of discovery! πŸ”“ | [![GitHub](../../assets/github.svg)][Clip_diffusionDB_github]
[![Open In Collab](../../assets/colab.svg)][Clip_diffusionDB_colab]
[![Python](../../assets/python.svg)][Clip_diffusionDB_python]
[![Ghost](../../assets/ghost.svg)][Clip_diffusionDB_ghost] | -| **Multimodal CLIP: Youtube Videos πŸ“ΉπŸ‘€** | Search Youtube videos using Multimodal CLIP, finding relevant content with ease and accuracy! 🎯 | [![Github](../../assets/github.svg)][Clip_youtube_github]
[![Open In Collab](../../assets/colab.svg)][Clip_youtube_colab]
[![Python](../../assets/python.svg)][Clip_youtube_python]
[![Ghost](../../assets/ghost.svg)][Clip_youtube_python] | -| **Multimodal Image + Text Search πŸ“ΈπŸ”** | Discover relevant documents and images with a single query, using LanceDB's multimodal search capabilities to bridge the gap between text and visuals! πŸŒ‰ | [![GitHub](../../assets/github.svg)](https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_search)
[![Open In Collab](../../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/multimodal_search/main.ipynb)
[![Python](../../assets/python.svg)](https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_search/main.py)
[![Ghost](../../assets/ghost.svg)](https://blog.lancedb.com/multi-modal-ai-made-easy-with-lancedb-clip-5aaf8801c939/) | -| **Cambrian-1: Vision-Centric Image Exploration πŸ”πŸ‘€** | Dive into vision-centric exploration of images with Cambrian-1, powered by LanceDB's multimodal search to uncover new insights! πŸ”Ž | [![Kaggle](https://img.shields.io/badge/Kaggle-035a7d?style=for-the-badge&logo=kaggle&logoColor=white)](https://www.kaggle.com/code/prasantdixit/cambrian-1-vision-centric-exploration-of-images/)
[![Ghost](../../assets/ghost.svg)](https://blog.lancedb.com/cambrian-1-vision-centric-exploration/) |
+| **Multimodal CLIP: DiffusionDB 🌐πŸ’₯** | Multi-modal search with **CLIP** and **LanceDB**, using **DiffusionDB** data for combined text and image understanding! πŸ”“ | [![GitHub](../../assets/github.svg)][Clip_diffusionDB_github]
[![Open In Collab](../../assets/colab.svg)][Clip_diffusionDB_colab]
[![Python](../../assets/python.svg)][Clip_diffusionDB_python]
[![Ghost](../../assets/ghost.svg)][Clip_diffusionDB_ghost] | +| **Multimodal CLIP: Youtube Videos πŸ“ΉπŸ‘€** | Search **Youtube videos** using Multimodal CLIP, finding relevant content with ease and accuracy! 🎯 | [![Github](../../assets/github.svg)][Clip_youtube_github]
[![Open In Collab](../../assets/colab.svg)][Clip_youtube_colab]
[![Python](../../assets/python.svg)][Clip_youtube_python]
[![Ghost](../../assets/ghost.svg)][Clip_youtube_python] |
+| **Multimodal Image + Text Search πŸ“ΈπŸ”** | Find **relevant documents** and **images** with a single query using **LanceDB's** multimodal search capabilities to seamlessly integrate text and visuals! πŸŒ‰ | [![GitHub](../../assets/github.svg)](https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_search)
[![Open In Collab](../../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/multimodal_search/main.ipynb)
[![Python](../../assets/python.svg)](https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_search/main.py)
[![Ghost](../../assets/ghost.svg)](https://blog.lancedb.com/multi-modal-ai-made-easy-with-lancedb-clip-5aaf8801c939/) |
+| **Cambrian-1: Vision-Centric Image Exploration πŸ”πŸ‘€** | Learn how **Cambrian-1** works, using an example of **Vision-Centric** exploration on images found through vector search! Works on the **Flickr-8k** dataset πŸ”Ž | [![Kaggle](https://img.shields.io/badge/Kaggle-035a7d?style=for-the-badge&logo=kaggle&logoColor=white)](https://www.kaggle.com/code/prasantdixit/cambrian-1-vision-centric-exploration-of-images/)
[![Ghost](../../assets/ghost.svg)](https://blog.lancedb.com/cambrian-1-vision-centric-exploration/) | [Clip_diffusionDB_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_clip_diffusiondb diff --git a/docs/src/examples/python_examples/rag.md b/docs/src/examples/python_examples/rag.md index 3d9f89fa..a6db3a68 100644 --- a/docs/src/examples/python_examples/rag.md +++ b/docs/src/examples/python_examples/rag.md @@ -1,5 +1,4 @@ - -**RAG: Revolutionize Information Retrieval with LanceDB πŸ”“πŸ§** +**RAG (Retrieval-Augmented Generation) with LanceDB πŸ”“πŸ§** ==================================================================== Build RAG (Retrieval-Augmented Generation) with LanceDB, a powerful solution for efficient vector-based information retrieval πŸ“Š. @@ -18,10 +17,10 @@ Build RAG (Retrieval-Augmented Generation) with LanceDB, a powerful solution fo | **Advanced RAG: Parent Document Retriever** πŸ“‘πŸ”— | Use **Parent Document & Bigger Chunk Retriever** to maintain context and relevance when generating related content. πŸŽ΅πŸ“„ | [![Github](../../assets/github.svg)][parent_doc_retriever_github]
[![Open In Collab](../../assets/colab.svg)][parent_doc_retriever_colab]
[![Ghost](../../assets/ghost.svg)][parent_doc_retriever_ghost] | | **Corrective RAG with Langgraph** πŸ”§πŸ“Š | Enhance RAG reliability with **Corrective RAG (CRAG)** by self-reflecting and fact-checking for accurate and trustworthy results. βœ…πŸ” |[![Github](../../assets/github.svg)][corrective_rag_github]
[![Open In Collab](../../assets/colab.svg)][corrective_rag_colab]
[![Ghost](../../assets/ghost.svg)][corrective_rag_ghost] | | **Contextual Compression with RAG** πŸ—œοΈπŸ§  | Apply **contextual compression techniques** to condense large documents while retaining essential information. πŸ“„πŸ—œοΈ | [![Github](../../assets/github.svg)][compression_rag_github]
[![Open In Collab](../../assets/colab.svg)][compression_rag_colab]
[![Ghost](../../assets/ghost.svg)][compression_rag_ghost] | -| **Improve RAG with FLARE** πŸ”₯| Enable users to ask questions directly to academic papers, focusing on ArXiv papers, with Forward-Looking Active REtrieval augmented generation.πŸš€πŸŒŸ | [![Github](../../assets/github.svg)][flare_github]
[![Open In Collab](../../assets/colab.svg)][flare_colab]
[![Ghost](../../assets/ghost.svg)][flare_ghost] | -| **Query Expansion and Reranker** πŸ”πŸ”„ | Enhance RAG with query expansion using Large Language Models and advanced **reranking methods** like Cross Encoders, ColBERT v2, and FlashRank for improved document retrieval precision and recall πŸ”πŸ“ˆ | [![Github](../../assets/github.svg)][query_github]
[![Open In Collab](../../assets/colab.svg)][query_colab] | -| **RAG Fusion** ⚑🌐 | Revolutionize search with RAG Fusion, utilizing the **RRF algorithm** to rerank documents based on user queries, and leveraging LanceDB and OPENAI Embeddings for efficient information retrieval ⚑🌐 | [![Github](../../assets/github.svg)][fusion_github]
[![Open In Collab](../../assets/colab.svg)][fusion_colab] | -| **Agentic RAG** πŸ€–πŸ“š | Unlock autonomous information retrieval with **Agentic RAG**, a framework of **intelligent agents** that collaborate to synthesize, summarize, and compare data across sources, enabling proactive and informed decision-making πŸ€–πŸ“š | [![Github](../../assets/github.svg)][agentic_github]
[![Open In Collab](../../assets/colab.svg)][agentic_colab] |
+| **Improve RAG with FLARE** πŸ”₯ | Enable users to ask questions directly to **academic papers**, focusing on **ArXiv papers**, with **F**orward-**L**ooking **A**ctive **RE**trieval augmented generation. πŸš€πŸŒŸ | [![Github](../../assets/github.svg)][flare_github]
[![Open In Collab](../../assets/colab.svg)][flare_colab]
[![Ghost](../../assets/ghost.svg)][flare_ghost] | +| **Query Expansion and Reranker** πŸ”πŸ”„ | Enhance RAG with query expansion using Large Language Models and advanced **reranking methods** like **Cross Encoders**, **ColBERT v2**, and **FlashRank** for improved document retrieval precision and recall πŸ”πŸ“ˆ | [![Github](../../assets/github.svg)][query_github]
[![Open In Collab](../../assets/colab.svg)][query_colab] |
+| **RAG Fusion** ⚑🌐 | Build RAG Fusion, utilizing the **RRF algorithm** to rerank documents based on user queries! Use **LanceDB** as the vector database to store and retrieve documents related to queries via **OpenAI Embeddings** ⚑🌐 | [![Github](../../assets/github.svg)][fusion_github]
[![Open In Collab](../../assets/colab.svg)][fusion_colab] | +| **Agentic RAG** πŸ€–πŸ“š | Build autonomous information retrieval with **Agentic RAG**, a framework of **intelligent agents** that collaborate to synthesize, summarize, and compare data across sources, that enables proactive and informed decision-making πŸ€–πŸ“š | [![Github](../../assets/github.svg)][agentic_github]
[![Open In Collab](../../assets/colab.svg)][agentic_colab] | diff --git a/docs/src/examples/python_examples/recommendersystem.md b/docs/src/examples/python_examples/recommendersystem.md index ab7e4064..12ce7780 100644 --- a/docs/src/examples/python_examples/recommendersystem.md +++ b/docs/src/examples/python_examples/recommendersystem.md @@ -9,10 +9,10 @@ Deliver personalized experiences with Recommender Systems. 🎁 | **Recommender System** | **Description** | **Links** | | ---------------------- | --------------- | --------- | | **Movie Recommender System🎬** | 🀝 Use **collaborative filtering** to predict user preferences, assuming similar users will like similar movies, and leverage **Singular Value Decomposition** (SVD) from Numpy for precise matrix factorization and accurate recommendationsπŸ“Š | [![Github](../../assets/github.svg)][movie_github]
[![Open In Collab](../../assets/colab.svg)][movie_colab]
[![Python](../../assets/python.svg)][movie_python] | -| **πŸŽ₯ Movie Recommendation with Genres** | πŸ” Creates movie embeddings using Doc2Vec, capturing genre and characteristic nuances, and leverages VectorDB for efficient storage and querying, enabling accurate genre classification and personalized movie recommendations through similarity searchesπŸŽ₯ | [![Github](../../assets/github.svg)][genre_github]
[![Open In Collab](../../assets/colab.svg)][genre_colab]
[![Ghost](../../assets/ghost.svg)][genre_ghost] | +| **πŸŽ₯ Movie Recommendation with Genres** | πŸ” Creates movie embeddings using **Doc2Vec**, capturing genre and characteristic nuances, and leverages VectorDB for efficient storage and querying, enabling accurate genre classification and personalized movie recommendations through **similarity searches**πŸŽ₯ | [![Github](../../assets/github.svg)][genre_github]
[![Open In Collab](../../assets/colab.svg)][genre_colab]
[![Ghost](../../assets/ghost.svg)][genre_ghost] | | **πŸ›οΈ Product Recommender using Collaborative Filtering and LanceDB** | πŸ“ˆ Using **Collaborative Filtering** and **LanceDB** to analyze your past purchases, recommends products based on user's past purchases. Demonstrated with the Instacart dataset in our exampleπŸ›’ | [![Github](../../assets/github.svg)][product_github]
[![Open In Collab](../../assets/colab.svg)][product_colab]
[![Python](../../assets/python.svg)][product_python] | -| **πŸ” Arxiv Search with OpenCLIP and LanceDB** | πŸ’‘ Build a semantic search engine for Arxiv papers using LanceDB, and benchmarks its performance against traditional keyword-based search on Nomic's Atlas, to demonstrate the power of semantic search in finding relevant research papersπŸ“š | [![Github](../../assets/github.svg)][arxiv_github]
[![Open In Collab](../../assets/colab.svg)][arxiv_colab]
[![Python](../../assets/python.svg)][arxiv_python] | -| **Food Recommendation System🍴** | πŸ” Build a food recommendation system with LanceDB, featuring vector-based recommendations, full-text search, hybrid search, and reranking model integration for personalized and accurate food suggestionsπŸ‘Œ | [![Github](../../assets/github.svg)][food_github]
[![Open In Collab](../../assets/colab.svg)][food_colab] |
+| **πŸ” Arxiv Search with OpenCLIP and LanceDB** | πŸ’‘ Build a semantic search engine for **Arxiv papers** using **LanceDB**, and benchmark its performance against traditional keyword-based search on **Nomic's Atlas**, to demonstrate the power of semantic search in finding relevant research papers πŸ“š | [![Github](../../assets/github.svg)][arxiv_github]
[![Open In Collab](../../assets/colab.svg)][arxiv_colab]
[![Python](../../assets/python.svg)][arxiv_python] | +| **Food Recommendation System🍴** | πŸ” Build a food recommendation system with **LanceDB**, featuring vector-based recommendations, full-text search, hybrid search, and reranking model integration for personalized and accurate food suggestionsπŸ‘Œ | [![Github](../../assets/github.svg)][food_github]
[![Open In Collab](../../assets/colab.svg)][food_colab] | [movie_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/movie-recommender [movie_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/movie-recommender/main.ipynb diff --git a/docs/src/examples/python_examples/vector_search.md b/docs/src/examples/python_examples/vector_search.md index 7182eb09..8561f716 100644 --- a/docs/src/examples/python_examples/vector_search.md +++ b/docs/src/examples/python_examples/vector_search.md @@ -1,4 +1,4 @@ -**Vector Search: Unlock Efficient Document Retrieval πŸ”“πŸ‘€** +**Vector Search: Efficient Retrieval πŸ”“πŸ‘€** ==================================================================== Vector search with LanceDB, is a solution for efficient and accurate similarity searches in large datasets πŸ“Š. @@ -9,19 +9,19 @@ LanceDB implements vector search algorithms for efficient document retrieval and | **Vector Search** | **Description** | **Links** | |:-----------------|:---------------|:---------| -| **Inbuilt Hybrid Search πŸ”„** | Combine the power of traditional search algorithms with LanceDB's vector-based search for a robust and efficient search experience πŸ“Š | [![Github](../../assets/github.svg)][inbuilt_hybrid_search_github]
[![Open In Collab](../../assets/colab.svg)][inbuilt_hybrid_search_colab] | -| **Hybrid Search with BM25 and LanceDB πŸ’‘** | Synergizes BM25's keyword-focused precision (term frequency, document length normalization, bias-free retrieval) with LanceDB's semantic understanding (contextual analysis, query intent alignment) for nuanced search results in complex datasets πŸ“ˆ | [![Github](../../assets/github.svg)][BM25_github]
[![Open In Collab](../../assets/colab.svg)][BM25_colab]
[![Ghost](../../assets/ghost.svg)][BM25_ghost] | -| **NER-powered Semantic Search πŸ”Ž** | Unlock contextual understanding with Named Entity Recognition (NER) methods: Dictionary-Based, Rule-Based, and Deep Learning-Based, to accurately identify and extract entities, enabling precise semantic search results πŸ—‚οΈ | [![Github](../../assets/github.svg)][NER_github]
[![Open In Collab](../../assets/colab.svg)][NER_colab]
[![Ghost](../../assets/ghost.svg)][NER_ghost]| -| **Audio Similarity Search using Vector Embeddings 🎡** | Create vector embeddings of audio files to find similar audio content, enabling efficient audio similarity search and retrieval in LanceDB's vector store πŸ“» |[![Github](../../assets/github.svg)][audio_search_github]
[![Open In Collab](../../assets/colab.svg)][audio_search_colab]
[![Python](../../assets/python.svg)][audio_search_python]| -| **LanceDB Embeddings API: Multi-lingual Semantic Search 🌎** | Build a universal semantic search table with LanceDB's Embeddings API, supporting multiple languages (e.g., English, French) using cohere's multi-lingual model, for accurate cross-lingual search results πŸ“„ | [![Github](../../assets/github.svg)][mls_github]
[![Open In Collab](../../assets/colab.svg)][mls_colab]
[![Python](../../assets/python.svg)][mls_python] | -| **Facial Recognition: Face Embeddings πŸ€–** | Detect, crop, and embed faces using Facenet, then store and query face embeddings in LanceDB for efficient facial recognition and top-K matching results πŸ‘₯ | [![Github](../../assets/github.svg)][fr_github]
[![Open In Collab](../../assets/colab.svg)][fr_colab] | -| **Sentiment Analysis: Hotel Reviews 🏨** | Analyze customer sentiments towards the hotel industry using BERT models, storing sentiment labels, scores, and embeddings in LanceDB, enabling queries on customer opinions and potential areas for improvement πŸ’¬ | [![Github](../../assets/github.svg)][sentiment_analysis_github]
[![Open In Collab](../../assets/colab.svg)][sentiment_analysis_colab]
[![Ghost](../../assets/ghost.svg)][sentiment_analysis_ghost] | -| **Vector Arithmetic with LanceDB βš–οΈ** | Unlock powerful semantic search capabilities by performing vector arithmetic on embeddings, enabling complex relationships and nuances in data to be captured, and simplifying the process of retrieving semantically similar results πŸ“Š | [![Github](../../assets/github.svg)][arithmetic_github]
[![Open In Collab](../../assets/colab.svg)][arithmetic_colab]
[![Ghost](../../assets/ghost.svg)][arithmetic_ghost] | -| **Imagebind Demo πŸ–ΌοΈ** | Explore the multi-modal capabilities of Imagebind through a Gradio app, leveraging LanceDB API for seamless image search and retrieval experiences πŸ“Έ | [![Github](../../assets/github.svg)][imagebind_github]
[![Open in Spaces](../../assets/open_hf_space.svg)][imagebind_huggingface] | -| **Search Engine using SAM & CLIP πŸ”** | Build a search engine within an image using SAM and CLIP models, enabling object-level search and retrieval, with LanceDB indexing and search capabilities to find the closest match between image embeddings and user queries πŸ“Έ | [![Github](../../assets/github.svg)][swi_github]
[![Open In Collab](../../assets/colab.svg)][swi_colab]
[![Ghost](../../assets/ghost.svg)][swi_ghost] | -| **Zero Shot Object Localization and Detection with CLIP πŸ”Ž** | Perform object detection on images using OpenAI's CLIP, enabling zero-shot localization and detection of objects, with capabilities to split images into patches, parse with CLIP, and plot bounding boxes πŸ“Š | [![Github](../../assets/github.svg)][zsod_github]
[![Open In Collab](../../assets/colab.svg)][zsod_colab] | -| **Accelerate Vector Search with OpenVINO πŸš€** | Boost vector search applications using OpenVINO, achieving significant speedups with CLIP for text-to-image and image-to-image searching, through PyTorch model optimization, FP16 and INT8 format conversion, and quantization with OpenVINO NNCF πŸ“ˆ | [![Github](../../assets/github.svg)][openvino_github]
[![Open In Collab](../../assets/colab.svg)][openvino_colab]
[![Ghost](../../assets/ghost.svg)][openvino_ghost] | -| **Zero-Shot Image Classification with CLIP and LanceDB πŸ“Έ** | Achieve zero-shot image classification using CLIP and LanceDB, enabling models to classify images without prior training on specific use cases, unlocking flexible and adaptable image classification capabilities πŸ”“ | [![Github](../../assets/github.svg)][zsic_github]
[![Open In Collab](../../assets/colab.svg)][zsic_colab]
[![Ghost](../../assets/ghost.svg)][zsic_ghost] | +| **Inbuilt Hybrid Search πŸ”„** | Perform hybrid search in **LanceDB** by combining the results of semantic and full-text search via a reranking algorithm of your choice πŸ“Š | [![Github](../../assets/github.svg)][inbuilt_hybrid_search_github]
[![Open In Collab](../../assets/colab.svg)][inbuilt_hybrid_search_colab] |
+| **Hybrid Search with BM25 and LanceDB πŸ’‘** | Synergize **BM25's** keyword-focused precision (term frequency, document length normalization, bias-free retrieval) with **LanceDB's** semantic understanding (contextual analysis, query intent alignment) for nuanced search results in complex datasets πŸ“ˆ | [![Github](../../assets/github.svg)][BM25_github]
[![Open In Collab](../../assets/colab.svg)][BM25_colab]
[![Ghost](../../assets/ghost.svg)][BM25_ghost] |
+| **NER-powered Semantic Search πŸ”Ž** | Identify and extract essential information from text with Named Entity Recognition **(NER)** methods: Dictionary-Based, Rule-Based, and Deep Learning-Based, to accurately categorize entities, enabling precise semantic search results πŸ—‚οΈ | [![Github](../../assets/github.svg)][NER_github]
[![Open In Collab](../../assets/colab.svg)][NER_colab]
[![Ghost](../../assets/ghost.svg)][NER_ghost]| +| **Audio Similarity Search using Vector Embeddings 🎡** | Create vector **embeddings of audio files** to find similar audio content, enabling efficient audio similarity search and retrieval in **LanceDB's** vector store πŸ“» |[![Github](../../assets/github.svg)][audio_search_github]
[![Open In Collab](../../assets/colab.svg)][audio_search_colab]
[![Python](../../assets/python.svg)][audio_search_python]| +| **LanceDB Embeddings API: Multi-lingual Semantic Search 🌎** | Build a universal semantic search table with **LanceDB's Embeddings API**, supporting multiple languages (e.g., English, French) using **cohere's** multi-lingual model, for accurate cross-lingual search results πŸ“„ | [![Github](../../assets/github.svg)][mls_github]
[![Open In Collab](../../assets/colab.svg)][mls_colab]
[![Python](../../assets/python.svg)][mls_python] | +| **Facial Recognition: Face Embeddings πŸ€–** | Detect, crop, and embed faces using Facenet, then store and query face embeddings in **LanceDB** for efficient facial recognition and top-K matching results πŸ‘₯ | [![Github](../../assets/github.svg)][fr_github]
[![Open In Collab](../../assets/colab.svg)][fr_colab] | +| **Sentiment Analysis: Hotel Reviews 🏨** | Analyze customer sentiments towards the hotel industry using **BERT models**, storing sentiment labels, scores, and embeddings in **LanceDB**, enabling queries on customer opinions and potential areas for improvement πŸ’¬ | [![Github](../../assets/github.svg)][sentiment_analysis_github]
[![Open In Collab](../../assets/colab.svg)][sentiment_analysis_colab]
[![Ghost](../../assets/ghost.svg)][sentiment_analysis_ghost] | +| **Vector Arithmetic with LanceDB βš–οΈ** | Perform **vector arithmetic** on embeddings, enabling complex relationships and nuances in data to be captured, and simplifying the process of retrieving semantically similar results πŸ“Š | [![Github](../../assets/github.svg)][arithmetic_github]
[![Open In Collab](../../assets/colab.svg)][arithmetic_colab]
[![Ghost](../../assets/ghost.svg)][arithmetic_ghost] |
+| **Imagebind Demo πŸ–ΌοΈ** | Explore the multi-modal capabilities of **Imagebind** through a Gradio app, using the **LanceDB API** for seamless image search and retrieval experiences πŸ“Έ | [![Github](../../assets/github.svg)][imagebind_github]
[![Open in Spaces](../../assets/open_hf_space.svg)][imagebind_huggingface] | +| **Search Engine using SAM & CLIP πŸ”** | Build a search engine within an image using **SAM** and **CLIP** models, enabling object-level search and retrieval, with LanceDB indexing and search capabilities to find the closest match between image embeddings and user queries πŸ“Έ | [![Github](../../assets/github.svg)][swi_github]
[![Open In Collab](../../assets/colab.svg)][swi_colab]
[![Ghost](../../assets/ghost.svg)][swi_ghost] | +| **Zero Shot Object Localization and Detection with CLIP πŸ”Ž** | Perform object detection on images using **OpenAI's CLIP**, enabling zero-shot localization and detection of objects, with capabilities to split images into patches, parse with CLIP, and plot bounding boxes πŸ“Š | [![Github](../../assets/github.svg)][zsod_github]
[![Open In Collab](../../assets/colab.svg)][zsod_colab] | +| **Accelerate Vector Search with OpenVINO πŸš€** | Boost vector search applications using **OpenVINO**, achieving significant speedups with **CLIP** for text-to-image and image-to-image searching, through PyTorch model optimization, FP16 and INT8 format conversion, and quantization with **OpenVINO NNCF** πŸ“ˆ | [![Github](../../assets/github.svg)][openvino_github]
[![Open In Collab](../../assets/colab.svg)][openvino_colab]
[![Ghost](../../assets/ghost.svg)][openvino_ghost] | +| **Zero-Shot Image Classification with CLIP and LanceDB πŸ“Έ** | Achieve zero-shot image classification using **CLIP** and **LanceDB**, enabling models to classify images without prior training on specific use cases, unlocking flexible and adaptable image classification capabilities πŸ”“ | [![Github](../../assets/github.svg)][zsic_github]
[![Open In Collab](../../assets/colab.svg)][zsic_colab]
[![Ghost](../../assets/ghost.svg)][zsic_ghost] |



From 51966a84f582476566daf289589318e9cc2cd0a1 Mon Sep 17 00:00:00 2001
From: Ayush Chaurasia 
Date: Sat, 31 Aug 2024 04:09:14 +0530
Subject: [PATCH 32/34] docs: add multi-vector reranking, answerdotai and studies section (#1579)

---
 docs/mkdocs.yml | 7 +++++++
 docs/src/reranking/index.md | 17 +++++++++++++++++
 docs/src/studies/overview.md | 4 ++++
 3 files changed, 28 insertions(+)
 create mode 100644 docs/src/studies/overview.md

diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 77dcb19d..0230caef 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -108,6 +108,7 @@ nav:
           - ColBERT Reranker: reranking/colbert.md
           - Jina Reranker: reranking/jina.md
           - OpenAI Reranker: reranking/openai.md
+          - AnswerDotAi Rerankers: reranking/answerdotai.md
           - Building Custom Rerankers: reranking/custom_reranker.md
           - Example: notebooks/lancedb_reranking.ipynb
       - Filtering: sql.md
@@ -179,6 +180,8 @@ nav:
       - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
     - πŸ¦€ Rust:
       - Overview: examples/examples_rust.md
+  - Studies:
+    - β†—Improve retrievers with hybrid search and reranking: https://blog.lancedb.com/hybrid-search-and-reranking-report/
   - πŸ’­ FAQs: faq.md
   - βš™οΈ API reference:
     - 🐍 Python: python/python.md
@@ -219,6 +222,7 @@ nav:
           - ColBERT Reranker: reranking/colbert.md
           - Jina Reranker: reranking/jina.md
           - OpenAI Reranker: reranking/openai.md
+          - AnswerDotAi Rerankers: reranking/answerdotai.md
           - Building Custom Rerankers: reranking/custom_reranker.md
           - Example: notebooks/lancedb_reranking.ipynb
   - Filtering: sql.md
@@ -286,6 +290,9 @@ nav:
      - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
     - πŸ¦€ Rust:
       - Overview: examples/examples_rust.md
+  - Studies:
+    - studies/overview.md
+    - β†—Improve retrievers with hybrid search and reranking: https://blog.lancedb.com/hybrid-search-and-reranking-report/
   - API reference:
     - Overview: api_reference.md
     - Python: python/python.md
diff --git a/docs/src/reranking/index.md b/docs/src/reranking/index.md
index d2a15d6b..43e2c555 100644
--- a/docs/src/reranking/index.md
+++ b/docs/src/reranking/index.md
@@ -45,6 +45,23 @@ tbl.create_fts_index("text")
 result = tbl.query("hello", query_type="hybrid").rerank(reranker).to_list()
 ```
 
+### Multi-vector reranking
+Most rerankers support reranking based on multiple vectors. To rerank across multiple vector columns, you can pass a list of search results to the `rerank_multivector` method. Here's an example of how to rerank based on multiple vector columns using the `CrossEncoderReranker`:
+
+```python
+from lancedb.rerankers import CrossEncoderReranker
+
+reranker = CrossEncoderReranker()
+
+query = "hello"
+
+res1 = table.search(query, vector_column_name="vector").limit(3)
+res2 = table.search(query, vector_column_name="text_vector").limit(3)
+res3 = table.search(query, vector_column_name="meta_vector").limit(3)
+
+reranked = reranker.rerank_multivector([res1, res2, res3], deduplicate=True)
+```
+
 ## Available Rerankers
 LanceDB comes with some built-in rerankers. Here are some of the rerankers that are available in LanceDB:
 
diff --git a/docs/src/studies/overview.md b/docs/src/studies/overview.md
new file mode 100644
index 00000000..917f39c3
--- /dev/null
+++ b/docs/src/studies/overview.md
@@ -0,0 +1,4 @@
+This is a list of benchmarks and reports we've worked on at LanceDB. Some of these are continuously updated, while others are one-off reports.
+ +- [Improve retrievers with hybrid search and reranking](https://blog.lancedb.com/hybrid-search-and-reranking-report/) + From fde636ca2e48e01886225d42141336b3404ca7ed Mon Sep 17 00:00:00 2001 From: Rithik Kumar <46047011+rithikJha@users.noreply.github.com> Date: Mon, 2 Sep 2024 21:55:35 +0530 Subject: [PATCH 33/34] docs: fix links - quick start to embedding (#1591) --- docs/src/basic.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/basic.md b/docs/src/basic.md index 88baf42d..23084871 100644 --- a/docs/src/basic.md +++ b/docs/src/basic.md @@ -572,7 +572,7 @@ You can use the embedding API when working with embedding models. It automatical --8<-- "rust/lancedb/examples/openai.rs:openai_embeddings" ``` -Learn about using the existing integrations and creating custom embedding functions in the [embedding API guide](./embeddings/). +Learn about using the existing integrations and creating custom embedding functions in the [embedding API guide](./embeddings/index.md). ## What's next From 03ef1dc081772d240aae46d8df5e080ce2fcbf2c Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Tue, 3 Sep 2024 14:00:13 +0530 Subject: [PATCH 34/34] feat: update default reranker to RRF (#1580) - Both LinearCombination (the current default) and RRF are pretty fast compared to model based rerankers. RRF is slightly faster. - In our tests RRF has also been slightly more accurate. This PR: - Makes RRF the default reranker - Removed duplicate docs for rerankers --- docs/src/hybrid_search/hybrid_search.md | 189 +----------------------- docs/src/reranking/index.md | 2 + python/python/lancedb/query.py | 11 +- 3 files changed, 12 insertions(+), 190 deletions(-) diff --git a/docs/src/hybrid_search/hybrid_search.md b/docs/src/hybrid_search/hybrid_search.md index 1503a07b..dd468a17 100644 --- a/docs/src/hybrid_search/hybrid_search.md +++ b/docs/src/hybrid_search/hybrid_search.md @@ -57,199 +57,18 @@ results = table.search(query_type="hybrid") ``` -By default, LanceDB uses `LinearCombinationReranker(weight=0.7)` to combine and rerank the results of semantic and full-text search. You can customize the hyperparameters as needed or write your own custom reranker. Here's how you can use any of the available rerankers: +By default, LanceDB uses `RRFReranker()`, which uses reciprocal rank fusion score, to combine and rerank the results of semantic and full-text search. You can customize the hyperparameters as needed or write your own custom reranker. Here's how you can use any of the available rerankers: ### `rerank()` arguments * `normalize`: `str`, default `"score"`: The method to normalize the scores. Can be "rank" or "score". If "rank", the scores are converted to ranks and then normalized. If "score", the scores are normalized directly. -* `reranker`: `Reranker`, default `LinearCombinationReranker(weight=0.7)`. +* `reranker`: `Reranker`, default `RRF()`. The reranker to use. If not specified, the default reranker is used. ## Available Rerankers -LanceDB provides a number of re-rankers out of the box. You can use any of these re-rankers by passing them to the `rerank()` method. Here's a list of available re-rankers: - -### Linear Combination Reranker -This is the default re-ranker used by LanceDB. It combines the results of semantic and full-text search using a linear combination of the scores. The weights for the linear combination can be specified. It defaults to 0.7, i.e, 70% weight for semantic search and 30% weight for full-text search. 
+LanceDB provides a number of re-rankers out of the box. You can use any of these re-rankers by passing them to the `rerank()` method. +Go to [Rerankers](../reranking/index.md) to learn more about using the available rerankers and implementing custom rerankers. -```python -from lancedb.rerankers import LinearCombinationReranker - -reranker = LinearCombinationReranker(weight=0.3) # Use 0.3 as the weight for vector search - -results = table.search("rebel", query_type="hybrid").rerank(reranker=reranker).to_pandas() -``` - -### Arguments ----------------- -* `weight`: `float`, default `0.7`: - The weight to use for the semantic search score. The weight for the full-text search score is `1 - weights`. -* `fill`: `float`, default `1.0`: - The score to give to results that are only in one of the two result sets.This is treated as penalty, so a higher value means a lower score. - TODO: We should just hardcode this-- its pretty confusing as we invert scores to calculate final score -* `return_score` : str, default `"relevance"` - options are "relevance" or "all" - The type of score to return. If "relevance", will return only the `_relevance_score. If "all", will return all scores from the vector and FTS search along with the relevance score. - -### Cohere Reranker -This re-ranker uses the [Cohere](https://cohere.ai/) API to combine the results of semantic and full-text search. You can use this re-ranker by passing `CohereReranker()` to the `rerank()` method. Note that you'll need to set the `COHERE_API_KEY` environment variable to use this re-ranker. - -```python -from lancedb.rerankers import CohereReranker - -reranker = CohereReranker() - -results = table.search("vampire weekend", query_type="hybrid").rerank(reranker=reranker).to_pandas() -``` - -### Arguments ----------------- -* `model_name` : str, default `"rerank-english-v2.0"` - The name of the cross encoder model to use. Available cohere models are: - - rerank-english-v2.0 - - rerank-multilingual-v2.0 -* `column` : str, default `"text"` - The name of the column to use as input to the cross encoder model. -* `top_n` : str, default `None` - The number of results to return. If None, will return all results. - -!!! Note - Only returns `_relevance_score`. Does not support `return_score = "all"`. - -### Cross Encoder Reranker -This reranker uses the [Sentence Transformers](https://www.sbert.net/) library to combine the results of semantic and full-text search. You can use it by passing `CrossEncoderReranker()` to the `rerank()` method. - -```python -from lancedb.rerankers import CrossEncoderReranker - -reranker = CrossEncoderReranker() - -results = table.search("harmony hall", query_type="hybrid").rerank(reranker=reranker).to_pandas() -``` - - -### Arguments ----------------- -* `model` : str, default `"cross-encoder/ms-marco-TinyBERT-L-6"` - The name of the cross encoder model to use. Available cross encoder models can be found [here](https://www.sbert.net/docs/pretrained_cross-encoders.html) -* `column` : str, default `"text"` - The name of the column to use as input to the cross encoder model. -* `device` : str, default `None` - The device to use for the cross encoder model. If None, will use "cuda" if available, otherwise "cpu". - -!!! Note - Only returns `_relevance_score`. Does not support `return_score = "all"`. - - -### ColBERT Reranker -This reranker uses the ColBERT model to combine the results of semantic and full-text search. You can use it by passing `ColbertrReranker()` to the `rerank()` method. 

### Cross Encoder Reranker
This reranker uses the [Sentence Transformers](https://www.sbert.net/) library to combine the results of semantic and full-text search. You can use it by passing `CrossEncoderReranker()` to the `rerank()` method.

```python
from lancedb.rerankers import CrossEncoderReranker

reranker = CrossEncoderReranker()

results = table.search("harmony hall", query_type="hybrid").rerank(reranker=reranker).to_pandas()
```

### Arguments
----------------
* `model` : str, default `"cross-encoder/ms-marco-TinyBERT-L-6"`
    The name of the cross encoder model to use. Available cross encoder models can be found [here](https://www.sbert.net/docs/pretrained_cross-encoders.html).
* `column` : str, default `"text"`
    The name of the column to use as input to the cross encoder model.
* `device` : str, default `None`
    The device to use for the cross encoder model. If None, will use "cuda" if available, otherwise "cpu".

!!! Note
    Only returns `_relevance_score`. Does not support `return_score = "all"`.

### ColBERT Reranker
This reranker uses the ColBERT model to combine the results of semantic and full-text search. You can use it by passing `ColbertReranker()` to the `rerank()` method.

The ColBERT reranker model calculates the relevance of the retrieved documents against the query and doesn't take the existing FTS and vector search scores into account, so it currently only supports `return_score="relevance"`. By default, it looks for a `text` column to rerank the results, but you can specify a different column to use as input to the model, as described below.

```python
from lancedb.rerankers import ColbertReranker

reranker = ColbertReranker()

results = table.search("harmony hall", query_type="hybrid").rerank(reranker=reranker).to_pandas()
```

### Arguments
----------------
* `model_name` : `str`, default `"colbert-ir/colbertv2.0"`
    The name of the ColBERT model to use.
* `column` : `str`, default `"text"`
    The name of the column to use as input to the model.
* `return_score` : `str`, default `"relevance"`
    Options are `"relevance"` or `"all"`. Only `"relevance"` is supported for now.

!!! Note
    Only returns `_relevance_score`. Does not support `return_score = "all"`.

### OpenAI Reranker
This reranker uses the OpenAI API to combine the results of semantic and full-text search. You can use it by passing `OpenaiReranker()` to the `rerank()` method.

!!! Note
    This prompts a chat model to rerank the results; it is not a dedicated reranker model, so it should be treated as experimental.

!!! Tip
    - You might run into the model's token limit, so set the search `limit` based on your token budget.
    - It is recommended to use `gpt-4-turbo-preview` (the default model); older models might lead to undesired behaviour.

```python
from lancedb.rerankers import OpenaiReranker

reranker = OpenaiReranker()

results = table.search("harmony hall", query_type="hybrid").rerank(reranker=reranker).to_pandas()
```

### Arguments
----------------
* `model_name` : `str`, default `"gpt-4-turbo-preview"`
    The name of the chat model to use.
* `column` : `str`, default `"text"`
    The name of the column to use as input to the model.
* `return_score` : `str`, default `"relevance"`
    Options are "relevance" or "all". Only "relevance" is supported for now.
* `api_key` : `str`, default `None`
    The API key to use. If None, will use the OPENAI_API_KEY environment variable.

## Building Custom Rerankers
You can build your own custom reranker by subclassing the `Reranker` class and implementing the `rerank_hybrid()` method. A skeleton of a custom reranker that combines the results of semantic and full-text search is shown after the notes below.

The `Reranker` base interface comes with a `merge_results()` method that can be used to combine the results of semantic and full-text search. This is a vanilla merging algorithm that simply concatenates the results and removes the duplicates without taking the scores into consideration; it only keeps the first copy of each row encountered. This works well in cases where combining the results doesn't require the semantic and full-text search scores. If you want to use the scores, or want to support `return_score="all"`, you'll need to implement your own merging algorithm.
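For instance, a score-aware merge could normalize each result set's native scores before combining them. The helper below is a minimal sketch, not part of the LanceDB API: it assumes your table has a unique `id` column, and it relies on the `_distance` column of vector results (lower is better) and the `_score` column of FTS results (higher is better):

```python
import pandas as pd
import pyarrow as pa

def _minmax(s: pd.Series) -> pd.Series:
    # Normalize a score column to [0, 1]; constant columns map to 1.0.
    span = s.max() - s.min()
    if span == 0:
        return pd.Series(1.0, index=s.index)
    return (s - s.min()) / span

def merge_by_score(vector_results: pa.Table, fts_results: pa.Table, on: str = "id") -> pa.Table:
    vec = vector_results.to_pandas()
    fts = fts_results.to_pandas()

    # `_distance` is "lower is better", so invert it after normalizing.
    vec["_relevance_score"] = 1.0 - _minmax(vec["_distance"])
    fts["_relevance_score"] = _minmax(fts["_score"])

    merged = pd.concat(
        [vec.drop(columns=["_distance"]), fts.drop(columns=["_score"])],
        ignore_index=True,
    )
    # For rows found by both searches, keep the higher-scoring copy.
    merged = merged.sort_values("_relevance_score", ascending=False)
    merged = merged.drop_duplicates(subset=on, keep="first")
    return pa.Table.from_pandas(merged)
```

With that in place, here's the general structure of a custom reranker: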

```python
from lancedb.rerankers import Reranker
import pyarrow as pa

class MyReranker(Reranker):
    def __init__(self, param1, param2, return_score="relevance"):
        super().__init__(return_score)
        # Store whatever custom parameters your reranker needs
        self.param1 = param1
        self.param2 = param2

    def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table):
        # Use the built-in merging function
        combined_result = self.merge_results(vector_results, fts_results)

        # Do something with the combined results
        # ...

        # Return the combined results
        return combined_result
```

### Example of a Custom Reranker
For the sake of simplicity, let's build a custom reranker that just enhances the Cohere Reranker by accepting a filter query, and accepts the other CohereReranker params as kwargs.

```python
from typing import List, Union

import pyarrow as pa

from lancedb.rerankers import CohereReranker

class ModifiedCohereReranker(CohereReranker):
    def __init__(self, filters: Union[str, List[str]], **kwargs):
        super().__init__(**kwargs)
        filters = filters if isinstance(filters, list) else [filters]
        self.filters = filters

    def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table) -> pa.Table:
        combined_result = super().rerank_hybrid(query, vector_results, fts_results)
        df = combined_result.to_pandas()
        for filter in self.filters:
            # engine="python" is needed for the .str accessor inside query()
            df = df.query("not text.str.contains(@filter)", engine="python")

        return pa.Table.from_pandas(df)
```

!!! tip
    The `vector_results` and `fts_results` are PyArrow tables. You can convert them to pandas DataFrames using the `to_pandas()` method and perform any operations you want. Once you are done, convert the DataFrame back to a PyArrow table using `pa.Table.from_pandas()` and return it.
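A hedged usage sketch for the `ModifiedCohereReranker` defined above (it assumes `COHERE_API_KEY` is set, the table has an FTS index, and the query and filter strings are made up):

```python
# Drop rows whose text mentions any filtered term, then rerank with Cohere.
reranker = ModifiedCohereReranker(filters=["spam", "advertisement"], top_n=10)

results = (
    table.search("indie rock", query_type="hybrid")
    .rerank(reranker=reranker)
    .to_pandas()
)
```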

diff --git a/docs/src/reranking/index.md b/docs/src/reranking/index.md
index 43e2c555..746c5d4e 100644
--- a/docs/src/reranking/index.md
+++ b/docs/src/reranking/index.md
@@ -71,6 +71,8 @@ LanceDB comes with some built-in rerankers. Here are some of the rerankers that
 - [OpenAI Reranker](./openai.md)
 - [Linear Combination Reranker](./linear_combination.md)
 - [Jina Reranker](./jina.md)
+- [AnswerDotAI Rerankers](./answerdotai.md)
+- [Reciprocal Rank Fusion Reranker](./rrf.md)
 
 ## Creating Custom Rerankers
 
diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py
index 7e7538b0..9da90987 100644
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -35,7 +35,7 @@ import pydantic
 from . import __version__
 from .arrow import AsyncRecordBatchReader
 from .rerankers.base import Reranker
-from .rerankers.linear_combination import LinearCombinationReranker
+from .rerankers.rrf import RRFReranker
 from .util import safe_import_pandas
 
 if TYPE_CHECKING:
@@ -916,7 +916,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
     """
     A query builder that performs hybrid vector and full text search.
     Results are combined and reranked based on the specified reranker.
-    By default, the results are reranked using the LinearCombinationReranker.
+    By default, the results are reranked using the RRFReranker, which
+    uses the reciprocal rank fusion score for reranking.
     To make the vector and fts results comparable, the scores are normalized.
     Instead of normalizing scores, the `normalize` parameter can be set to "rank"
@@ -935,7 +936,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
         self._vector_column = vector_column
         self._fts_columns = fts_columns
         self._norm = "score"
-        self._reranker = LinearCombinationReranker(weight=0.7, fill=1.0)
+        self._reranker = RRFReranker()
         self._nprobes = None
         self._refine_factor = None
 
@@ -1066,7 +1067,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
     def rerank(
         self,
         normalize="score",
-        reranker: Reranker = LinearCombinationReranker(weight=0.7, fill=1.0),
+        reranker: Reranker = RRFReranker(),
     ) -> LanceHybridQueryBuilder:
         """
         Rerank the hybrid search results using the specified reranker. The reranker
@@ -1078,7 +1079,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
             The method to normalize the scores. Can be "rank" or "score".
             If "rank", the scores are converted to ranks and then normalized.
             If "score", the scores are normalized directly.
-        reranker: Reranker, default LinearCombinationReranker(weight=0.7, fill=1.0)
+        reranker: Reranker, default RRFReranker()
            The reranker to use. Must be an instance of Reranker class.
        Returns
        -------