feat(python): add optional threadpool for batch requests (#981)

Currently, if a batch request is sent to the remote API, each query is
issued sequentially. We should allow the user to specify a thread pool
to which the individual queries are submitted instead.
This commit is contained in:
Chang She
2024-02-16 20:22:22 -08:00
committed by GitHub
parent d6b408e26f
commit e0277383a5
4 changed files with 37 additions and 9 deletions

View File

@@ -14,6 +14,7 @@
import inspect
import logging
import uuid
from concurrent.futures import ThreadPoolExecutor
from typing import Iterable, List, Optional, Union
from urllib.parse import urlparse
@@ -39,6 +40,7 @@ class RemoteDBConnection(DBConnection):
api_key: str,
region: str,
host_override: Optional[str] = None,
request_thread_pool: Optional[ThreadPoolExecutor] = None,
):
"""Connect to a remote LanceDB database."""
parsed = urlparse(db_url)
@@ -49,6 +51,7 @@ class RemoteDBConnection(DBConnection):
self._client = RestfulLanceDBClient(
self.db_name, region, api_key, host_override
)
self._request_thread_pool = request_thread_pool
def __repr__(self) -> str:
return f"RemoteConnect(name={self.db_name})"

View File

@@ -13,6 +13,7 @@
import logging
import uuid
from concurrent.futures import Future
from functools import cached_property
from typing import Dict, Optional, Union
@@ -270,15 +271,28 @@ class RemoteTable(Table):
and len(query.vector) > 0
and not isinstance(query.vector[0], float)
):
if self._conn._request_thread_pool is None:
def submit(name, q):
f = Future()
f.set_result(self._conn._client.query(name, q))
return f
else:
def submit(name, q):
return self._conn._request_thread_pool.submit(
self._conn._client.query, name, q
)
results = []
for v in query.vector:
v = list(v)
q = query.copy()
q.vector = v
results.append(self._conn._client.query(self._name, q))
results.append(submit(self._name, q))
return pa.concat_tables(
[add_index(r.to_arrow(), i) for i, r in enumerate(results)]
[add_index(r.result().to_arrow(), i) for i, r in enumerate(results)]
)
else:
result = self._conn._client.query(self._name, query)