feat(python): add read_consistency_interval argument (#828)

This PR refactors how we handle read consistency: does the `LanceTable`
class always pick up modifications to the table made by other instance
or processes. Users have three options they can set at the connection
level:

1. (Default) `read_consistency_interval=None` means it will not check at
all. Users can call `table.checkout_latest()` to manually check for
updates.
2. `read_consistency_interval=timedelta(0)` means **always** check for
updates, giving strong read consistency.
3. `read_consistency_interval=timedelta(seconds=20)` means check for
updates every 20 seconds. This is eventual consistency, a compromise
between the two options above.

There is now an explicit difference between a `LanceTable` that tracks
the current version and one that is fixed at a historical version. We
now enforce that users cannot write if they have checked out an old
version. They are instructed to call `checkout_latest()` before calling
the write methods.

Since `conn.open_table()` doesn't have a parameter for version, users
will only get fixed references if they call `table.checkout()`.

The difference between these two can be seen in the repr: Table that are
fixed at a particular version will have a `version` displayed in the
repr. Otherwise, the version will not be shown.

```python
>>> table
LanceTable(connection=..., name="my_table")
>>> table.checkout(1)
>>> table
LanceTable(connection=..., name="my_table", version=1)
```

I decided to not create different classes for these states, because I
think we already have enough complexity with the Cloud vs OSS table
references.

Based on #812
This commit is contained in:
Will Jones
2024-02-05 08:12:19 -08:00
committed by Weston Pace
parent 0f00cd0097
commit 39cc2fd62b
8 changed files with 322 additions and 101 deletions

View File

@@ -88,6 +88,7 @@ def test_embedding_function(tmp_path):
assert np.allclose(actual, expected)
@pytest.mark.slow
def test_embedding_function_rate_limit(tmp_path):
def _get_schema_from_model(model):
class Schema(LanceModel):

View File

@@ -12,8 +12,10 @@
# limitations under the License.
import functools
from copy import copy
from datetime import date, datetime, timedelta
from pathlib import Path
from time import sleep
from typing import List
from unittest.mock import PropertyMock, patch
@@ -25,6 +27,7 @@ import pyarrow as pa
import pytest
from pydantic import BaseModel
import lancedb
from lancedb.conftest import MockTextEmbeddingFunction
from lancedb.db import LanceDBConnection
from lancedb.embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
@@ -35,6 +38,7 @@ from lancedb.table import LanceTable
class MockDB:
def __init__(self, uri: Path):
self.uri = uri
self.read_consistency_interval = None
@functools.cached_property
def is_managed_remote(self) -> bool:
@@ -267,39 +271,38 @@ def test_versioning(db):
def test_create_index_method():
with patch.object(LanceTable, "_reset_dataset", return_value=None):
with patch.object(
LanceTable, "_dataset", new_callable=PropertyMock
) as mock_dataset:
# Setup mock responses
mock_dataset.return_value.create_index.return_value = None
with patch.object(
LanceTable, "_dataset_mut", new_callable=PropertyMock
) as mock_dataset:
# Setup mock responses
mock_dataset.return_value.create_index.return_value = None
# Create a LanceTable object
connection = LanceDBConnection(uri="mock.uri")
table = LanceTable(connection, "test_table")
# Create a LanceTable object
connection = LanceDBConnection(uri="mock.uri")
table = LanceTable(connection, "test_table")
# Call the create_index method
table.create_index(
metric="L2",
num_partitions=256,
num_sub_vectors=96,
vector_column_name="vector",
replace=True,
index_cache_size=256,
)
# Call the create_index method
table.create_index(
metric="L2",
num_partitions=256,
num_sub_vectors=96,
vector_column_name="vector",
replace=True,
index_cache_size=256,
)
# Check that the _dataset.create_index method was called
# with the right parameters
mock_dataset.return_value.create_index.assert_called_once_with(
column="vector",
index_type="IVF_PQ",
metric="L2",
num_partitions=256,
num_sub_vectors=96,
replace=True,
accelerator=None,
index_cache_size=256,
)
# Check that the _dataset.create_index method was called
# with the right parameters
mock_dataset.return_value.create_index.assert_called_once_with(
column="vector",
index_type="IVF_PQ",
metric="L2",
num_partitions=256,
num_sub_vectors=96,
replace=True,
accelerator=None,
index_cache_size=256,
)
def test_add_with_nans(db):
@@ -792,3 +795,48 @@ def test_hybrid_search(db):
"Our father who art in heaven", query_type="hybrid"
).to_pydantic(MyTable)
assert result1 == result3
@pytest.mark.parametrize(
"consistency_interval", [None, timedelta(seconds=0), timedelta(seconds=0.1)]
)
def test_consistency(tmp_path, consistency_interval):
db = lancedb.connect(tmp_path)
table = LanceTable.create(db, "my_table", data=[{"id": 0}])
db2 = lancedb.connect(tmp_path, read_consistency_interval=consistency_interval)
table2 = db2.open_table("my_table")
assert table2.version == table.version
table.add([{"id": 1}])
if consistency_interval is None:
assert table2.version == table.version - 1
table2.checkout_latest()
assert table2.version == table.version
elif consistency_interval == timedelta(seconds=0):
assert table2.version == table.version
else:
# (consistency_interval == timedelta(seconds=0.1)
assert table2.version == table.version - 1
sleep(0.1)
assert table2.version == table.version
def test_restore_consistency(tmp_path):
db = lancedb.connect(tmp_path)
table = LanceTable.create(db, "my_table", data=[{"id": 0}])
db2 = lancedb.connect(tmp_path, read_consistency_interval=timedelta(seconds=0))
table2 = db2.open_table("my_table")
assert table2.version == table.version
# If we call checkout, it should lose consistency
table_fixed = copy(table2)
table_fixed.checkout(table.version)
# But if we call checkout_latest, it should be consistent again
table_ref_latest = copy(table_fixed)
table_ref_latest.checkout_latest()
table.add([{"id": 2}])
assert table_fixed.version == table.version - 1
assert table_ref_latest.version == table.version