feat: add table stats API (#2363)

* Add a new "table stats" API to expose basic table and fragment statistics, with both local and remote table implementations (see the usage sketch below).
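
A minimal usage sketch of the new API, based on the Python changes in this diff; the database path, table name, and sample rows are made up for illustration:

```python
import asyncio

import lancedb

# Synchronous API (LanceTable locally, RemoteTable against LanceDB Cloud)
db = lancedb.connect("./example-lancedb")  # hypothetical local path
table = db.create_table("docs", data=[{"text": "foo", "id": 0}, {"text": "bar", "id": 1}])
stats = table.stats()
print(stats["num_rows"], stats["total_bytes"], stats["fragment_stats"]["num_fragments"])


# Asynchronous API (AsyncTable)
async def show_stats():
    async_db = await lancedb.connect_async("./example-lancedb")
    async_table = await async_db.open_table("docs")
    print(await async_table.stats())


asyncio.run(show_stats())
```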

### Questions
* This uses `calculate_data_stats` to determine the total bytes in the table, which seems like a potentially expensive operation. Are there any concerns about performance for large datasets?

### Notes
* `bytes_on_disk` appears to be stored at the column level, and there does not seem to be an easy way to compute total bytes per fragment (see the sketch below). That may need to be added in Lance before we can support fragment size (bytes) statistics.
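
To illustrate what's missing, a rough sketch of the rollup we'd want, assuming a hypothetical per-fragment mapping of column `bytes_on_disk` values (the data structure below is invented for illustration and is not the current Lance API):

```python
# Hypothetical shape: {fragment_id: {column_name: bytes_on_disk}}
column_bytes_per_fragment = {
    0: {"text": 20, "id": 18},
    1: {"text": 22, "id": 18},
}

# A per-fragment byte total would just be a sum over that fragment's columns...
fragment_bytes = {
    frag_id: sum(cols.values())
    for frag_id, cols in column_bytes_per_fragment.items()
}
print(fragment_bytes)  # {0: 38, 1: 40}
# ...but Lance does not currently expose a per-fragment rollup like this.
```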


## Summary by CodeRabbit

- **New Features**
  - Added a method to retrieve comprehensive table statistics, including total rows, index counts, storage size, and detailed fragment size metrics such as minimum, maximum, mean, and percentiles.
  - Enabled fetching of table statistics from remote sources through asynchronous requests.
  - Extended table interfaces across Python, Rust, and Node.js to support synchronous and asynchronous retrieval of table statistics.
- **Tests**
  - Introduced tests to verify the accuracy of the new table statistics feature for both populated and empty tables.
Author: Ryan Green
Date: 2025-04-29 15:19:08 -02:30
Committed by: GitHub
Parent: 089905fe8f
Commit: af54e0ce06
17 changed files with 735 additions and 5 deletions

View File

@@ -578,6 +578,9 @@ class RemoteTable(Table):
    ):
        return LOOP.run(self._table.wait_for_index(index_names, timeout))

    def stats(self):
        return LOOP.run(self._table.stats())

    def uses_v2_manifest_paths(self) -> bool:
        raise NotImplementedError(
            "uses_v2_manifest_paths() is not supported on the LanceDB Cloud"

View File

@@ -739,6 +739,13 @@ class Table(ABC):
        """
        raise NotImplementedError

    @abstractmethod
    def stats(self) -> TableStatistics:
        """
        Retrieve table and fragment statistics.
        """
        raise NotImplementedError

    @abstractmethod
    def create_scalar_index(
        self,
@@ -1876,6 +1883,9 @@ class LanceTable(Table):
    ) -> None:
        return LOOP.run(self._table.wait_for_index(index_names, timeout))

    def stats(self) -> TableStatistics:
        return LOOP.run(self._table.stats())

    def create_scalar_index(
        self,
        column: str,
@@ -3170,6 +3180,12 @@ class AsyncTable:
        """
        await self._inner.wait_for_index(index_names, timeout)

    async def stats(self) -> TableStatistics:
        """
        Retrieve table and fragment statistics.
        """
        return await self._inner.stats()

    async def add(
        self,
        data: DATA,
@@ -4068,6 +4084,82 @@ class IndexStatistics:
        return getattr(self, key)


@dataclass
class TableStatistics:
    """
    Statistics about a table and fragments.

    Attributes
    ----------
    total_bytes: int
        The total number of bytes in the table.
    num_rows: int
        The total number of rows in the table.
    num_indices: int
        The total number of indices in the table.
    fragment_stats: FragmentStatistics
        Statistics about fragments in the table.
    """

    total_bytes: int
    num_rows: int
    num_indices: int
    fragment_stats: FragmentStatistics


@dataclass
class FragmentStatistics:
    """
    Statistics about fragments.

    Attributes
    ----------
    num_fragments: int
        The total number of fragments in the table.
    num_small_fragments: int
        The total number of small fragments in the table.
        Small fragments have low row counts and may need to be compacted.
    lengths: FragmentSummaryStats
        Statistics about the number of rows in the table fragments.
    """

    num_fragments: int
    num_small_fragments: int
    lengths: FragmentSummaryStats


@dataclass
class FragmentSummaryStats:
    """
    Statistics about fragment sizes.

    Attributes
    ----------
    min: int
        The number of rows in the fragment with the fewest rows.
    max: int
        The number of rows in the fragment with the most rows.
    mean: int
        The mean number of rows in the fragments.
    p25: int
        The 25th percentile of number of rows in the fragments.
    p50: int
        The 50th percentile of number of rows in the fragments.
    p75: int
        The 75th percentile of number of rows in the fragments.
    p99: int
        The 99th percentile of number of rows in the fragments.
    """

    min: int
    max: int
    mean: int
    p25: int
    p50: int
    p75: int
    p99: int


class Tags:
    """
    Table tag manager.

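For intuition about the `FragmentSummaryStats` fields defined above, here is a small sketch of how the summary relates to fragment row counts; the row counts and the use of `numpy.percentile` are illustrative assumptions, not taken from the actual implementation:

```python
import numpy as np

# Hypothetical fragment row counts for a table with four fragments
lengths = [2, 5, 10, 100]

summary = {
    "min": int(np.min(lengths)),
    "max": int(np.max(lengths)),
    "mean": int(np.mean(lengths)),
    "p25": int(np.percentile(lengths, 25)),
    "p50": int(np.percentile(lengths, 50)),
    "p75": int(np.percentile(lengths, 75)),
    "p99": int(np.percentile(lengths, 99)),
}
print(summary)
# For the single two-row fragment used in the tests below, every field collapses to 2.
```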
View File

@@ -389,6 +389,50 @@ def test_table_wait_for_index_timeout():
        table.wait_for_index(["id_idx"], timedelta(seconds=1))


def test_stats():
    stats = {
        "total_bytes": 38,
        "num_rows": 2,
        "num_indices": 0,
        "fragment_stats": {
            "num_fragments": 1,
            "num_small_fragments": 1,
            "lengths": {
                "min": 2,
                "max": 2,
                "mean": 2,
                "p25": 2,
                "p50": 2,
                "p75": 2,
                "p99": 2,
            },
        },
    }

    def handler(request):
        if request.path == "/v1/table/test/create/?mode=create":
            request.send_response(200)
            request.send_header("Content-Type", "application/json")
            request.end_headers()
            request.wfile.write(b"{}")
        elif request.path == "/v1/table/test/stats/":
            request.send_response(200)
            request.send_header("Content-Type", "application/json")
            request.end_headers()
            payload = json.dumps(stats)
            request.wfile.write(payload.encode())
        else:
            print(request.path)
            request.send_response(404)
            request.end_headers()

    with mock_lancedb_connection(handler) as db:
        table = db.create_table("test", [{"id": 1}])
        res = table.stats()
        print(f"{res=}")
        assert res == stats


@contextlib.contextmanager
def query_test_table(query_handler, *, server_version=Version("0.1.0")):
    def handler(request):

View File

@@ -1695,3 +1695,31 @@ def test_replace_field_metadata(tmp_path):
    schema = table.schema
    field = schema[0].metadata
    assert field == {b"foo": b"bar"}


def test_stats(mem_db: DBConnection):
    table = mem_db.create_table(
        "my_table",
        data=[{"text": "foo", "id": 0}, {"text": "bar", "id": 1}],
    )
    assert len(table) == 2
    stats = table.stats()
    print(f"{stats=}")
    assert stats == {
        "total_bytes": 38,
        "num_rows": 2,
        "num_indices": 0,
        "fragment_stats": {
            "num_fragments": 1,
            "num_small_fragments": 1,
            "lengths": {
                "min": 2,
                "max": 2,
                "mean": 2,
                "p25": 2,
                "p50": 2,
                "p75": 2,
                "p99": 2,
            },
        },
    }

View File

@@ -279,6 +279,40 @@ impl Table {
        })
    }

    pub fn stats(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.inner_ref()?.clone();
        future_into_py(self_.py(), async move {
            let stats = inner.stats().await.infer_error()?;
            Python::with_gil(|py| {
                let dict = PyDict::new(py);
                dict.set_item("total_bytes", stats.total_bytes)?;
                dict.set_item("num_rows", stats.num_rows)?;
                dict.set_item("num_indices", stats.num_indices)?;
                let fragment_stats = PyDict::new(py);
                fragment_stats.set_item("num_fragments", stats.fragment_stats.num_fragments)?;
                fragment_stats.set_item(
                    "num_small_fragments",
                    stats.fragment_stats.num_small_fragments,
                )?;
                let fragment_lengths = PyDict::new(py);
                fragment_lengths.set_item("min", stats.fragment_stats.lengths.min)?;
                fragment_lengths.set_item("max", stats.fragment_stats.lengths.max)?;
                fragment_lengths.set_item("mean", stats.fragment_stats.lengths.mean)?;
                fragment_lengths.set_item("p25", stats.fragment_stats.lengths.p25)?;
                fragment_lengths.set_item("p50", stats.fragment_stats.lengths.p50)?;
                fragment_lengths.set_item("p75", stats.fragment_stats.lengths.p75)?;
                fragment_lengths.set_item("p99", stats.fragment_stats.lengths.p99)?;
                fragment_stats.set_item("lengths", fragment_lengths)?;
                dict.set_item("fragment_stats", fragment_stats)?;
                Ok(Some(dict.unbind()))
            })
        })
    }

    pub fn __repr__(&self) -> String {
        match &self.inner {
            None => format!("ClosedTable({})", self.name),