feat: progress bar for add() (#3067)

## Summary

Adds progress reporting for `table.add()` so users can track large write
operations. The progress callback is available in Rust, Python (sync and
async), and through the PyO3 bindings.

### Usage

Pass `progress=True` to get an automatic tqdm bar:

```python
table.add(data, progress=True)
# 100%|██████████| 1000000/1000000 [00:12<00:00, 82345 rows/s, 45.2 MB/s | 4/4 workers]
```

Or pass a tqdm bar for more control:

```python
from tqdm import tqdm

with tqdm(unit=" rows") as pbar:
    table.add(data, progress=pbar)
```

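Or pass a callable for custom progress handling. It is called with a dict
containing the keys `output_rows`, `output_bytes`, `total_rows` (may be
`None`), `elapsed_seconds`, `active_tasks`, `total_tasks`, and `done`: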

```python
def on_progress(p):
    print(f"{p['output_rows']}/{p['total_rows']} rows, "
          f"{p['active_tasks']}/{p['total_tasks']} workers, "
          f"done={p['done']}")

table.add(data, progress=on_progress)
```

In Rust:

```rust
table.add(data)
    .progress(|p| println!("{}/{:?} rows", p.output_rows(), p.total_rows()))
    .execute()
    .await?;
```

### Details

- `WriteProgress` struct in Rust with getters for `elapsed`,
`output_rows`, `output_bytes`, `total_rows`, `active_tasks`,
`total_tasks`, and `done`. Fields are private behind getters so new
fields can be added without breaking changes.
- `WriteProgressTracker` tracks progress across parallel write tasks
using a mutex for row/byte counts and atomics for active task counts.
- Active task tracking uses an RAII guard pattern (`ActiveTaskGuard`)
that increments on creation and decrements on drop.
- For remote writes, `output_bytes` reflects IPC wire bytes rather than
in-memory Arrow size. For local writes it uses in-memory Arrow size as a
proxy (see TODO below).
- tqdm postfix displays throughput (MB/s) and worker utilization
(active/total).
- The final progress callback (the one with `done` set to true) always
fires, even on error (via `FinishOnDrop`), so progress bars are always
finalized.

### TODO

- Track actual bytes written to disk for local tables. This requires
Lance to expose a progress callback from its write path. See
lance-format/lance#6247.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Will Jones
2026-03-23 16:14:13 -07:00
committed by GitHub
parent a0228036ae
commit 1d6e00b902
14 changed files with 894 additions and 48 deletions

View File

@@ -19,7 +19,7 @@ use lancedb::table::{
Table as LanceDbTable,
};
use pyo3::{
Bound, FromPyObject, PyAny, PyRef, PyResult, Python,
Bound, FromPyObject, Py, PyAny, PyRef, PyResult, Python,
exceptions::{PyKeyError, PyRuntimeError, PyValueError},
pyclass, pymethods,
types::{IntoPyDict, PyAnyMethods, PyDict, PyDictMethods},
@@ -299,10 +299,12 @@ impl Table {
})
}
#[pyo3(signature = (data, mode, progress=None))]
pub fn add<'a>(
self_: PyRef<'a, Self>,
data: PyScannable,
mode: String,
progress: Option<Py<PyAny>>,
) -> PyResult<Bound<'a, PyAny>> {
let mut op = self_.inner_ref()?.add(data);
if mode == "append" {
@@ -312,6 +314,81 @@ impl Table {
} else {
return Err(PyValueError::new_err(format!("Invalid mode: {}", mode)));
}
if let Some(progress_obj) = progress {
let is_callable = Python::attach(|py| progress_obj.bind(py).is_callable());
if is_callable {
// Callback: call with a dict of progress info.
op = op.progress(move |p| {
Python::attach(|py| {
let dict = PyDict::new(py);
if let Err(e) = dict
.set_item("output_rows", p.output_rows())
.and_then(|_| dict.set_item("output_bytes", p.output_bytes()))
.and_then(|_| dict.set_item("total_rows", p.total_rows()))
.and_then(|_| {
dict.set_item("elapsed_seconds", p.elapsed().as_secs_f64())
})
.and_then(|_| dict.set_item("active_tasks", p.active_tasks()))
.and_then(|_| dict.set_item("total_tasks", p.total_tasks()))
.and_then(|_| dict.set_item("done", p.done()))
{
log::warn!("progress dict error: {e}");
return;
}
if let Err(e) = progress_obj.call1(py, (dict,)) {
log::warn!("progress callback error: {e}");
}
});
});
} else {
// tqdm-like: has update() method.
let mut last_rows: usize = 0;
let mut total_set = false;
op = op.progress(move |p| {
let current = p.output_rows();
let prev = last_rows;
last_rows = current;
Python::attach(|py| {
if let Some(total) = p.total_rows()
&& !total_set
{
if let Err(e) = progress_obj.setattr(py, "total", total) {
log::warn!("progress setattr error: {e}");
}
total_set = true;
}
let delta = current.saturating_sub(prev);
if delta > 0 {
if let Err(e) = progress_obj.call_method1(py, "update", (delta,)) {
log::warn!("progress update error: {e}");
}
// Show throughput and active workers in tqdm postfix.
let elapsed = p.elapsed().as_secs_f64();
if elapsed > 0.0 {
let mb_per_sec = p.output_bytes() as f64 / elapsed / 1_000_000.0;
let postfix = format!(
"{:.1} MB/s | {}/{} workers",
mb_per_sec,
p.active_tasks(),
p.total_tasks()
);
if let Err(e) =
progress_obj.call_method1(py, "set_postfix_str", (postfix,))
{
log::warn!("progress set_postfix_str error: {e}");
}
}
}
if p.done() {
// Force a final refresh so the bar shows completion.
if let Err(e) = progress_obj.call_method0(py, "refresh") {
log::warn!("progress refresh error: {e}");
}
}
});
});
}
}
future_into_py(self_.py(), async move {
let result = op.execute().await.infer_error()?;