From 79960b254e246ce05a911be47630c5fb7361f1f4 Mon Sep 17 00:00:00 2001 From: LuQQiu Date: Tue, 9 Sep 2025 09:13:22 -0700 Subject: [PATCH] fix: add partition statistics to MetadataEraser (#2637) Some of the data fusion optimizers optimize based on data statistics (e.g. total bytes, number of rows). If those statistics are not supplied, optimizers cannot optimize on top. One example is Anti Hash Join which can optimize from LeftAnti (Left: big table, Right: small table) to RightAnti (Left: small table, Right: big table). Left Anti requires reading the whole big & small table while RightAnti only requires reading the whole left table and supports limit push down to only read partial of big table --- rust/lancedb/src/table/datafusion.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/rust/lancedb/src/table/datafusion.rs b/rust/lancedb/src/table/datafusion.rs index 949c47fe..9d2105d5 100644 --- a/rust/lancedb/src/table/datafusion.rs +++ b/rust/lancedb/src/table/datafusion.rs @@ -121,6 +121,10 @@ impl ExecutionPlan for MetadataEraserExec { as SendableRecordBatchStream, ) } + + fn partition_statistics(&self, partition: Option) -> DataFusionResult { + self.input.partition_statistics(partition) + } } #[derive(Debug)] @@ -227,6 +231,7 @@ pub mod tests { prelude::{SessionConfig, SessionContext}, }; use datafusion_catalog::TableProvider; + use datafusion_common::stats::Precision; use datafusion_execution::SendableRecordBatchStream; use datafusion_expr::{col, lit, LogicalPlan, LogicalPlanBuilder}; use futures::{StreamExt, TryStreamExt}; @@ -509,4 +514,24 @@ pub mod tests { TestFixture::check_plan(plan, "").await; } + + #[tokio::test] + async fn test_metadata_eraser_propagates_statistics() { + let fixture = TestFixture::new().await; + + let plan = + LogicalPlanBuilder::scan("foo", provider_as_source(fixture.adapter.clone()), None) + .unwrap() + .build() + .unwrap(); + + let ctx = SessionContext::new(); + let physical_plan = ctx.state().create_physical_plan(&plan).await.unwrap(); + + assert_eq!(physical_plan.name(), "MetadataEraserExec"); + + let partition_stats = physical_plan.partition_statistics(None).unwrap(); + + assert!(matches!(partition_stats.num_rows, Precision::Exact(10))); + } }