From 98da313d155aa33e3f737df823bd8648b2a9bb70 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 7 Apr 2026 14:14:33 -0400 Subject: [PATCH 01/13] udf module has been deprecated since DF47. html_formatter module has been deprecated since DF48. --- python/datafusion/dataframe_formatter.py | 8 +++---- python/datafusion/html_formatter.py | 29 ------------------------ python/datafusion/udf.py | 29 ------------------------ 3 files changed, 4 insertions(+), 62 deletions(-) delete mode 100644 python/datafusion/html_formatter.py delete mode 100644 python/datafusion/udf.py diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py index b8af45a1b..fd2da99f0 100644 --- a/python/datafusion/dataframe_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -748,7 +748,7 @@ def get_formatter() -> DataFrameHtmlFormatter: The global HTML formatter instance Example: - >>> from datafusion.html_formatter import get_formatter + >>> from datafusion.dataframe_formatter import get_formatter >>> formatter = get_formatter() >>> formatter.max_cell_length = 50 # Increase cell length """ @@ -762,7 +762,7 @@ def set_formatter(formatter: DataFrameHtmlFormatter) -> None: formatter: The formatter instance to use globally Example: - >>> from datafusion.html_formatter import get_formatter, set_formatter + >>> from datafusion.dataframe_formatter import get_formatter, set_formatter >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100) >>> set_formatter(custom_formatter) """ @@ -783,7 +783,7 @@ def configure_formatter(**kwargs: Any) -> None: ValueError: If any invalid parameters are provided Example: - >>> from datafusion.html_formatter import configure_formatter + >>> from datafusion.dataframe_formatter import configure_formatter >>> configure_formatter( ... max_cell_length=50, ... max_height=500, @@ -827,7 +827,7 @@ def reset_formatter() -> None: and sets it as the global formatter for all DataFrames. 
Example: - >>> from datafusion.html_formatter import reset_formatter + >>> from datafusion.dataframe_formatter import reset_formatter >>> reset_formatter() # Reset formatter to default settings """ formatter = DataFrameHtmlFormatter() diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py deleted file mode 100644 index 65eb1f042..000000000 --- a/python/datafusion/html_formatter.py +++ /dev/null @@ -1,29 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Deprecated module for dataframe formatting.""" - -import warnings - -from datafusion.dataframe_formatter import * # noqa: F403 - -warnings.warn( - "The module 'html_formatter' is deprecated and will be removed in the next release." - "Please use 'dataframe_formatter' instead.", - DeprecationWarning, - stacklevel=3, -) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py deleted file mode 100644 index c7265fa09..000000000 --- a/python/datafusion/udf.py +++ /dev/null @@ -1,29 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Deprecated module for user defined functions.""" - -import warnings - -from datafusion.user_defined import * # noqa: F403 - -warnings.warn( - "The module 'udf' is deprecated and will be removed in the next release. " - "Please use 'user_defined' instead.", - DeprecationWarning, - stacklevel=2, -) From bdf8c8992b8d927a471baaa2ba893fba2a49e1fa Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 7 Apr 2026 14:18:18 -0400 Subject: [PATCH 02/13] database has been deprecated since DF48 --- python/datafusion/__init__.py | 3 +-- python/datafusion/catalog.py | 10 ---------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 2e6f81166..ff528ec8b 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -35,7 +35,7 @@ # The following imports are okay to remain as opaque to the user. 
from ._internal import Config -from .catalog import Catalog, Database, Table +from .catalog import Catalog, Table from .col import col, column from .common import DFSchema from .context import ( @@ -80,7 +80,6 @@ "DFSchema", "DataFrame", "DataFrameWriteOptions", - "Database", "ExecutionPlan", "Expr", "InsertOp", diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index 03c0ddc68..20da5e671 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -129,11 +129,6 @@ def schema(self, name: str = "public") -> Schema: else schema ) - @deprecated("Use `schema` instead.") - def database(self, name: str = "public") -> Schema: - """Returns the database with the given ``name`` from this catalog.""" - return self.schema(name) - def register_schema( self, name: str, @@ -195,11 +190,6 @@ def table_exist(self, name: str) -> bool: return self._raw_schema.table_exist(name) -@deprecated("Use `Schema` instead.") -class Database(Schema): - """See `Schema`.""" - - class Table: """A DataFusion table. 
From 9d28c0bfe30b921f06503e52ac7e668b9e1b7dfd Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 7 Apr 2026 14:27:48 -0400 Subject: [PATCH 03/13] select_columns has been deprecated since DF43 --- crates/core/src/dataframe.rs | 13 +++---------- python/datafusion/dataframe.py | 11 ----------- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/crates/core/src/dataframe.rs b/crates/core/src/dataframe.rs index 72595ba81..bee2e7ee6 100644 --- a/crates/core/src/dataframe.rs +++ b/crates/core/src/dataframe.rs @@ -468,17 +468,17 @@ impl PyDataFrame { fn __getitem__(&self, key: Bound<'_, PyAny>) -> PyDataFusionResult { if let Ok(key) = key.extract::() { // df[col] - self.select_columns(vec![key]) + self.select_exprs(vec![key]) } else if let Ok(tuple) = key.cast::() { // df[col1, col2, col3] let keys = tuple .iter() .map(|item| item.extract::()) .collect::>>()?; - self.select_columns(keys) + self.select_exprs(keys) } else if let Ok(keys) = key.extract::>() { // df[[col1, col2, col3]] - self.select_columns(keys) + self.select_exprs(keys) } else { let message = "DataFrame can only be indexed by string index or indices".to_string(); Err(PyDataFusionError::Common(message)) @@ -554,13 +554,6 @@ impl PyDataFrame { Ok(PyTable::from(table_provider)) } - #[pyo3(signature = (*args))] - fn select_columns(&self, args: Vec) -> PyDataFusionResult { - let args = args.iter().map(|s| s.as_ref()).collect::>(); - let df = self.df.as_ref().clone().select_columns(&args)?; - Ok(Self::new(df)) - } - #[pyo3(signature = (*args))] fn select_exprs(&self, args: Vec) -> PyDataFusionResult { let args = args.iter().map(|s| s.as_ref()).collect::>(); diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 9907eae8b..35ce3d818 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -395,17 +395,6 @@ def schema(self) -> pa.Schema: """ return self.df.schema() - @deprecated( - "select_columns() is deprecated. 
Use :py:meth:`~DataFrame.select` instead" - ) - def select_columns(self, *args: str) -> DataFrame: - """Filter the DataFrame by columns. - - Returns: - DataFrame only containing the specified columns. - """ - return self.select(*args) - def select_exprs(self, *args: str) -> DataFrame: """Project arbitrary list of expression strings into a new DataFrame. From 1b775517296619251438a7b40f16583372e427c4 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 7 Apr 2026 14:34:58 -0400 Subject: [PATCH 04/13] unnest_column has been deprecated since DF42 --- crates/core/src/dataframe.rs | 13 ------------- python/datafusion/dataframe.py | 10 ---------- 2 files changed, 23 deletions(-) diff --git a/crates/core/src/dataframe.rs b/crates/core/src/dataframe.rs index bee2e7ee6..8a67dffc9 100644 --- a/crates/core/src/dataframe.rs +++ b/crates/core/src/dataframe.rs @@ -868,19 +868,6 @@ impl PyDataFrame { Ok(Self::new(new_df)) } - #[pyo3(signature = (column, preserve_nulls=true))] - fn unnest_column(&self, column: &str, preserve_nulls: bool) -> PyDataFusionResult { - // TODO: expose RecursionUnnestOptions - // REF: https://github.com/apache/datafusion/pull/11577 - let unnest_options = UnnestOptions::default().with_preserve_nulls(preserve_nulls); - let df = self - .df - .as_ref() - .clone() - .unnest_columns_with_options(&[column], unnest_options)?; - Ok(Self::new(df)) - } - #[pyo3(signature = (columns, preserve_nulls=true))] fn unnest_columns( &self, diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 35ce3d818..daee2698c 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -30,11 +30,6 @@ overload, ) -try: - from warnings import deprecated # Python 3.13+ -except ImportError: - from typing_extensions import deprecated # Python 3.12 - from datafusion._internal import DataFrame as DataFrameInternal from datafusion._internal import DataFrameWriteOptions as DataFrameWriteOptionsInternal from datafusion._internal import InsertOp 
as InsertOpInternal @@ -1298,11 +1293,6 @@ def count(self) -> int: """ return self.df.count() - @deprecated("Use :py:func:`unnest_columns` instead.") - def unnest_column(self, column: str, preserve_nulls: bool = True) -> DataFrame: - """See :py:func:`unnest_columns`.""" - return DataFrame(self.df.unnest_column(column, preserve_nulls=preserve_nulls)) - def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFrame: """Expand columns of arrays into a single row per array element. From e70cd2840261e80f1f06b0a0006b7cef38174f29 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 7 Apr 2026 14:36:49 -0400 Subject: [PATCH 05/13] display_name has been deprecated since DF42 --- python/datafusion/expr.py | 15 --------------- python/tests/test_expr.py | 21 --------------------- 2 files changed, 36 deletions(-) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 35388468c..7cd74ecd5 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -27,11 +27,6 @@ from collections.abc import Iterable, Sequence from typing import TYPE_CHECKING, Any, ClassVar -try: - from warnings import deprecated # Python 3.13+ -except ImportError: - from typing_extensions import deprecated # Python 3.12 - import pyarrow as pa from ._internal import expr as expr_internal @@ -356,16 +351,6 @@ def to_variant(self) -> Any: """Convert this expression into a python object if possible.""" return self.expr.to_variant() - @deprecated( - "display_name() is deprecated. Use :py:meth:`~Expr.schema_name` instead" - ) - def display_name(self) -> str: - """Returns the name of this expression as it should appear in a schema. - - This name will not include any CAST expressions. - """ - return self.schema_name() - def schema_name(self) -> str: """Returns the name of this expression as it should appear in a schema. 
diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index 9a287c1f7..1cf824a15 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -319,27 +319,6 @@ def test_expr_getitem() -> None: assert array_values == [2, 5, None, None] -def test_display_name_deprecation(): - import warnings - - expr = col("foo") - with warnings.catch_warnings(record=True) as w: - # Cause all warnings to always be triggered - warnings.simplefilter("always") - - # should trigger warning - name = expr.display_name() - - # Verify some things - assert len(w) == 1 - assert issubclass(w[-1].category, DeprecationWarning) - assert "deprecated" in str(w[-1].message) - - # returns appropriate result - assert name == expr.schema_name() - assert name == "foo" - - @pytest.fixture def df(): ctx = SessionContext() From e6e6459c8b269e57b218cc83a59545a88c01623f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 7 Apr 2026 14:42:16 -0400 Subject: [PATCH 06/13] window() has been deprecated since DF50 --- crates/core/src/functions.rs | 133 +-------------------------------- python/datafusion/functions.py | 55 +------------- 2 files changed, 4 insertions(+), 184 deletions(-) diff --git a/crates/core/src/functions.rs b/crates/core/src/functions.rs index f173aaa51..7feb62d79 100644 --- a/crates/core/src/functions.rs +++ b/crates/core/src/functions.rs @@ -18,20 +18,14 @@ use std::collections::HashMap; use datafusion::common::{Column, ScalarValue, TableReference}; -use datafusion::execution::FunctionRegistry; -use datafusion::functions_aggregate::all_default_aggregate_functions; -use datafusion::functions_window::all_default_window_functions; -use datafusion::logical_expr::expr::{ - Alias, FieldMetadata, NullTreatment as DFNullTreatment, WindowFunction, WindowFunctionParams, -}; -use datafusion::logical_expr::{Expr, ExprFunctionExt, WindowFrame, WindowFunctionDefinition, lit}; +use datafusion::logical_expr::expr::{Alias, FieldMetadata, NullTreatment as DFNullTreatment}; +use 
datafusion::logical_expr::{Expr, ExprFunctionExt, lit}; use datafusion::{functions, functions_aggregate, functions_window}; use pyo3::prelude::*; use pyo3::wrap_pyfunction; use crate::common::data_type::{NullTreatment, PyScalarValue}; -use crate::context::PySessionContext; -use crate::errors::{PyDataFusionError, PyDataFusionResult}; +use crate::errors::PyDataFusionResult; use crate::expr::PyExpr; use crate::expr::conditional_expr::PyCaseBuilder; use crate::expr::sort_expr::{PySortExpr, to_sort_expressions}; @@ -306,126 +300,6 @@ fn when(when: PyExpr, then: PyExpr) -> PyResult { Ok(PyCaseBuilder::new(None).when(when, then)) } -/// Helper function to find the appropriate window function. -/// -/// Search procedure: -/// 1) Search built in window functions, which are being deprecated. -/// 1) If a session context is provided: -/// 1) search User Defined Aggregate Functions (UDAFs) -/// 1) search registered window functions -/// 1) search registered aggregate functions -/// 1) If no function has been found, search default aggregate functions. -/// -/// NOTE: we search the built-ins first because the `UDAF` versions currently do not have the same behavior. 
-fn find_window_fn( - name: &str, - ctx: Option, -) -> PyDataFusionResult { - if let Some(ctx) = ctx { - // search UDAFs - let udaf = ctx - .ctx - .udaf(name) - .map(WindowFunctionDefinition::AggregateUDF) - .ok(); - - if let Some(udaf) = udaf { - return Ok(udaf); - } - - let session_state = ctx.ctx.state(); - - // search registered window functions - let window_fn = session_state - .window_functions() - .get(name) - .map(|f| WindowFunctionDefinition::WindowUDF(f.clone())); - - if let Some(window_fn) = window_fn { - return Ok(window_fn); - } - - // search registered aggregate functions - let agg_fn = session_state - .aggregate_functions() - .get(name) - .map(|f| WindowFunctionDefinition::AggregateUDF(f.clone())); - - if let Some(agg_fn) = agg_fn { - return Ok(agg_fn); - } - } - - // search default aggregate functions - let agg_fn = all_default_aggregate_functions() - .iter() - .find(|v| v.name() == name || v.aliases().contains(&name.to_string())) - .map(|f| WindowFunctionDefinition::AggregateUDF(f.clone())); - - if let Some(agg_fn) = agg_fn { - return Ok(agg_fn); - } - - // search default window functions - let window_fn = all_default_window_functions() - .iter() - .find(|v| v.name() == name || v.aliases().contains(&name.to_string())) - .map(|f| WindowFunctionDefinition::WindowUDF(f.clone())); - - if let Some(window_fn) = window_fn { - return Ok(window_fn); - } - - Err(PyDataFusionError::Common(format!( - "window function `{name}` not found" - ))) -} - -/// Creates a new Window function expression -#[allow(clippy::too_many_arguments)] -#[pyfunction] -#[pyo3(signature = (name, args, partition_by=None, order_by=None, window_frame=None, filter=None, distinct=false, ctx=None))] -fn window( - name: &str, - args: Vec, - partition_by: Option>, - order_by: Option>, - window_frame: Option, - filter: Option, - distinct: bool, - ctx: Option, -) -> PyResult { - let fun = find_window_fn(name, ctx)?; - - let window_frame = window_frame - .map(|w| w.into()) - 
.unwrap_or(WindowFrame::new(order_by.as_ref().map(|v| !v.is_empty()))); - let filter = filter.map(|f| f.expr.into()); - - Ok(PyExpr { - expr: datafusion::logical_expr::Expr::WindowFunction(Box::new(WindowFunction { - fun, - params: WindowFunctionParams { - args: args.into_iter().map(|x| x.expr).collect::>(), - partition_by: partition_by - .unwrap_or_default() - .into_iter() - .map(|x| x.expr) - .collect::>(), - order_by: order_by - .unwrap_or_default() - .into_iter() - .map(|x| x.into()) - .collect::>(), - window_frame, - filter, - distinct, - null_treatment: None, - }, - })), - }) -} - // Generates a [pyo3] wrapper for associated aggregate functions. // All of the builder options are exposed to the python internal // function and we rely on the wrappers to only use those that @@ -1186,7 +1060,6 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(self::uuid))?; // Use self to avoid name collision m.add_wrapped(wrap_pyfunction!(var_pop))?; m.add_wrapped(wrap_pyfunction!(var_sample))?; - m.add_wrapped(wrap_pyfunction!(window))?; m.add_wrapped(wrap_pyfunction!(regr_avgx))?; m.add_wrapped(wrap_pyfunction!(regr_avgy))?; m.add_wrapped(wrap_pyfunction!(regr_count))?; diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 9dfabb62d..841cd9c0b 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -18,7 +18,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import Any import pyarrow as pa @@ -29,19 +29,11 @@ Expr, SortExpr, SortKey, - WindowFrame, expr_list_to_raw_expr_list, sort_list_to_raw_sort_list, sort_or_default, ) -try: - from warnings import deprecated # Python 3.13+ -except ImportError: - from typing_extensions import deprecated # Python 3.12 - -if TYPE_CHECKING: - from datafusion.context import SessionContext __all__ = [ "abs", "acos", @@ -339,8 +331,6 @@ "var_sample", "version", "when", - # Window Functions - 
"window", ] @@ -664,49 +654,6 @@ def when(when: Expr, then: Expr) -> CaseBuilder: return CaseBuilder(f.when(when.expr, then.expr)) -@deprecated("Prefer to call Expr.over() instead") -def window( - name: str, - args: list[Expr], - partition_by: list[Expr] | Expr | None = None, - order_by: list[SortKey] | SortKey | None = None, - window_frame: WindowFrame | None = None, - filter: Expr | None = None, - distinct: bool = False, - ctx: SessionContext | None = None, -) -> Expr: - """Creates a new Window function expression. - - This interface will soon be deprecated. Instead of using this interface, - users should call the window functions directly. For example, to perform a - lag use:: - - df.select(functions.lag(col("a")).partition_by(col("b")).build()) - - The ``order_by`` parameter accepts column names or expressions, e.g.:: - - window("lag", [col("a")], order_by="ts") - """ - args = [a.expr for a in args] - partition_by_raw = expr_list_to_raw_expr_list(partition_by) - order_by_raw = sort_list_to_raw_sort_list(order_by) - window_frame = window_frame.window_frame if window_frame is not None else None - ctx = ctx.ctx if ctx is not None else None - filter_raw = filter.expr if filter is not None else None - return Expr( - f.window( - name, - args, - partition_by=partition_by_raw, - order_by=order_by_raw, - window_frame=window_frame, - ctx=ctx, - filter=filter_raw, - distinct=distinct, - ) - ) - - # scalar functions def abs(arg: Expr) -> Expr: """Return the absolute value of a given number. 
From 599ef600dc0a5198481db7c66f6ab6e2a8b50f28 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 7 Apr 2026 14:44:36 -0400 Subject: [PATCH 07/13] serde functions have been deprecated since DF42 --- python/datafusion/substrait.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index 3115238fa..6353ef8cc 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -25,11 +25,6 @@ from typing import TYPE_CHECKING -try: - from warnings import deprecated # Python 3.13+ -except ImportError: - from typing_extensions import deprecated # Python 3.12 - from datafusion.plan import LogicalPlan from ._internal import substrait as substrait_internal @@ -88,11 +83,6 @@ def from_json(json: str) -> Plan: return Plan(substrait_internal.Plan.from_json(json)) -@deprecated("Use `Plan` instead.") -class plan(Plan): # noqa: N801 - """See `Plan`.""" - - class Serde: """Provides the ``Substrait`` serialization and deserialization.""" @@ -158,11 +148,6 @@ def deserialize_bytes(proto_bytes: bytes) -> Plan: return Plan(substrait_internal.Serde.deserialize_bytes(proto_bytes)) -@deprecated("Use `Serde` instead.") -class serde(Serde): # noqa: N801 - """See `Serde` instead.""" - - class Producer: """Generates substrait plans from a logical plan.""" @@ -184,11 +169,6 @@ def to_substrait_plan(logical_plan: LogicalPlan, ctx: SessionContext) -> Plan: ) -@deprecated("Use `Producer` instead.") -class producer(Producer): # noqa: N801 - """Use `Producer` instead.""" - - class Consumer: """Generates a logical plan from a substrait plan.""" @@ -206,8 +186,3 @@ def from_substrait_plan(ctx: SessionContext, plan: Plan) -> LogicalPlan: return LogicalPlan( substrait_internal.Consumer.from_substrait_plan(ctx.ctx, plan.plan_internal) ) - - -@deprecated("Use `Consumer` instead.") -class consumer(Consumer): # noqa: N801 - """Use `Consumer` instead.""" From d5283eff57558f65687654ee6072af6fb323f7cd Mon 
Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 7 Apr 2026 14:48:33 -0400 Subject: [PATCH 08/13] from_arrow_table and tables have been deprecated since DF42 --- crates/core/src/context.rs | 15 --------------- python/datafusion/context.py | 16 ---------------- 2 files changed, 31 deletions(-) diff --git a/crates/core/src/context.rs b/crates/core/src/context.rs index 53994d2f5..1f707ef0b 100644 --- a/crates/core/src/context.rs +++ b/crates/core/src/context.rs @@ -1007,21 +1007,6 @@ impl PySessionContext { self.ctx.catalog_names().into_iter().collect() } - pub fn tables(&self) -> HashSet { - self.ctx - .catalog_names() - .into_iter() - .filter_map(|name| self.ctx.catalog(&name)) - .flat_map(move |catalog| { - catalog - .schema_names() - .into_iter() - .filter_map(move |name| catalog.schema(&name)) - }) - .flat_map(|schema| schema.table_names()) - .collect() - } - pub fn table(&self, name: &str, py: Python) -> PyResult { let res = wait_for_future(py, self.ctx.table(name)) .map_err(|e| PyKeyError::new_err(e.to_string()))?; diff --git a/python/datafusion/context.py b/python/datafusion/context.py index c8edc816f..8739f7aed 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -775,14 +775,6 @@ def from_arrow( """ return DataFrame(self.ctx.from_arrow(data, name)) - @deprecated("Use ``from_arrow`` instead.") - def from_arrow_table(self, data: pa.Table, name: str | None = None) -> DataFrame: - """Create a :py:class:`~datafusion.dataframe.DataFrame` from an Arrow table. - - This is an alias for :py:func:`from_arrow`. - """ - return self.from_arrow(data, name) - def from_pandas(self, data: pd.DataFrame, name: str | None = None) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Pandas DataFrame. 
@@ -1117,14 +1109,6 @@ def catalog(self, name: str = "datafusion") -> Catalog: """Retrieve a catalog by name.""" return Catalog(self.ctx.catalog(name)) - @deprecated( - "Use the catalog provider interface ``SessionContext.Catalog`` to " - "examine available catalogs, schemas and tables" - ) - def tables(self) -> set[str]: - """Deprecated.""" - return self.ctx.tables() - def table(self, name: str) -> DataFrame: """Retrieve a previously registered table by name.""" return DataFrame(self.ctx.table(name)) From 2ab0698d961d867ee2fb8b59ee6c5b6610183595 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 7 Apr 2026 14:50:34 -0400 Subject: [PATCH 09/13] RuntimeConfig has been deprecated since DF44 --- python/datafusion/context.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 8739f7aed..cfd14238a 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -425,11 +425,6 @@ def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeEnvBuilder: return self -@deprecated("Use `RuntimeEnvBuilder` instead.") -class RuntimeConfig(RuntimeEnvBuilder): - """See `RuntimeEnvBuilder`.""" - - class SQLOptions: """Options to be used when performing SQL queries.""" From 885502bde18e183b61189ce76535a9cca0708c96 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 7 Apr 2026 14:56:29 -0400 Subject: [PATCH 10/13] Update user documentation to remove deprecated function --- .../user-guide/common-operations/windows.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/source/user-guide/common-operations/windows.rst b/docs/source/user-guide/common-operations/windows.rst index c8fdea8f4..23ea60a38 100644 --- a/docs/source/user-guide/common-operations/windows.rst +++ b/docs/source/user-guide/common-operations/windows.rst @@ -175,10 +175,7 @@ it's ``Type 2`` column that are null. 
Aggregate Functions ------------------- -You can use any :ref:`Aggregation Function` as a window function. Currently -aggregate functions must use the deprecated -:py:func:`datafusion.functions.window` API but this should be resolved in -DataFusion 42.0 (`Issue Link `_). Here +You can use any :ref:`Aggregation Function` as a window function. Here is an example that shows how to compare each pokemons’s attack power with the average attack power in its ``"Type 1"`` using the :py:func:`datafusion.functions.avg` function. @@ -189,10 +186,14 @@ power in its ``"Type 1"`` using the :py:func:`datafusion.functions.avg` function col('"Name"'), col('"Attack"'), col('"Type 1"'), - f.window("avg", [col('"Attack"')]) - .partition_by(col('"Type 1"')) - .build() - .alias("Average Attack"), + f.avg(col('"Attack"')).over( + Window( + window_frame=WindowFrame("rows", None, None), + partition_by=[col('"Type 1"')], + order_by=[col('"Speed"')], + null_treatment=NullTreatment.IGNORE_NULLS, + ) + ).alias("Average Attack"), ) Available Functions From d8d64419ed87cb82e85a96e07a29838ba44ff074 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 7 Apr 2026 18:33:34 -0400 Subject: [PATCH 11/13] update tpch examples for latest function uses --- examples/tpch/q02_minimum_cost_supplier.py | 8 +++----- examples/tpch/q11_important_stock_identification.py | 3 ++- examples/tpch/q15_top_supplier.py | 4 +++- examples/tpch/q17_small_quantity_order.py | 8 +++----- examples/tpch/q22_global_sales_opportunity.py | 4 +++- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/examples/tpch/q02_minimum_cost_supplier.py b/examples/tpch/q02_minimum_cost_supplier.py index 7390d0892..47961d2ef 100644 --- a/examples/tpch/q02_minimum_cost_supplier.py +++ b/examples/tpch/q02_minimum_cost_supplier.py @@ -32,6 +32,7 @@ import datafusion from datafusion import SessionContext, col, lit from datafusion import functions as F +from datafusion.expr import Window from util import get_data_path # This is the part 
we're looking for. Values selected here differ from the spec in order to run @@ -106,11 +107,8 @@ window_frame = datafusion.WindowFrame("rows", None, None) df = df.with_column( "min_cost", - F.window( - "min", - [col("ps_supplycost")], - partition_by=[col("ps_partkey")], - window_frame=window_frame, + F.min(col("ps_supplycost")).over( + Window(partition_by=[col("ps_partkey")], window_frame=window_frame) ), ) diff --git a/examples/tpch/q11_important_stock_identification.py b/examples/tpch/q11_important_stock_identification.py index 22829ab7c..de309fa64 100644 --- a/examples/tpch/q11_important_stock_identification.py +++ b/examples/tpch/q11_important_stock_identification.py @@ -29,6 +29,7 @@ from datafusion import SessionContext, WindowFrame, col, lit from datafusion import functions as F +from datafusion.expr import Window from util import get_data_path NATION = "GERMANY" @@ -71,7 +72,7 @@ window_frame = WindowFrame("rows", None, None) df = df.with_column( - "total_value", F.window("sum", [col("value")], window_frame=window_frame) + "total_value", F.sum(col("value")).over(Window(window_frame=window_frame)) ) # Limit to the parts for which there is a significant value based on the fraction of the total diff --git a/examples/tpch/q15_top_supplier.py b/examples/tpch/q15_top_supplier.py index c321048f2..5128937a7 100644 --- a/examples/tpch/q15_top_supplier.py +++ b/examples/tpch/q15_top_supplier.py @@ -31,6 +31,7 @@ import pyarrow as pa from datafusion import SessionContext, WindowFrame, col, lit from datafusion import functions as F +from datafusion.expr import Window from util import get_data_path DATE = "1996-01-01" @@ -70,7 +71,8 @@ # Use a window function to find the maximum revenue across the entire dataframe window_frame = WindowFrame("rows", None, None) df = df.with_column( - "max_revenue", F.window("max", [col("total_revenue")], window_frame=window_frame) + "max_revenue", + F.max(col("total_revenue")).over(Window(window_frame=window_frame)), ) # Find all 
suppliers whose total revenue is the same as the maximum diff --git a/examples/tpch/q17_small_quantity_order.py b/examples/tpch/q17_small_quantity_order.py index 6d76fe506..5ccb38422 100644 --- a/examples/tpch/q17_small_quantity_order.py +++ b/examples/tpch/q17_small_quantity_order.py @@ -30,6 +30,7 @@ from datafusion import SessionContext, WindowFrame, col, lit from datafusion import functions as F +from datafusion.expr import Window from util import get_data_path BRAND = "Brand#23" @@ -58,11 +59,8 @@ window_frame = WindowFrame("rows", None, None) df = df.with_column( "avg_quantity", - F.window( - "avg", - [col("l_quantity")], - window_frame=window_frame, - partition_by=[col("l_partkey")], + F.avg(col("l_quantity")).over( + Window(partition_by=[col("l_partkey")], window_frame=window_frame) ), ) diff --git a/examples/tpch/q22_global_sales_opportunity.py b/examples/tpch/q22_global_sales_opportunity.py index c4d115b74..a2d41b215 100644 --- a/examples/tpch/q22_global_sales_opportunity.py +++ b/examples/tpch/q22_global_sales_opportunity.py @@ -28,6 +28,7 @@ from datafusion import SessionContext, WindowFrame, col, lit from datafusion import functions as F +from datafusion.expr import Window from util import get_data_path NATION_CODES = [13, 31, 23, 29, 30, 18, 17] @@ -55,7 +56,8 @@ # current row. We want our frame to cover the entire data frame. 
window_frame = WindowFrame("rows", None, None) df = df.with_column( - "avg_balance", F.window("avg", [col("c_acctbal")], window_frame=window_frame) + "avg_balance", + F.avg(col("c_acctbal")).over(Window(window_frame=window_frame)), ) df.show() From a185cfba06faaabcfb43e8a03909e8f851200f0d Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 8 Apr 2026 17:30:52 -0400 Subject: [PATCH 12/13] Remove unnecessary options in example --- docs/source/user-guide/common-operations/windows.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/user-guide/common-operations/windows.rst b/docs/source/user-guide/common-operations/windows.rst index 23ea60a38..d77881bcf 100644 --- a/docs/source/user-guide/common-operations/windows.rst +++ b/docs/source/user-guide/common-operations/windows.rst @@ -190,8 +190,6 @@ power in its ``"Type 1"`` using the :py:func:`datafusion.functions.avg` function Window( window_frame=WindowFrame("rows", None, None), partition_by=[col('"Type 1"')], - order_by=[col('"Speed"')], - null_treatment=NullTreatment.IGNORE_NULLS, ) ).alias("Average Attack"), ) From 8f7bd79f1887e42a5ed764242459b1d0b874f285 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 8 Apr 2026 17:31:16 -0400 Subject: [PATCH 13/13] update rendering for the most recent dataframe_formatter instead of the deprecated html_formatter --- .../source/user-guide/dataframe/rendering.rst | 225 ++++++++++-------- 1 file changed, 121 insertions(+), 104 deletions(-) diff --git a/docs/source/user-guide/dataframe/rendering.rst b/docs/source/user-guide/dataframe/rendering.rst index 9dea948bb..dc61a422f 100644 --- a/docs/source/user-guide/dataframe/rendering.rst +++ b/docs/source/user-guide/dataframe/rendering.rst @@ -15,18 +15,18 @@ .. specific language governing permissions and limitations .. under the License. 
-HTML Rendering in Jupyter -========================= +DataFrame Rendering +=================== -When working in Jupyter notebooks or other environments that support rich HTML display, -DataFusion DataFrames automatically render as nicely formatted HTML tables. This functionality -is provided by the ``_repr_html_`` method, which is automatically called by Jupyter to provide -a richer visualization than plain text output. +DataFusion provides configurable rendering for DataFrames in both plain text and HTML +formats. The ``datafusion.dataframe_formatter`` module controls how DataFrames are +displayed in Jupyter notebooks (via ``_repr_html_``), in the terminal (via ``__repr__``), +and anywhere else a string or HTML representation is needed. -Basic HTML Rendering --------------------- +Basic Rendering +--------------- -In a Jupyter environment, simply displaying a DataFrame object will trigger HTML rendering: +In a Jupyter environment, displaying a DataFrame triggers HTML rendering: .. code-block:: python @@ -36,74 +36,117 @@ In a Jupyter environment, simply displaying a DataFrame object will trigger HTML # Explicit display also uses HTML rendering display(df) -Customizing HTML Rendering ---------------------------- +In a terminal or when converting to string, plain text rendering is used: + +.. code-block:: python -DataFusion provides extensive customization options for HTML table rendering through the -``datafusion.html_formatter`` module. + # Plain text table output + print(df) -Configuring the HTML Formatter -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Configuring the Formatter +------------------------- -You can customize how DataFrames are rendered by configuring the formatter: +You can customize how DataFrames are rendered by configuring the global formatter: .. 
code-block:: python - from datafusion.html_formatter import configure_formatter - - # Change the default styling + from datafusion.dataframe_formatter import configure_formatter + configure_formatter( - max_cell_length=25, # Maximum characters in a cell before truncation - max_width=1000, # Maximum width in pixels - max_height=300, # Maximum height in pixels - max_memory_bytes=2097152, # Maximum memory for rendering (2MB) - min_rows=10, # Minimum number of rows to display - max_rows=10, # Maximum rows to display in __repr__ - enable_cell_expansion=True,# Allow expanding truncated cells - custom_css=None, # Additional custom CSS + max_cell_length=25, # Maximum characters in a cell before truncation + max_width=1000, # Maximum width in pixels (HTML only) + max_height=300, # Maximum height in pixels (HTML only) + max_memory_bytes=2097152, # Maximum memory for rendering (2MB) + min_rows=10, # Minimum number of rows to display + max_rows=10, # Maximum rows to display + enable_cell_expansion=True, # Allow expanding truncated cells (HTML only) + custom_css=None, # Additional custom CSS (HTML only) show_truncation_message=True, # Show message when data is truncated - style_provider=None, # Custom styling provider - use_shared_styles=True # Share styles across tables + style_provider=None, # Custom styling provider (HTML only) + use_shared_styles=True, # Share styles across tables (HTML only) ) The formatter settings affect all DataFrames displayed after configuration. Custom Style Providers ------------------------ +---------------------- -For advanced styling needs, you can create a custom style provider: +For HTML styling, you can create a custom style provider that implements the +``StyleProvider`` protocol: .. 
code-block:: python - from datafusion.html_formatter import StyleProvider, configure_formatter - - class MyStyleProvider(StyleProvider): - def get_table_styles(self): - return { - "table": "border-collapse: collapse; width: 100%;", - "th": "background-color: #007bff; color: white; padding: 8px; text-align: left;", - "td": "border: 1px solid #ddd; padding: 8px;", - "tr:nth-child(even)": "background-color: #f2f2f2;", - } - - def get_value_styles(self, dtype, value): - """Return custom styles for specific values""" - if dtype == "float" and value < 0: - return "color: red;" - return None - + from datafusion.dataframe_formatter import configure_formatter + + class MyStyleProvider: + def get_cell_style(self): + """Return CSS style string for table data cells.""" + return "border: 1px solid #ddd; padding: 8px; text-align: left;" + + def get_header_style(self): + """Return CSS style string for table header cells.""" + return ( + "background-color: #007bff; color: white; " + "padding: 8px; text-align: left;" + ) + # Apply the custom style provider configure_formatter(style_provider=MyStyleProvider()) +Custom Cell Formatters +---------------------- + +You can register custom formatters for specific Python types. A cell formatter is any +callable that takes a value and returns a string: + +.. code-block:: python + + from datafusion.dataframe_formatter import get_formatter + + formatter = get_formatter() + + # Format floats to 2 decimal places + formatter.register_formatter(float, lambda v: f"{v:.2f}") + + # Format dates in a custom way + from datetime import date + formatter.register_formatter(date, lambda v: v.strftime("%B %d, %Y")) + +Custom Cell and Header Builders +------------------------------- + +For full control over the HTML of individual cells or headers, you can set custom +builder functions: + +.. 
code-block:: python + + from datafusion.dataframe_formatter import get_formatter + + formatter = get_formatter() + + # Custom cell builder receives (value, row, col, table_id) and returns HTML + def my_cell_builder(value, row, col, table_id): + color = "red" if isinstance(value, (int, float)) and value < 0 else "black" + return f"<td style='color: {color}'>{value}</td>" + + formatter.set_custom_cell_builder(my_cell_builder) + + # Custom header builder receives a schema field and returns HTML + def my_header_builder(field): + return f"<th>{field.name}</th>" + + formatter.set_custom_header_builder(my_header_builder) + Performance Optimization with Shared Styles -------------------------------------------- -The ``use_shared_styles`` parameter (enabled by default) optimizes performance when displaying -multiple DataFrames in notebook environments: +The ``use_shared_styles`` parameter (enabled by default) optimizes performance when +displaying multiple DataFrames in notebook environments: .. code-block:: python - from datafusion.html_formatter import StyleProvider, configure_formatter + from datafusion.dataframe_formatter import configure_formatter + # Default: Use shared styles (recommended for notebooks) configure_formatter(use_shared_styles=True) @@ -111,76 +154,48 @@ multiple DataFrames in notebook environments: configure_formatter(use_shared_styles=False) When ``use_shared_styles=True``: + - CSS styles and JavaScript are included only once per notebook session - This reduces HTML output size and prevents style duplication - Improves rendering performance with many DataFrames - Applies consistent styling across all DataFrames -Creating a Custom Formatter ----------------------------- +Working with the Formatter Directly +------------------------------------ -For complete control over rendering, you can implement a custom formatter: +You can use ``get_formatter()`` and ``set_formatter()`` for direct access to the global +formatter instance: ..
code-block:: python - from datafusion.html_formatter import Formatter, get_formatter - - class MyFormatter(Formatter): - def format_html(self, batches, schema, has_more=False, table_uuid=None): - # Create your custom HTML here - html = "<table>" - # ... formatting logic ... - html += "</table>" - return html - - # Set as the global formatter - configure_formatter(formatter_class=MyFormatter) - - # Or use the formatter just for specific operations + from datafusion.dataframe_formatter import ( + DataFrameHtmlFormatter, + get_formatter, + set_formatter, + ) + + # Get and modify the current formatter formatter = get_formatter() - custom_html = formatter.format_html(batches, schema) + print(formatter.max_rows) + print(formatter.max_cell_length) -Managing Formatters -------------------- + # Create and set a fully custom formatter + custom_formatter = DataFrameHtmlFormatter( + max_cell_length=50, + max_rows=20, + enable_cell_expansion=False, + ) + set_formatter(custom_formatter) Reset to default formatting: .. code-block:: python - from datafusion.html_formatter import reset_formatter - + from datafusion.dataframe_formatter import reset_formatter + # Reset to default settings reset_formatter() -Get the current formatter settings: - -.. code-block:: python - - from datafusion.html_formatter import get_formatter - - formatter = get_formatter() - print(formatter.max_rows) - print(formatter.theme) - -Contextual Formatting ----------------------- - -You can also use a context manager to temporarily change formatting settings: - -.. code-block:: python - - from datafusion.html_formatter import formatting_context - - # Default formatting - df.show() - - # Temporarily use different formatting - with formatting_context(max_rows=100, theme="dark"): - df.show() # Will use the temporary settings - - # Back to default formatting - df.show() - Memory and Display Controls --------------------------- @@ -188,10 +203,12 @@ You can control how much data is displayed and how much memory is used for rende ..
code-block:: python + from datafusion.dataframe_formatter import configure_formatter + configure_formatter( max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display min_rows=20, # Always show at least 20 rows - max_rows=50 # Show up to 50 rows in output + max_rows=50, # Show up to 50 rows in output ) These parameters help balance comprehensive data display against performance considerations. @@ -216,7 +233,7 @@ Additional Resources * :doc:`../io/index` - I/O Guide for reading data from various sources * :doc:`../data-sources` - Comprehensive data sources guide * :ref:`io_csv` - CSV file reading -* :ref:`io_parquet` - Parquet file reading +* :ref:`io_parquet` - Parquet file reading * :ref:`io_json` - JSON file reading * :ref:`io_avro` - Avro file reading * :ref:`io_custom_table_provider` - Custom table providers