From 4d749c72a7cf914f5172168e1b7ef6c046af2646 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 29 Mar 2026 19:18:51 -0400 Subject: [PATCH 1/5] Add missing conditional functions: greatest, least, nvl2, ifnull (#1449) Expose four conditional functions from upstream DataFusion that were not yet available in the Python bindings. Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/core/src/functions.rs | 10 +++++ python/datafusion/functions.py | 79 ++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/crates/core/src/functions.rs b/crates/core/src/functions.rs index 6996dca94..f5b115012 100644 --- a/crates/core/src/functions.rs +++ b/crates/core/src/functions.rs @@ -494,6 +494,8 @@ expr_fn!(length, string); expr_fn!(char_length, string); expr_fn!(chr, arg, "Returns the character with the given code."); expr_fn_vec!(coalesce); +expr_fn_vec!(greatest); +expr_fn_vec!(least); expr_fn!(cos, num); expr_fn!(cosh, num); expr_fn!(cot, num); @@ -543,6 +545,11 @@ expr_fn!( x y, "Returns x if x is not NULL otherwise returns y." ); +expr_fn!( + nvl2, + x y z, + "Returns y if x is not NULL; otherwise returns z." +); expr_fn!(nullif, arg_1 arg_2); expr_fn!( octet_length, @@ -983,6 +990,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(floor))?; m.add_wrapped(wrap_pyfunction!(from_unixtime))?; m.add_wrapped(wrap_pyfunction!(gcd))?; + m.add_wrapped(wrap_pyfunction!(greatest))?; // m.add_wrapped(wrap_pyfunction!(grouping))?; m.add_wrapped(wrap_pyfunction!(in_list))?; m.add_wrapped(wrap_pyfunction!(initcap))?; @@ -990,6 +998,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(iszero))?; m.add_wrapped(wrap_pyfunction!(levenshtein))?; m.add_wrapped(wrap_pyfunction!(lcm))?; + m.add_wrapped(wrap_pyfunction!(least))?; m.add_wrapped(wrap_pyfunction!(left))?; m.add_wrapped(wrap_pyfunction!(length))?; m.add_wrapped(wrap_pyfunction!(ln))?; @@ -1007,6 +1016,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(named_struct))?; m.add_wrapped(wrap_pyfunction!(nanvl))?; m.add_wrapped(wrap_pyfunction!(nvl))?; + m.add_wrapped(wrap_pyfunction!(nvl2))?; m.add_wrapped(wrap_pyfunction!(now))?; m.add_wrapped(wrap_pyfunction!(nullif))?; m.add_wrapped(wrap_pyfunction!(octet_length))?; diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 3c8d2bcee..5614520e4 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -151,6 +151,8 @@ "floor", "from_unixtime", "gcd", + "greatest", + "ifnull", "in_list", "initcap", "isnan", @@ -159,6 +161,7 @@ "last_value", "lcm", "lead", + "least", "left", "length", "levenshtein", @@ -215,6 +218,7 @@ "ntile", "nullif", "nvl", + "nvl2", "octet_length", "order_by", "overlay", @@ -1030,6 +1034,44 @@ def gcd(x: Expr, y: Expr) -> Expr: return Expr(f.gcd(x.expr, y.expr)) +def greatest(*args: Expr) -> Expr: + """Returns the greatest value from a list of expressions. + + Returns NULL if all expressions are NULL. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 3], "b": [2, 1]}) + >>> result = df.select( + ... dfn.functions.greatest(dfn.col("a"), dfn.col("b")).alias("greatest")) + >>> result.collect_column("greatest")[0].as_py() + 2 + >>> result.collect_column("greatest")[1].as_py() + 3 + """ + args = [arg.expr for arg in args] + return Expr(f.greatest(*args)) + + +def ifnull(x: Expr, y: Expr) -> Expr: + """Returns ``x`` if ``x`` is not NULL. Otherwise returns ``y``. + + This is an alias for :py:func:`nvl`. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [None, 1], "b": [0, 0]}) + >>> result = df.select( + ... dfn.functions.ifnull(dfn.col("a"), dfn.col("b")).alias("ifnull") + ... ) + >>> result.collect_column("ifnull")[0].as_py() + 0 + >>> result.collect_column("ifnull")[1].as_py() + 1 + """ + return nvl(x, y) + + def initcap(string: Expr) -> Expr: """Set the initial letter of each word to capital. @@ -1083,6 +1125,25 @@ def lcm(x: Expr, y: Expr) -> Expr: return Expr(f.lcm(x.expr, y.expr)) +def least(*args: Expr) -> Expr: + """Returns the least value from a list of expressions. + + Returns NULL if all expressions are NULL. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 3], "b": [2, 1]}) + >>> result = df.select( + ... dfn.functions.least(dfn.col("a"), dfn.col("b")).alias("least")) + >>> result.collect_column("least")[0].as_py() + 1 + >>> result.collect_column("least")[1].as_py() + 1 + """ + args = [arg.expr for arg in args] + return Expr(f.least(*args)) + + def left(string: Expr, n: Expr) -> Expr: """Returns the first ``n`` characters in the ``string``. @@ -1267,6 +1328,24 @@ def nvl(x: Expr, y: Expr) -> Expr: return Expr(f.nvl(x.expr, y.expr)) +def nvl2(x: Expr, y: Expr, z: Expr) -> Expr: + """Returns ``y`` if ``x`` is not NULL. Otherwise returns ``z``. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [None, 1], "b": [10, 20], "c": [30, 40]}) + >>> result = df.select( + ... dfn.functions.nvl2( + ... dfn.col("a"), dfn.col("b"), dfn.col("c")).alias("nvl2") + ... ) + >>> result.collect_column("nvl2")[0].as_py() + 30 + >>> result.collect_column("nvl2")[1].as_py() + 20 + """ + return Expr(f.nvl2(x.expr, y.expr, z.expr)) + + def octet_length(arg: Expr) -> Expr: """Returns the number of bytes of a string. From b8daf93ecec6b2320b7c5611e4e16619614a93b1 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 29 Mar 2026 19:23:08 -0400 Subject: [PATCH 2/5] Add unit tests for greatest, least, nvl2, and ifnull functions Tests cover multiple data types (integers, strings), null handling (all-null, partial-null), multiple arguments, and ifnull/nvl equivalence. Co-Authored-By: Claude Opus 4.6 (1M context) --- python/tests/test_functions.py | 162 +++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 08420826d..60a768d47 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -1468,3 +1468,165 @@ def test_coalesce(df): assert result.column(0) == pa.array( ["Hello", "fallback", "!"], type=pa.string_view() ) + + +def test_greatest(df): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays( + [ + pa.array([1, 5, None]), + pa.array([3, 2, None]), + pa.array([2, 8, None]), + ], + names=["a", "b", "c"], + ) + df_test = ctx.create_dataframe([[batch]]) + + # Test greatest with two columns + result = df_test.select( + f.greatest(column("a"), column("b")).alias("greatest_ab") + ).collect()[0] + assert result.column(0) == pa.array([3, 5, None], type=pa.int64()) + + # Test greatest with three columns + result = df_test.select( + f.greatest(column("a"), column("b"), column("c")).alias("greatest_abc") + ).collect()[0] + assert result.column(0) == pa.array([3, 8, None], type=pa.int64()) + + # Test greatest with nulls mixed in (partial nulls) + batch2 = pa.RecordBatch.from_arrays( + [ + pa.array([None, 10]), + pa.array([5, None]), + ], + names=["x", "y"], + ) + df_test2 = ctx.create_dataframe([[batch2]]) + result = df_test2.select(f.greatest(column("x"), column("y")).alias("g")).collect()[ + 0 + ] + assert result.column(0) == pa.array([5, 10], type=pa.int64()) + + # Test greatest with string columns + batch3 = pa.RecordBatch.from_arrays( + [ + pa.array(["apple", "cherry"]), + pa.array(["banana", "apricot"]), + ], + names=["s1", "s2"], + ) + df_test3 = ctx.create_dataframe([[batch3]]) + result = df_test3.select( + f.greatest(column("s1"), column("s2")).alias("g") + ).collect()[0] + assert result.column(0).to_pylist() == ["banana", "cherry"] + + +def test_least(df): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays( + [ + pa.array([1, 5, None]), + pa.array([3, 2, None]), + pa.array([2, 8, None]), + ], + names=["a", "b", "c"], + ) + df_test = ctx.create_dataframe([[batch]]) + + # Test least with two columns + result = df_test.select( + f.least(column("a"), column("b")).alias("least_ab") + ).collect()[0] + assert result.column(0) == pa.array([1, 2, None], type=pa.int64()) + + # Test least with three columns + result = df_test.select( + f.least(column("a"), column("b"), column("c")).alias("least_abc") + ).collect()[0] + assert result.column(0) == pa.array([1, 2, None], type=pa.int64()) + + # Test least with partial nulls + batch2 = pa.RecordBatch.from_arrays( + [ + pa.array([None, 10]), + pa.array([5, None]), + ], + names=["x", "y"], + ) + df_test2 = ctx.create_dataframe([[batch2]]) + result = df_test2.select(f.least(column("x"), column("y")).alias("l")).collect()[0] + assert result.column(0) == pa.array([5, 10], type=pa.int64()) + + # Test least with string columns + batch3 = pa.RecordBatch.from_arrays( + [ + pa.array(["apple", "cherry"]), + pa.array(["banana", "apricot"]), + ], + names=["s1", "s2"], + ) + df_test3 = ctx.create_dataframe([[batch3]]) + result = df_test3.select(f.least(column("s1"), column("s2")).alias("l")).collect()[ + 0 + ] + assert result.column(0).to_pylist() == ["apple", "apricot"] + + +def test_nvl2(df): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays( + [ + pa.array([None, 1, None, 4]), + pa.array([10, 20, 30, 40]), + pa.array([100, 200, 300, 400]), + ], + names=["a", "b", "c"], + ) + df_test = ctx.create_dataframe([[batch]]) + + # nvl2 returns b when a is not null, c when a is null + result = df_test.select( + f.nvl2(column("a"), column("b"), column("c")).alias("result") + ).collect()[0] + assert result.column(0) == pa.array([100, 20, 300, 40], type=pa.int64()) + + # Test with string columns + batch2 = pa.RecordBatch.from_arrays( + [ + pa.array(["x", None]), + pa.array(["not_null", "not_null"]), + pa.array(["is_null", "is_null"]), + ], + names=["a", "b", "c"], + ) + df_test2 = ctx.create_dataframe([[batch2]]) + result = df_test2.select( + f.nvl2(column("a"), column("b"), column("c")).alias("result") + ).collect()[0] + assert result.column(0).to_pylist() == ["not_null", "is_null"] + + +def test_ifnull(df): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays( + [ + pa.array([None, 1, None, 4]), + pa.array([10, 20, 30, 40]), + ], + names=["a", "b"], + ) + df_test = ctx.create_dataframe([[batch]]) + + # ifnull returns a when a is not null, b when a is null (same as nvl) + result = df_test.select( + f.ifnull(column("a"), column("b")).alias("result") + ).collect()[0] + assert result.column(0) == pa.array([10, 1, 30, 4], type=pa.int64()) + + # Verify ifnull matches nvl behavior + result_nvl = df_test.select( + f.nvl(column("a"), column("b")).alias("nvl_result") + ).collect()[0] + assert result.column(0) == result_nvl.column(0) From 2d5615c8c8e4163d806698aea4a745f7ac49706d Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 29 Mar 2026 19:30:06 -0400 Subject: [PATCH 3/5] Use standard alias docstring pattern for ifnull Co-Authored-By: Claude Opus 4.6 (1M context) --- python/datafusion/functions.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 5614520e4..a1595c441 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1056,18 +1056,8 @@ def greatest(*args: Expr) -> Expr: def ifnull(x: Expr, y: Expr) -> Expr: """Returns ``x`` if ``x`` is not NULL. Otherwise returns ``y``. - This is an alias for :py:func:`nvl`. - - Examples: - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [None, 1], "b": [0, 0]}) - >>> result = df.select( - ... dfn.functions.ifnull(dfn.col("a"), dfn.col("b")).alias("ifnull") - ... ) - >>> result.collect_column("ifnull")[0].as_py() - 0 - >>> result.collect_column("ifnull")[1].as_py() - 1 + See Also: + This is an alias for :py:func:`nvl`. """ return nvl(x, y) From e299daff6eae8edadf43429df927d077ed7b6846 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 3 Apr 2026 13:10:12 -0400 Subject: [PATCH 4/5] remove unused df fixture and fix parameter shadowing --- python/datafusion/functions.py | 8 ++++---- python/tests/test_functions.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index a1595c441..ac276cf22 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1049,8 +1049,8 @@ def greatest(*args: Expr) -> Expr: >>> result.collect_column("greatest")[1].as_py() 3 """ - args = [arg.expr for arg in args] - return Expr(f.greatest(*args)) + exprs = [arg.expr for arg in args] + return Expr(f.greatest(*exprs)) def ifnull(x: Expr, y: Expr) -> Expr: @@ -1130,8 +1130,8 @@ def least(*args: Expr) -> Expr: >>> result.collect_column("least")[1].as_py() 1 """ - args = [arg.expr for arg in args] - return Expr(f.least(*args)) + exprs = [arg.expr for arg in args] + return Expr(f.least(*exprs)) def left(string: Expr, n: Expr) -> Expr: diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 60a768d47..f34495463 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -1470,7 +1470,7 @@ def test_coalesce(df): ) -def test_greatest(df): +def test_greatest(): ctx = SessionContext() batch = pa.RecordBatch.from_arrays( [ @@ -1523,7 +1523,7 @@ def test_greatest(df): assert result.column(0).to_pylist() == ["banana", "cherry"] -def test_least(df): +def test_least(): ctx = SessionContext() batch = pa.RecordBatch.from_arrays( [ @@ -1574,7 +1574,7 @@ def test_least(df): assert result.column(0).to_pylist() == ["apple", "apricot"] -def test_nvl2(df): +def test_nvl2(): ctx = SessionContext() batch = pa.RecordBatch.from_arrays( [ @@ -1608,7 +1608,7 @@ def test_nvl2(df): assert result.column(0).to_pylist() == ["not_null", "is_null"] -def test_ifnull(df): +def test_ifnull(): ctx = SessionContext() batch = pa.RecordBatch.from_arrays( [ From 835f91fe94e422f8b88651b9a79952895ef9e0f0 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 3 Apr 2026 13:46:24 -0400 Subject: [PATCH 5/5] Refactor conditional function tests into parametrized test suite Replace separate test functions for coalesce, greatest, least, nvl, nvl2, ifnull with a single parametrized test using a shared fixture. Adds coverage for nvl, nullif (previously untested), datetime and boolean types, literal fallbacks, and variadic calls. Co-Authored-By: Claude Opus 4.6 (1M context) --- python/tests/test_functions.py | 451 ++++++++++++++++++--------------- 1 file changed, 240 insertions(+), 211 deletions(-) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index f34495463..fd05b0e86 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -1409,224 +1409,253 @@ def test_alias_with_metadata(df): assert df.schema().field("b").metadata == {b"key": b"value"} -def test_coalesce(df): - # Create a DataFrame with null values +@pytest.fixture +def df_with_nulls(): ctx = SessionContext() + # Rows: + # 0: both values present + # 1: a/d/h/k null, b/e/i/l present + # 2: a/d/h/k present, b/e/i/l null + # 3: all null batch = pa.RecordBatch.from_arrays( [ - pa.array(["Hello", None, "!"]), # string column with null - pa.array([4, None, 6]), # integer column with null - pa.array(["hello ", None, " !"]), # string column with null + pa.array([1, None, 3, None], type=pa.int64()), + pa.array([5, 10, None, None], type=pa.int64()), + pa.array([20, 30, 40, None], type=pa.int64()), + pa.array(["apple", None, "cherry", None], type=pa.utf8()), + pa.array(["banana", "date", None, None], type=pa.utf8()), + pa.array(["x", "y", "z", None], type=pa.utf8()), pa.array( [ - datetime(2022, 12, 31, tzinfo=DEFAULT_TZ), + datetime(2020, 1, 1, tzinfo=DEFAULT_TZ), None, - datetime(2020, 7, 2, tzinfo=DEFAULT_TZ), - ] - ), # datetime with null - pa.array([False, None, True]), # boolean column with null - ], - names=["a", "b", "c", "d", "e"], - ) - df_with_nulls = ctx.create_dataframe([[batch]]) - - # Test coalesce with different data types - result_df = df_with_nulls.select( - f.coalesce(column("a"), literal("default")).alias("a_coalesced"), - f.coalesce(column("b"), literal(0)).alias("b_coalesced"), - f.coalesce(column("c"), literal("default")).alias("c_coalesced"), - f.coalesce(column("d"), literal(datetime(2000, 1, 1, tzinfo=DEFAULT_TZ))).alias( - "d_coalesced" - ), - f.coalesce(column("e"), literal(value=False)).alias("e_coalesced"), - ) - - result = result_df.collect()[0] - - # Verify results - assert result.column(0) == pa.array( - ["Hello", "default", "!"], type=pa.string_view() - ) - assert result.column(1) == pa.array([4, 0, 6], type=pa.int64()) - assert result.column(2) == pa.array( - ["hello ", "default", " !"], type=pa.string_view() - ) - assert result.column(3).to_pylist() == [ - datetime(2022, 12, 31, tzinfo=DEFAULT_TZ), - datetime(2000, 1, 1, tzinfo=DEFAULT_TZ), - datetime(2020, 7, 2, tzinfo=DEFAULT_TZ), - ] - assert result.column(4) == pa.array([False, False, True], type=pa.bool_()) - - # Test multiple arguments - result_df = df_with_nulls.select( - f.coalesce(column("a"), literal(None), literal("fallback")).alias( - "multi_coalesce" - ) - ) - result = result_df.collect()[0] - assert result.column(0) == pa.array( - ["Hello", "fallback", "!"], type=pa.string_view() - ) - - -def test_greatest(): - ctx = SessionContext() - batch = pa.RecordBatch.from_arrays( - [ - pa.array([1, 5, None]), - pa.array([3, 2, None]), - pa.array([2, 8, None]), - ], - names=["a", "b", "c"], - ) - df_test = ctx.create_dataframe([[batch]]) - - # Test greatest with two columns - result = df_test.select( - f.greatest(column("a"), column("b")).alias("greatest_ab") - ).collect()[0] - assert result.column(0) == pa.array([3, 5, None], type=pa.int64()) - - # Test greatest with three columns - result = df_test.select( - f.greatest(column("a"), column("b"), column("c")).alias("greatest_abc") - ).collect()[0] - assert result.column(0) == pa.array([3, 8, None], type=pa.int64()) - - # Test greatest with nulls mixed in (partial nulls) - batch2 = pa.RecordBatch.from_arrays( - [ - pa.array([None, 10]), - pa.array([5, None]), - ], - names=["x", "y"], - ) - df_test2 = ctx.create_dataframe([[batch2]]) - result = df_test2.select(f.greatest(column("x"), column("y")).alias("g")).collect()[ - 0 - ] - assert result.column(0) == pa.array([5, 10], type=pa.int64()) - - # Test greatest with string columns - batch3 = pa.RecordBatch.from_arrays( - [ - pa.array(["apple", "cherry"]), - pa.array(["banana", "apricot"]), - ], - names=["s1", "s2"], - ) - df_test3 = ctx.create_dataframe([[batch3]]) - result = df_test3.select( - f.greatest(column("s1"), column("s2")).alias("g") - ).collect()[0] - assert result.column(0).to_pylist() == ["banana", "cherry"] - - -def test_least(): - ctx = SessionContext() - batch = pa.RecordBatch.from_arrays( - [ - pa.array([1, 5, None]), - pa.array([3, 2, None]), - pa.array([2, 8, None]), - ], - names=["a", "b", "c"], - ) - df_test = ctx.create_dataframe([[batch]]) - - # Test least with two columns - result = df_test.select( - f.least(column("a"), column("b")).alias("least_ab") - ).collect()[0] - assert result.column(0) == pa.array([1, 2, None], type=pa.int64()) - - # Test least with three columns - result = df_test.select( - f.least(column("a"), column("b"), column("c")).alias("least_abc") - ).collect()[0] - assert result.column(0) == pa.array([1, 2, None], type=pa.int64()) - - # Test least with partial nulls - batch2 = pa.RecordBatch.from_arrays( - [ - pa.array([None, 10]), - pa.array([5, None]), - ], - names=["x", "y"], - ) - df_test2 = ctx.create_dataframe([[batch2]]) - result = df_test2.select(f.least(column("x"), column("y")).alias("l")).collect()[0] - assert result.column(0) == pa.array([5, 10], type=pa.int64()) - - # Test least with string columns - batch3 = pa.RecordBatch.from_arrays( - [ - pa.array(["apple", "cherry"]), - pa.array(["banana", "apricot"]), - ], - names=["s1", "s2"], - ) - df_test3 = ctx.create_dataframe([[batch3]]) - result = df_test3.select(f.least(column("s1"), column("s2")).alias("l")).collect()[ - 0 - ] - assert result.column(0).to_pylist() == ["apple", "apricot"] - - -def test_nvl2(): - ctx = SessionContext() - batch = pa.RecordBatch.from_arrays( - [ - pa.array([None, 1, None, 4]), - pa.array([10, 20, 30, 40]), - pa.array([100, 200, 300, 400]), - ], - names=["a", "b", "c"], - ) - df_test = ctx.create_dataframe([[batch]]) - - # nvl2 returns b when a is not null, c when a is null - result = df_test.select( - f.nvl2(column("a"), column("b"), column("c")).alias("result") - ).collect()[0] - assert result.column(0) == pa.array([100, 20, 300, 40], type=pa.int64()) - - # Test with string columns - batch2 = pa.RecordBatch.from_arrays( - [ - pa.array(["x", None]), - pa.array(["not_null", "not_null"]), - pa.array(["is_null", "is_null"]), - ], - names=["a", "b", "c"], - ) - df_test2 = ctx.create_dataframe([[batch2]]) - result = df_test2.select( - f.nvl2(column("a"), column("b"), column("c")).alias("result") - ).collect()[0] - assert result.column(0).to_pylist() == ["not_null", "is_null"] - - -def test_ifnull(): - ctx = SessionContext() - batch = pa.RecordBatch.from_arrays( - [ - pa.array([None, 1, None, 4]), - pa.array([10, 20, 30, 40]), + datetime(2025, 6, 15, tzinfo=DEFAULT_TZ), + None, + ], + type=pa.timestamp("us", tz="UTC"), + ), + pa.array( + [ + datetime(2022, 7, 4, tzinfo=DEFAULT_TZ), + datetime(2023, 12, 25, tzinfo=DEFAULT_TZ), + None, + None, + ], + type=pa.timestamp("us", tz="UTC"), + ), + pa.array([True, None, False, None], type=pa.bool_()), + pa.array([False, True, None, None], type=pa.bool_()), ], - names=["a", "b"], + names=["a", "b", "c", "d", "e", "g", "h", "i", "k", "l"], ) - df_test = ctx.create_dataframe([[batch]]) + return ctx.create_dataframe([[batch]]) - # ifnull returns a when a is not null, b when a is null (same as nvl) - result = df_test.select( - f.ifnull(column("a"), column("b")).alias("result") - ).collect()[0] - assert result.column(0) == pa.array([10, 1, 30, 4], type=pa.int64()) - # Verify ifnull matches nvl behavior - result_nvl = df_test.select( - f.nvl(column("a"), column("b")).alias("nvl_result") - ).collect()[0] - assert result.column(0) == result_nvl.column(0) +@pytest.mark.parametrize( + ("expr", "expected"), + [ + pytest.param( + f.greatest(column("a"), column("b")), + pa.array([5, 10, 3, None], type=pa.int64()), + id="greatest_int", + ), + pytest.param( + f.greatest(column("d"), column("e")), + pa.array(["banana", "date", "cherry", None], type=pa.utf8()), + id="greatest_str", + ), + pytest.param( + f.least(column("a"), column("b")), + pa.array([1, 10, 3, None], type=pa.int64()), + id="least_int", + ), + pytest.param( + f.least(column("d"), column("e")), + pa.array(["apple", "date", "cherry", None], type=pa.utf8()), + id="least_str", + ), + pytest.param( + f.coalesce(column("a"), column("b"), column("c")), + pa.array([1, 10, 3, None], type=pa.int64()), + id="coalesce_int", + ), + pytest.param( + f.coalesce(column("d"), column("e"), column("g")), + pa.array(["apple", "date", "cherry", None], type=pa.utf8()), + id="coalesce_str", + ), + pytest.param( + f.nvl(column("a"), column("c")), + pa.array([1, 30, 3, None], type=pa.int64()), + id="nvl_int", + ), + pytest.param( + f.nvl(column("d"), column("g")), + pa.array(["apple", "y", "cherry", None], type=pa.utf8()), + id="nvl_str", + ), + pytest.param( + f.ifnull(column("a"), column("c")), + pa.array([1, 30, 3, None], type=pa.int64()), + id="ifnull_int", + ), + pytest.param( + f.ifnull(column("d"), column("g")), + pa.array(["apple", "y", "cherry", None], type=pa.utf8()), + id="ifnull_str", + ), + pytest.param( + f.nvl2(column("a"), column("b"), column("c")), + pa.array([5, 30, None, None], type=pa.int64()), + id="nvl2_int", + ), + pytest.param( + f.nvl2(column("d"), column("e"), column("g")), + pa.array(["banana", "y", None, None], type=pa.utf8()), + id="nvl2_str", + ), + pytest.param( + f.nullif(column("a"), column("b")), + pa.array([1, None, 3, None], type=pa.int64()), + id="nullif_int", + ), + pytest.param( + f.nullif(column("d"), column("e")), + pa.array(["apple", None, "cherry", None], type=pa.utf8()), + id="nullif_str", + ), + pytest.param( + f.nullif(column("a"), literal(1)), + pa.array([None, None, 3, None], type=pa.int64()), + id="nullif_equal_values", + ), + pytest.param( + f.greatest(column("a"), column("b"), column("c")), + pa.array([20, 30, 40, None], type=pa.int64()), + id="greatest_variadic", + ), + pytest.param( + f.least(column("a"), column("b"), column("c")), + pa.array([1, 10, 3, None], type=pa.int64()), + id="least_variadic", + ), + pytest.param( + f.greatest(column("a"), literal(2)), + pa.array([2, 2, 3, 2], type=pa.int64()), + id="greatest_literal", + ), + pytest.param( + f.least(column("a"), literal(2)), + pa.array([1, 2, 2, 2], type=pa.int64()), + id="least_literal", + ), + pytest.param( + f.coalesce(column("a"), literal(0)), + pa.array([1, 0, 3, 0], type=pa.int64()), + id="coalesce_literal_int", + ), + pytest.param( + f.coalesce(column("d"), literal("default")), + pa.array(["apple", "default", "cherry", "default"], type=pa.string_view()), + id="coalesce_literal_str", + ), + pytest.param( + f.nvl(column("a"), literal(99)), + pa.array([1, 99, 3, 99], type=pa.int64()), + id="nvl_literal", + ), + pytest.param( + f.ifnull(column("d"), literal("unknown")), + pa.array(["apple", "unknown", "cherry", "unknown"], type=pa.string_view()), + id="ifnull_literal", + ), + pytest.param( + f.nvl2(column("a"), literal(1), literal(0)), + pa.array([1, 0, 1, 0], type=pa.int64()), + id="nvl2_literal", + ), + pytest.param( + f.greatest(column("h"), column("i")), + pa.array( + [ + datetime(2022, 7, 4, tzinfo=DEFAULT_TZ), + datetime(2023, 12, 25, tzinfo=DEFAULT_TZ), + datetime(2025, 6, 15, tzinfo=DEFAULT_TZ), + None, + ], + type=pa.timestamp("us", tz="UTC"), + ), + id="greatest_datetime", + ), + pytest.param( + f.least(column("h"), column("i")), + pa.array( + [ + datetime(2020, 1, 1, tzinfo=DEFAULT_TZ), + datetime(2023, 12, 25, tzinfo=DEFAULT_TZ), + datetime(2025, 6, 15, tzinfo=DEFAULT_TZ), + None, + ], + type=pa.timestamp("us", tz="UTC"), + ), + id="least_datetime", + ), + pytest.param( + f.coalesce(column("h"), column("i")), + pa.array( + [ + datetime(2020, 1, 1, tzinfo=DEFAULT_TZ), + datetime(2023, 12, 25, tzinfo=DEFAULT_TZ), + datetime(2025, 6, 15, tzinfo=DEFAULT_TZ), + None, + ], + type=pa.timestamp("us", tz="UTC"), + ), + id="coalesce_datetime", + ), + pytest.param( + f.nvl(column("k"), column("l")), + pa.array([True, True, False, None], type=pa.bool_()), + id="nvl_bool", + ), + pytest.param( + f.coalesce(column("k"), column("l")), + pa.array([True, True, False, None], type=pa.bool_()), + id="coalesce_bool", + ), + pytest.param( + f.nvl2(column("k"), column("k"), column("l")), + pa.array([True, True, False, None], type=pa.bool_()), + id="nvl2_bool", + ), + pytest.param( + f.coalesce( + column("h"), + literal(datetime(2000, 1, 1, tzinfo=DEFAULT_TZ)), + ), + pa.array( + [ + datetime(2020, 1, 1, tzinfo=DEFAULT_TZ), + datetime(2000, 1, 1, tzinfo=DEFAULT_TZ), + datetime(2025, 6, 15, tzinfo=DEFAULT_TZ), + datetime(2000, 1, 1, tzinfo=DEFAULT_TZ), + ], + type=pa.timestamp("us", tz="UTC"), + ), + id="coalesce_literal_datetime", + ), + pytest.param( + f.coalesce(column("k"), literal(value=False)), + pa.array([True, False, False, False], type=pa.bool_()), + id="coalesce_literal_bool", + ), + pytest.param( + f.coalesce(column("a"), literal(None), literal(99)), + pa.array([1, 99, 3, 99], type=pa.int64()), + id="coalesce_skip_null_literal", + ), + ], +) +def test_conditional_functions(df_with_nulls, expr, expected): + result = df_with_nulls.select(expr.alias("result")).collect()[0] + assert result.column(0) == expected