feat: unnest param on @daft.func (#5132)

kevinzwang · web-flow · commit 6a8767cb7893 · 2025-09-04T19:28:41.000Z
## Changes Made

Added the `unnest` argument on `@daft.func`, allowing for the automatic unnesting of multiple return values into separate columns.

Example:

```py
>>> import daft
>>> from daft import DataType
>>>
>>> @daft.func(return_dtype=DataType.struct({"int": DataType.int64(), "str": DataType.string()}), unnest=True)
... def my_multi_return(val: int):
...     return {"int": val * 2, "str": str(val) * 2}
>>> df = daft.from_pydict({"x": [1, 2, 3]})
>>> df.select(my_multi_return(df["x"])).collect()
╭───────┬──────╮
│ int   ┆ str  │
│ ---   ┆ ---  │
│ Int64 ┆ Utf8 │
╞═══════╪══════╡
│ 2     ┆ 11   │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 4     ┆ 22   │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 6     ┆ 33   │
╰───────┴──────╯

(Showing first 3 of 3 rows)
```

## Related Issues

## Checklist

- [x] Documented in API Docs (if applicable)
- [x] Documented in User Guide (if applicable)
- [x] If adding a new documentation page, doc is added to `docs/mkdocs.yml` navigation
- [x] Documentation builds and is formatted properly (tag @/ccmao1130 for docs review)
diff --git a/daft/udf/__init__.py b/daft/udf/__init__.py
@@ -27,6 +27,7 @@ class _PartialUdf:
     """Helper class to provide typing overloads for using `daft.func` as a decorator."""
 
     return_dtype: DataTypeLike | None
+    unnest: bool
 
     @overload
     def __call__(self, fn: Callable[P, Iterator[T]]) -> GeneratorUdf[P, T]: ...  # type: ignore[overload-overlap]
@@ -35,9 +36,9 @@ def __call__(self, fn: Callable[P, T]) -> RowWiseUdf[P, T]: ...
 
     def __call__(self, fn: Callable[P, Any]) -> GeneratorUdf[P, Any] | RowWiseUdf[P, Any]:
         if isgeneratorfunction(fn):
-            return GeneratorUdf(fn, return_dtype=self.return_dtype)
+            return GeneratorUdf(fn, return_dtype=self.return_dtype, unnest=self.unnest)
         else:
-            return RowWiseUdf(fn, return_dtype=self.return_dtype)
+            return RowWiseUdf(fn, return_dtype=self.return_dtype, unnest=self.unnest)
 
 
 class _DaftFuncDecorator:
@@ -54,6 +55,7 @@ class _DaftFuncDecorator:
 
     Args:
         return_dtype: The data type that this function should return or yield. If not specified, it is derived from the function's return type hint.
+        unnest: Whether to unnest (flatten) the fields of the returned struct into separate top-level columns. The return dtype must be a `DataType.struct` when this is set to true; otherwise a `ValueError` is raised. Defaults to false.
 
     Examples:
         Basic Example
@@ -184,21 +186,46 @@ class _DaftFuncDecorator:
         ╰───────┴─────────╯
         <BLANKLINE>
         (Showing first 7 of 7 rows)
+
+        Unnesting multiple return fields
+
+        >>> import daft
+        >>> from daft import DataType
+        >>> @daft.func(return_dtype=DataType.struct({"int": DataType.int64(), "str": DataType.string()}), unnest=True)
+        ... def my_multi_return(val: int):
+        ...     return {"int": val * 2, "str": str(val) * 2}
+        >>> df = daft.from_pydict({"x": [1, 2, 3]})
+        >>> df.select(my_multi_return(df["x"])).collect()
+        ╭───────┬──────╮
+        │ int   ┆ str  │
+        │ ---   ┆ ---  │
+        │ Int64 ┆ Utf8 │
+        ╞═══════╪══════╡
+        │ 2     ┆ 11   │
+        ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+        │ 4     ┆ 22   │
+        ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+        │ 6     ┆ 33   │
+        ╰───────┴──────╯
+        <BLANKLINE>
+        (Showing first 3 of 3 rows)
     """
 
     @overload
-    def __new__(cls, *, return_dtype: DataTypeLike | None = None) -> _PartialUdf: ...  # type: ignore[misc]
+    def __new__(cls, *, return_dtype: DataTypeLike | None = None, unnest: bool = False) -> _PartialUdf: ...  # type: ignore[misc]
     @overload
     def __new__(  # type: ignore[misc]
-        cls, fn: Callable[P, Iterator[T]], *, return_dtype: DataTypeLike | None = None
+        cls, fn: Callable[P, Iterator[T]], *, return_dtype: DataTypeLike | None = None, unnest: bool = False
     ) -> GeneratorUdf[P, T]: ...
     @overload
-    def __new__(cls, fn: Callable[P, T], *, return_dtype: DataTypeLike | None = None) -> RowWiseUdf[P, T]: ...  # type: ignore[misc]
+    def __new__(  # type: ignore[misc]
+        cls, fn: Callable[P, T], *, return_dtype: DataTypeLike | None = None, unnest: bool = False
+    ) -> RowWiseUdf[P, T]: ...
 
     def __new__(  # type: ignore[misc]
-        cls, fn: Callable[P, Any] | None = None, *, return_dtype: DataTypeLike | None = None
+        cls, fn: Callable[P, Any] | None = None, *, return_dtype: DataTypeLike | None = None, unnest: bool = False
     ) -> _PartialUdf | GeneratorUdf[P, Any] | RowWiseUdf[P, Any]:
-        partial_udf = _PartialUdf(return_dtype=return_dtype)
+        partial_udf = _PartialUdf(return_dtype=return_dtype, unnest=unnest)
         return partial_udf if fn is None else partial_udf(fn)
 
 
diff --git a/daft/udf/generator.py b/daft/udf/generator.py
@@ -32,9 +32,10 @@ class GeneratorUdf(Generic[P, T]):
     If no values are yielded for an input, a null value is inserted.
     """
 
-    def __init__(self, fn: Callable[P, Iterator[T]], return_dtype: DataTypeLike | None):
+    def __init__(self, fn: Callable[P, Iterator[T]], return_dtype: DataTypeLike | None, unnest: bool):
         self._inner = fn
         self.name = get_unique_function_name(fn)
+        self.unnest = unnest
 
         # attempt to extract return type from an Iterator or Generator type hint
         if return_dtype is None:
@@ -56,6 +57,11 @@ def __init__(self, fn: Callable[P, Iterator[T]], return_dtype: DataTypeLike | No
             return_dtype = args[0]
         self.return_dtype = DataType._infer_type(return_dtype)
 
+        if self.unnest and not self.return_dtype.is_struct():
+            raise ValueError(
+                f"Expected Daft function `return_dtype` to be `DataType.struct` when `unnest=True`, instead found: {self.return_dtype}"
+            )
+
     @overload
     def __call__(self, *args: P.args, **kwargs: P.kwargs) -> Iterator[T]: ...
     @overload
@@ -78,6 +84,11 @@ def inner_rowwise(*args: P.args, **kwargs: P.kwargs) -> list[T]:
 
         return_dtype_rowwise = DataType.list(self.return_dtype)
 
-        return Expression._from_pyexpr(
+        expr = Expression._from_pyexpr(
             row_wise_udf(self.name, inner_rowwise, return_dtype_rowwise._dtype, (args, kwargs), expr_args)
         ).explode()
+
+        if self.unnest:
+            expr = expr.unnest()
+
+        return expr
diff --git a/daft/udf/row_wise.py b/daft/udf/row_wise.py
@@ -33,9 +33,10 @@ class RowWiseUdf(Generic[P, T]):
     Row-wise functions are called with data from one row at a time, and map that to a single output value for that row.
     """
 
-    def __init__(self, fn: Callable[P, T], return_dtype: DataTypeLike | None):
+    def __init__(self, fn: Callable[P, T], return_dtype: DataTypeLike | None, unnest: bool):
         self._inner = fn
         self.name = get_unique_function_name(fn)
+        self.unnest = unnest
 
         if return_dtype is None:
             type_hints = get_type_hints(fn)
@@ -47,6 +48,11 @@ def __init__(self, fn: Callable[P, T], return_dtype: DataTypeLike | None):
             return_dtype = type_hints["return"]
         self.return_dtype = DataType._infer_type(return_dtype)
 
+        if self.unnest and not self.return_dtype.is_struct():
+            raise ValueError(
+                f"Expected Daft function `return_dtype` to be `DataType.struct` when `unnest=True`, instead found: {self.return_dtype}"
+            )
+
     @overload
     def __call__(self, *args: P.args, **kwargs: P.kwargs) -> T: ...
     @overload
@@ -62,10 +68,15 @@ def __call__(self, *args: Any, **kwargs: Any) -> Expression | T:
         if len(expr_args) == 0:
             return self._inner(*args, **kwargs)
 
-        return Expression._from_pyexpr(
+        expr = Expression._from_pyexpr(
             row_wise_udf(self.name, self._inner, self.return_dtype._dtype, (args, kwargs), expr_args)
         )
 
+        if self.unnest:
+            expr = expr.unnest()
+
+        return expr
+
 
 def __call_async_batch(
     fn: Callable[..., Awaitable[Any]],
diff --git a/tests/udf/test_generator_udf.py b/tests/udf/test_generator_udf.py
@@ -3,6 +3,8 @@
 import collections.abc
 import typing
 
+import pytest
+
 import daft
 
 
@@ -85,3 +87,44 @@ def my_gen_func(input: int) -> collections.abc.Generator[str, None, None]:
     df = df.select(my_gen_func(df["input"]).alias("output"))
 
     assert df.schema() == daft.Schema.from_pydict({"output": daft.DataType.string()})
+
+
+def test_generator_udf_unnest():
+    @daft.func(
+        return_dtype=daft.DataType.struct({"id": daft.DataType.int64(), "value": daft.DataType.string()}), unnest=True
+    )
+    def create_records(count: int, base_value: str):
+        for i in range(count):
+            yield {"id": i, "value": f"{base_value}_{i}"}
+
+    df = daft.from_pydict({"count": [2, 3, 1], "base": ["a", "b", "c"]})
+    result = df.select(create_records(df["count"], df["base"])).to_pydict()
+
+    expected = {"id": [0, 1, 0, 1, 2, 0], "value": ["a_0", "a_1", "b_0", "b_1", "b_2", "c_0"]}
+    assert result == expected
+
+
+def test_generator_udf_unnest_empty_generator():
+    @daft.func(
+        return_dtype=daft.DataType.struct({"x": daft.DataType.int64(), "y": daft.DataType.string()}), unnest=True
+    )
+    def empty_gen(n: int):
+        if n > 0:
+            yield {"x": n, "y": str(n)}
+
+    df = daft.from_pydict({"n": [0, 1, 2]})
+    result = df.select(empty_gen(df["n"])).to_pydict()
+
+    expected = {"x": [None, 1, 2], "y": [None, "1", "2"]}
+    assert result == expected
+
+
+def test_generator_udf_unnest_error_non_struct():
+    with pytest.raises(
+        ValueError, match="Expected Daft function `return_dtype` to be `DataType.struct` when `unnest=True`"
+    ):
+
+        @daft.func(return_dtype=daft.DataType.string(), unnest=True)
+        def invalid_unnest_generator(n: int):
+            for i in range(n):
+                yield str(i)
diff --git a/tests/udf/test_row_wise_udf.py b/tests/udf/test_row_wise_udf.py
@@ -123,3 +123,30 @@ async def my_async_stringify_and_sum(a: int, b: int) -> str:
     df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6]})
     async_df = df.select(my_async_stringify_and_sum(col("x"), col("y")))
     assert async_df.to_pydict() == {"x": ["5", "7", "9"]}
+
+
+def test_row_wise_udf_unnest():
+    @daft.func(
+        return_dtype=daft.DataType.struct(
+            {"id": daft.DataType.int64(), "name": daft.DataType.string(), "score": daft.DataType.float64()}
+        ),
+        unnest=True,
+    )
+    def create_record(value: int):
+        return {"id": value, "name": f"item_{value}", "score": value * 1.5}
+
+    df = daft.from_pydict({"value": [1, 2, 3]})
+    result = df.select(create_record(col("value"))).to_pydict()
+
+    expected = {"id": [1, 2, 3], "name": ["item_1", "item_2", "item_3"], "score": [1.5, 3.0, 4.5]}
+    assert result == expected
+
+
+def test_row_wise_udf_unnest_error_non_struct():
+    with pytest.raises(
+        ValueError, match="Expected Daft function `return_dtype` to be `DataType.struct` when `unnest=True`"
+    ):
+
+        @daft.func(return_dtype=daft.DataType.int64(), unnest=True)
+        def invalid_unnest(a: int):
+            return a