Eventual-Inc
diff --git a/‎daft/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎daft/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎daft/ai/typing.py‎
Lines changed: 2 additions & 2 deletions b/‎daft/ai/typing.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎daft/filesystem.py‎
Lines changed: 7 additions & 6 deletions b/‎daft/filesystem.py‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎daft/io/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎daft/io/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎daft/io/av/__init__.py‎
Lines changed: 76 additions & 0 deletions b/‎daft/io/av/__init__.py‎
Lines changed: 76 additions & 0 deletions
@@ -88,6 +88,7 @@ def refresh_logger() -> None:
     read_parquet,
     read_sql,
     read_lance,
+    read_video_frames,
     read_warc,
 )
 from daft.series import Series
@@ -211,6 +212,7 @@ def refresh_logger() -> None:
     "read_parquet",
     "read_sql",
     "read_table",
+    "read_video_frames",
     "read_warc",
     "refresh_logger",
     "register_viz_hook",
 
@@ -2,7 +2,7 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Generic, TypeVar
+from typing import Any, Generic, TypeVar
 
 from daft.datatype import DataType
 from daft.dependencies import np
@@ -36,7 +36,7 @@ def instantiate(self) -> T:
 
 
 # temp definition to defer complexity of a more generic embedding type to later PRs
-Embedding = np.typing.NDArray  # type: ignore[type-arg]
+Embedding = np.typing.NDArray[Any]
 
 
 @dataclass
 
@@ -115,7 +115,7 @@ def _resolve_paths_and_filesystem(
     paths: str | pathlib.Path | list[str],
     io_config: IOConfig | None = None,
 ) -> tuple[list[str], pafs.FileSystem]:
-    """Resolves and normalizes the provided path and infers it's filesystem.
+    """Resolves and normalizes the provided path and infers its filesystem.
 
     Also ensures that the inferred filesystem is compatible with the passed filesystem, if provided.
 
@@ -200,6 +200,8 @@ def _infer_filesystem(
     """
     protocol = get_protocol_from_path(path)
     translated_kwargs: dict[str, Any]
+    resolved_filesystem: pafs.FileSystem
+    expiry: datetime | None = None
 
     def _set_if_not_none(kwargs: dict[str, Any], key: str, val: Any | None) -> None:
         """Helper method used when setting kwargs for pyarrow."""
@@ -228,7 +230,6 @@ def _set_if_not_none(kwargs: dict[str, Any], key: str, val: Any | None) -> None:
                 except ImportError:
                     pass  # Config does not exist in pyarrow 7.0.0
 
-            expiry = None
             if (s3_creds := s3_config.provide_cached_credentials()) is not None:
                 _set_if_not_none(translated_kwargs, "access_key", s3_creds.key_id)
                 _set_if_not_none(translated_kwargs, "secret_key", s3_creds.access_key)
@@ -277,8 +278,8 @@ def _set_if_not_none(kwargs: dict[str, Any], key: str, val: Any | None) -> None:
     elif protocol in {"http", "https"}:
         fsspec_fs_cls = fsspec.get_filesystem_class(protocol)
         fsspec_fs = fsspec_fs_cls()
-        resolved_filesystem, resolved_path = pafs._resolve_filesystem_and_path(path, fsspec_fs)
-        resolved_path = resolved_filesystem.normalize_path(resolved_path)
+        resolved_filesystem = pafs.PyFileSystem(fsspec_fs)
+        resolved_path = resolved_filesystem.normalize_path(_unwrap_protocol(path))
         return resolved_path, resolved_filesystem, None
 
     ###
@@ -300,8 +301,8 @@ def _set_if_not_none(kwargs: dict[str, Any], key: str, val: Any | None) -> None:
             )
         else:
             fsspec_fs = fsspec_fs_cls()
-        resolved_filesystem, resolved_path = pafs._resolve_filesystem_and_path(path, fsspec_fs)
-        resolved_path = resolved_filesystem.normalize_path(_unwrap_protocol(resolved_path))
+        resolved_filesystem = pafs.PyFileSystem(fsspec_fs)
+        resolved_path = resolved_filesystem.normalize_path(_unwrap_protocol(path))
         return resolved_path, resolved_filesystem, None
 
     else:
 
@@ -23,6 +23,7 @@
 from daft.io.file_path import from_glob_path
 from daft.io.sink import DataSink
 from daft.io.source import DataSource, DataSourceTask
+from daft.io.av import read_video_frames
 
 __all__ = [
     "AzureConfig",
@@ -47,5 +48,6 @@
     "read_lance",
     "read_parquet",
     "read_sql",
+    "read_video_frames",
     "read_warc",
 ]
@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from daft.dataframe.dataframe import DataFrame
+    from daft.daft import IOConfig
+
+__all__ = [
+    # TODO: add the remaining audio/video/subtitle readers; planned names are kept below.
+    # "read_audio_frames",
+    # "read_audio_streams",
+    # "read_audio_streams_metadata",
+    # "read_subtitle_frames",
+    # "read_subtitle_streams",
+    # "read_subtitle_streams_metadata",
+    "read_video_frames",
+    # "read_video_streams",
+    # "read_video_streams_metadata",
+]
+
+
+def read_video_frames(
+    path: str | list[str],
+    image_height: int,
+    image_width: int,
+    is_key_frame: bool | None = None,
+    io_config: IOConfig | None = None,
+) -> DataFrame:
+    """Creates a DataFrame by reading the frames of one or more video files.
+
+    This produces a DataFrame with the following fields:
+        * path (string): path to the video file that produced this frame.
+        * frame_index (int): frame index in the video.
+        * frame_time (float): frame time in fractional seconds as a floating point.
+        * frame_time_base (str): fractional unit of seconds in which timestamps are expressed.
+        * frame_pts (int): frame presentation timestamp in time_base units.
+        * frame_dts (int): frame decoding timestamp in time_base units.
+        * frame_duration (int): frame duration in time_base units.
+        * is_key_frame (bool): true iff this is a key frame.
+
+    Warning:
+        This requires PyAV (`pip install av`); an ImportError with this hint is raised if the reader module fails to import.
+
+    Note:
+        This function will stream the frames from all videos as a DataFrame of images.
+        If you wish to load an entire video into a single row, this can be done with
+        `from_glob_path` and `url.download`.
+
+    Args:
+        path (str|list[str]): Path(s) to the video file(s); wildcards are allowed. A single string is wrapped in a list.
+        image_height (int): Height to which each frame will be resized.
+        image_width (int): Width to which each frame will be resized.
+        is_key_frame (bool|None): If True, only include key frames; if False, only non-key frames; if None, include all frames.
+        io_config (IOConfig|None): Optional IOConfig, passed through to the video frames source.
+
+    Returns:
+        DataFrame: dataframe of images.
+
+    Examples:
+        >>> df = daft.read_video_frames("/path/to/file.mp4", image_height=480, image_width=640)
+        >>> df = daft.read_video_frames("/path/to/directory", image_height=480, image_width=640)
+        >>> df = daft.read_video_frames("/path/to/files-*.mp4", image_height=480, image_width=640)
+        >>> df = daft.read_video_frames("s3://path/to/files-*.mp4", image_height=480, image_width=640)
+    """
+    try:
+        from daft.io.av._read_video_frames import _VideoFramesSource
+    except ImportError as e:
+        raise ImportError("read_video_frames requires PyAV. Please install it with `pip install av`.") from e
+    return _VideoFramesSource(
+        paths=[path] if isinstance(path, str) else path,
+        image_height=image_height,
+        image_width=image_width,
+        is_key_frame=is_key_frame,
+        io_config=io_config,
+    ).read()