Skip to content

Commit 6f2f360

Browse files
fix: fix youtube video reading (#5126)
## Changes Made YouTube streaming was not working, and the example in the docs was broken. This is a dumber but more robust approach to downloading and processing youtube videos. ## Related Issues n/a ## Checklist - [x] Documented in API Docs (if applicable) - [x] Documented in User Guide (if applicable) - [x] If adding a new documentation page, doc is added to `docs/mkdocs.yml` navigation - [x] Documentation builds and is formatted properly (tag @/ccmao1130 for docs review) --------- Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent 3adbdba commit 6f2f360

File tree

2 files changed

+104
-24
lines changed

2 files changed

+104
-24
lines changed

daft/io/av/_read_video_frames.py

Lines changed: 61 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
from __future__ import annotations
22

3+
import os
4+
import tempfile
5+
from contextlib import contextmanager
36
from dataclasses import dataclass
47
from typing import TYPE_CHECKING, Any
58
from urllib.parse import urlparse
@@ -131,6 +134,7 @@ def schema(self) -> Schema:
131134
)
132135

133136
def _list_frames(self, path: str, file: Any) -> Generator[_VideoFrame]:
137+
container = None
134138
try:
135139
container = av.open(file)
136140

@@ -147,8 +151,6 @@ def _list_frames(self, path: str, file: Any) -> Generator[_VideoFrame]:
147151
frame = next(container.decode(stream))
148152
except StopIteration:
149153
break
150-
except Exception:
151-
continue # skip decoding errors
152154

153155
if self.is_key_frame is not None and self.is_key_frame != frame.key_frame:
154156
continue # skip based on is_key_frame filter
@@ -171,28 +173,70 @@ def _list_frames(self, path: str, file: Any) -> Generator[_VideoFrame]:
171173
)
172174
frame_index += 1
173175
finally:
174-
container.close()
176+
if container:
177+
container.close()
175178

176179
def _open(self) -> Any:
177180
if _is_youtube_url(self.path):
178-
import requests
179-
import yt_dlp
180-
181-
with yt_dlp.YoutubeDL({"format": "mp4", "quiet": True}) as ydl:
182-
info = ydl.extract_info(self.path, download=False)
183-
if "url" in info:
184-
direct_url = info["url"]
185-
elif "entries" in info and len(info["entries"]) > 0 and "url" in info["entries"][0]:
186-
direct_url = info["entries"][0]["url"]
187-
else:
188-
raise ValueError("Could not extract URL from youtube video.")
189-
response = requests.get(direct_url, stream=True)
190-
response.raise_for_status()
191-
return response.raw
181+
return self._open_youtube_file()
192182
else:
193183
fp, fs, _ = _infer_filesystem(self.path, io_config=self.io_config)
194184
return fs.open_input_file(fp)
195185

186+
@contextmanager
def _open_youtube_file(self) -> Any:
    """Download the YouTube video at ``self.path`` and yield it as an open binary file.

    The video is downloaded with ``yt_dlp`` into a private temporary
    directory, opened in ``"rb"`` mode and yielded. When the ``with`` block
    exits (normally or through an exception) the file handle is closed and
    the temporary directory, including the downloaded file, is removed.

    Yields:
        A binary file object for the downloaded video.

    Raises:
        ValueError: Raised by the format selector when the video offers no
            mp4 format with a known width and height.

    Errors raised by ``yt_dlp`` during the download are not caught here.
    """
    import yt_dlp

    def selector(ctx):  # type: ignore
        """Yield the mp4 format whose resolution is closest to the requested one."""
        candidates = [
            fmt
            for fmt in ctx["formats"]
            if fmt.get("ext") == "mp4" and fmt.get("height") is not None and fmt.get("width") is not None
        ]
        if not candidates:
            # Fail with a clear message instead of handing `None` to yt-dlp.
            raise ValueError("Could not find an mp4 format with known dimensions for youtube video.")

        def fit(fmt):  # type: ignore
            # Manhattan distance to the requested frame size; lower is better.
            return abs(fmt["height"] - self.image_height) + abs(fmt["width"] - self.image_width)

        # `min` returns the first of equally good candidates, so an exact
        # match (fit == 0) that appears first is still the one selected.
        yield min(candidates, key=fit)

    # Note:
    #   Streaming youtube downloads requires deeper work and was error-prone.
    #   The parsed youtube urls use m3u8, which requires decoding that ydl
    #   will handle for us. We cannot reliably pass a file-like HTTP response
    #   to PyAV; hence this downloads to a temporary file and then opens it.

    # A dedicated temporary directory avoids writing into the current working
    # directory and guarantees cleanup of the download and any leftovers.
    with tempfile.TemporaryDirectory(prefix="daft-youtube-") as temp_dir:
        temp_file = os.path.join(temp_dir, "video")

        params = {
            "format": selector,
            "quiet": True,
            # "%" is the output-template escape character, so literal percent
            # signs in the path must be doubled.
            "outtmpl": temp_file.replace("%", "%%"),
            "no_warnings": True,
            "extract_flat": False,
            # NOTE(review): the previous "no_check_certificate" entry was
            # dropped; it does not appear to be a recognised YoutubeDL option
            # (the documented key is "nocheckcertificate"), and TLS
            # verification should stay enabled anyway.
            "ignoreerrors": False,
            "consoletitle": False,
            "noprogress": True,
        }

        with yt_dlp.YoutubeDL(params) as ydl:
            ydl.download([self.path])

        # Close the handle before the directory is removed; an open handle
        # would leak a descriptor and block deletion on Windows.
        with open(temp_file, mode="rb") as video_file:
            yield video_file
239+
196240
def get_micro_partitions(self) -> Iterator[MicroPartition]:
197241
with self._open() as file:
198242
buffer = _VideoFramesBuffer(

docs/modalities/videos.md

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,47 @@ This example shows reading a video's frames into a DataFrame.
5757

5858
This example shows reading the key frames of YouTube videos. You can pass either a single video URL or a list of video URLs.
5959

60-
```python
61-
df = daft.read_video_frames(
62-
path="https://www.youtube.com/watch?v=jNQXAC9IVRw",
63-
image_height=480,
64-
image_width=640,
65-
is_key_frame=True,
66-
)
60+
=== "🐍 Python"
61+
62+
```python
63+
import daft
64+
65+
df = daft.read_video_frames(
66+
path=[
67+
"https://www.youtube.com/watch?v=jNQXAC9IVRw",
68+
"https://www.youtube.com/watch?v=N2rZxCrb7iU",
69+
"https://www.youtube.com/watch?v=TF6cnLnEARo",
70+
],
71+
image_height=480,
72+
image_width=640,
73+
is_key_frame=True,
74+
)
75+
76+
df.show()
77+
```
78+
79+
```{title="Output"}
80+
╭────────────────────────────────┬─────────────┬───────────────────┬─────────────────┬───────────┬───────────┬────────────────┬──────────────┬───────────────────────╮
81+
│ path ┆ frame_index ┆ frame_time ┆ frame_time_base ┆ frame_pts ┆ frame_dts ┆ frame_duration ┆ is_key_frame ┆ data │
82+
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
83+
│ Utf8 ┆ Int64 ┆ Float64 ┆ Utf8 ┆ Int64 ┆ Int64 ┆ Int64 ┆ Boolean ┆ Image[RGB; 480 x 640] │
84+
╞════════════════════════════════╪═════════════╪═══════════════════╪═════════════════╪═══════════╪═══════════╪════════════════╪══════════════╪═══════════════════════╡
85+
│ https://www.youtube.com/watch… ┆ 0 ┆ 0 ┆ 1/90000 ┆ 0 ┆ 0 ┆ 3003 ┆ true ┆ <FixedShapeImage> │
86+
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
87+
│ https://www.youtube.com/watch… ┆ 1 ┆ 6.8068 ┆ 1/90000 ┆ 612612 ┆ 612612 ┆ 3003 ┆ true ┆ <FixedShapeImage> │
88+
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
89+
│ https://www.youtube.com/watch… ┆ 2 ┆ 13.2132 ┆ 1/90000 ┆ 1189188 ┆ 1189188 ┆ 3003 ┆ true ┆ <FixedShapeImage> │
90+
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
91+
│ https://www.youtube.com/watch… ┆ 3 ┆ 18.018 ┆ 1/90000 ┆ 1621620 ┆ 1621620 ┆ 3003 ┆ true ┆ <FixedShapeImage> │
92+
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
93+
│ https://www.youtube.com/watch… ┆ 4 ┆ 24.8248 ┆ 1/90000 ┆ 2234232 ┆ 2234232 ┆ 3003 ┆ true ┆ <FixedShapeImage> │
94+
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
95+
│ https://www.youtube.com/watch… ┆ 5 ┆ 30.03 ┆ 1/90000 ┆ 2702700 ┆ 2702700 ┆ 3003 ┆ true ┆ <FixedShapeImage> │
96+
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
97+
│ https://www.youtube.com/watch… ┆ 6 ┆ 36.36966666666667 ┆ 1/90000 ┆ 3273270 ┆ 3273270 ┆ 3003 ┆ true ┆ <FixedShapeImage> │
98+
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
99+
│ https://www.youtube.com/watch… ┆ 7 ┆ 43.27656666666667 ┆ 1/90000 ┆ 3894891 ┆ 3894891 ┆ 3003 ┆ true ┆ <FixedShapeImage> │
100+
╰────────────────────────────────┴─────────────┴───────────────────┴─────────────────┴───────────┴───────────┴────────────────┴──────────────┴───────────────────────╯
101+
102+
(Showing first 8 rows)
67103
```

0 commit comments

Comments
 (0)