Skip to content

Commit 8b5abd9

Browse files
CarolinePascal authored and asafxrev committed
feat(encoding): switching to PyAV for ffmpeg related tasks (huggingface#983)
1 parent 38075be commit 8b5abd9

File tree

3 files changed

+124
-103
lines changed

3 files changed

+124
-103
lines changed

benchmarks/video/run_video_benchmark.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,7 @@ def main(
416416
"--vcodec",
417417
type=str,
418418
nargs="*",
419-
default=["libx264", "libx265", "libsvtav1"],
419+
default=["libx264", "hevc", "libsvtav1"],
420420
help="Video codecs to be tested",
421421
)
422422
parser.add_argument(
@@ -446,7 +446,7 @@ def main(
446446
# nargs="*",
447447
# default=[0, 1],
448448
# help="Use the fastdecode tuning option. 0 disables it. "
449-
# "For libx264 and libx265, only 1 is possible. "
449+
# "For libx264 and libx265/hevc, only 1 is possible. "
450450
# "For libsvtav1, 1, 2 or 3 are possible values with a higher number meaning a faster decoding optimization",
451451
# )
452452
parser.add_argument(

lerobot/common/datasets/video_utils.py

Lines changed: 121 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,15 @@
1313
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
16+
import glob
1617
import importlib
17-
import json
1818
import logging
19-
import subprocess
2019
import warnings
21-
from collections import OrderedDict
2220
from dataclasses import dataclass, field
2321
from pathlib import Path
2422
from typing import Any, ClassVar
2523

24+
import av
2625
import pyarrow as pa
2726
import torch
2827
import torchvision
@@ -252,51 +251,83 @@ def encode_video_frames(
252251
g: int | None = 2,
253252
crf: int | None = 30,
254253
fast_decode: int = 0,
255-
log_level: str | None = "error",
254+
log_level: int | None = av.logging.ERROR,
256255
overwrite: bool = False,
257256
) -> None:
258257
"""More info on ffmpeg arguments tuning on `benchmarks/video/README.md`"""
258+
# Check encoder availability
259+
if vcodec not in ["h264", "hevc", "libsvtav1"]:
260+
raise ValueError(f"Unsupported video codec: {vcodec}. Supported codecs are: h264, hevc, libsvtav1.")
261+
259262
video_path = Path(video_path)
260263
imgs_dir = Path(imgs_dir)
261-
video_path.parent.mkdir(parents=True, exist_ok=True)
262-
263-
ffmpeg_args = OrderedDict(
264-
[
265-
("-f", "image2"),
266-
("-r", str(fps)),
267-
("-i", str(imgs_dir / "frame_%06d.png")),
268-
("-vcodec", vcodec),
269-
("-pix_fmt", pix_fmt),
270-
]
264+
265+
video_path.parent.mkdir(parents=True, exist_ok=overwrite)
266+
267+
# Encoders/pixel formats incompatibility check
268+
if (vcodec == "libsvtav1" or vcodec == "hevc") and pix_fmt == "yuv444p":
269+
logging.warning(
270+
f"Incompatible pixel format 'yuv444p' for codec {vcodec}, auto-selecting format 'yuv420p'"
271+
)
272+
pix_fmt = "yuv420p"
273+
274+
# Get input frames
275+
template = "frame_" + ("[0-9]" * 6) + ".png"
276+
input_list = sorted(
277+
glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("_")[-1].split(".")[0])
271278
)
272279

280+
# Define video output frame size (assuming all input frames are the same size)
281+
if len(input_list) == 0:
282+
raise FileNotFoundError(f"No images found in {imgs_dir}.")
283+
dummy_image = Image.open(input_list[0])
284+
width, height = dummy_image.size
285+
286+
# Define video codec options
287+
video_options = {}
288+
273289
if g is not None:
274-
ffmpeg_args["-g"] = str(g)
290+
video_options["g"] = str(g)
275291

276292
if crf is not None:
277-
ffmpeg_args["-crf"] = str(crf)
293+
video_options["crf"] = str(crf)
278294

279295
if fast_decode:
280-
key = "-svtav1-params" if vcodec == "libsvtav1" else "-tune"
296+
key = "svtav1-params" if vcodec == "libsvtav1" else "tune"
281297
value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
282-
ffmpeg_args[key] = value
298+
video_options[key] = value
283299

300+
# Set logging level
284301
if log_level is not None:
285-
ffmpeg_args["-loglevel"] = str(log_level)
286-
287-
ffmpeg_args = [item for pair in ffmpeg_args.items() for item in pair]
288-
if overwrite:
289-
ffmpeg_args.append("-y")
290-
291-
ffmpeg_cmd = ["ffmpeg"] + ffmpeg_args + [str(video_path)]
292-
# redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal
293-
subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL)
302+
# "While less efficient, it is generally preferable to modify logging with Python’s logging"
303+
logging.getLogger("libav").setLevel(log_level)
304+
305+
# Create and open output file (overwrite by default)
306+
with av.open(str(video_path), "w") as output:
307+
output_stream = output.add_stream(vcodec, fps, options=video_options)
308+
output_stream.pix_fmt = pix_fmt
309+
output_stream.width = width
310+
output_stream.height = height
311+
312+
# Loop through input frames and encode them
313+
for input_data in input_list:
314+
input_image = Image.open(input_data).convert("RGB")
315+
input_frame = av.VideoFrame.from_image(input_image)
316+
packet = output_stream.encode(input_frame)
317+
if packet:
318+
output.mux(packet)
319+
320+
# Flush the encoder
321+
packet = output_stream.encode()
322+
if packet:
323+
output.mux(packet)
324+
325+
# Reset logging level
326+
if log_level is not None:
327+
av.logging.restore_default_callback()
294328

295329
if not video_path.exists():
296-
raise OSError(
297-
f"Video encoding did not work. File not found: {video_path}. "
298-
f"Try running the command manually to debug: `{''.join(ffmpeg_cmd)}`"
299-
)
330+
raise OSError(f"Video encoding did not work. File not found: {video_path}.")
300331

301332

302333
@dataclass
@@ -332,78 +363,68 @@ def __call__(self):
332363

333364

334365
def get_audio_info(video_path: Path | str) -> dict:
335-
ffprobe_audio_cmd = [
336-
"ffprobe",
337-
"-v",
338-
"error",
339-
"-select_streams",
340-
"a:0",
341-
"-show_entries",
342-
"stream=channels,codec_name,bit_rate,sample_rate,bit_depth,channel_layout,duration",
343-
"-of",
344-
"json",
345-
str(video_path),
346-
]
347-
result = subprocess.run(ffprobe_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
348-
if result.returncode != 0:
349-
raise RuntimeError(f"Error running ffprobe: {result.stderr}")
350-
351-
info = json.loads(result.stdout)
352-
audio_stream_info = info["streams"][0] if info.get("streams") else None
353-
if audio_stream_info is None:
354-
return {"has_audio": False}
355-
356-
# Return the information, defaulting to None if no audio stream is present
357-
return {
358-
"has_audio": True,
359-
"audio.channels": audio_stream_info.get("channels", None),
360-
"audio.codec": audio_stream_info.get("codec_name", None),
361-
"audio.bit_rate": int(audio_stream_info["bit_rate"]) if audio_stream_info.get("bit_rate") else None,
362-
"audio.sample_rate": int(audio_stream_info["sample_rate"])
363-
if audio_stream_info.get("sample_rate")
364-
else None,
365-
"audio.bit_depth": audio_stream_info.get("bit_depth", None),
366-
"audio.channel_layout": audio_stream_info.get("channel_layout", None),
367-
}
366+
# Set logging level
367+
logging.getLogger("libav").setLevel(av.logging.ERROR)
368+
369+
# Getting audio stream information
370+
audio_info = {}
371+
with av.open(str(video_path), "r") as audio_file:
372+
try:
373+
audio_stream = audio_file.streams.audio[0]
374+
except IndexError:
375+
# Reset logging level
376+
av.logging.restore_default_callback()
377+
return {"has_audio": False}
378+
379+
audio_info["audio.channels"] = audio_stream.channels
380+
audio_info["audio.codec"] = audio_stream.codec.canonical_name
381+
# In an ideal lossless case : bit depth x sample rate x channels = bit rate.
382+
# In an actual compressed case, the bit rate is set according to the compression level : the lower the bit rate, the more compression is applied.
383+
audio_info["audio.bit_rate"] = audio_stream.bit_rate
384+
audio_info["audio.sample_rate"] = audio_stream.sample_rate # Number of samples per second
385+
# In an ideal lossless case : fixed number of bits per sample.
386+
# In an actual compressed case : variable number of bits per sample (often reduced to match a given depth rate).
387+
audio_info["audio.bit_depth"] = audio_stream.format.bits
388+
audio_info["audio.channel_layout"] = audio_stream.layout.name
389+
audio_info["has_audio"] = True
390+
391+
# Reset logging level
392+
av.logging.restore_default_callback()
393+
394+
return audio_info
368395

369396

370397
def get_video_info(video_path: Path | str) -> dict:
371-
ffprobe_video_cmd = [
372-
"ffprobe",
373-
"-v",
374-
"error",
375-
"-select_streams",
376-
"v:0",
377-
"-show_entries",
378-
"stream=r_frame_rate,width,height,codec_name,nb_frames,duration,pix_fmt",
379-
"-of",
380-
"json",
381-
str(video_path),
382-
]
383-
result = subprocess.run(ffprobe_video_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
384-
if result.returncode != 0:
385-
raise RuntimeError(f"Error running ffprobe: {result.stderr}")
386-
387-
info = json.loads(result.stdout)
388-
video_stream_info = info["streams"][0]
389-
390-
# Calculate fps from r_frame_rate
391-
r_frame_rate = video_stream_info["r_frame_rate"]
392-
num, denom = map(int, r_frame_rate.split("/"))
393-
fps = num / denom
394-
395-
pixel_channels = get_video_pixel_channels(video_stream_info["pix_fmt"])
396-
397-
video_info = {
398-
"video.fps": fps,
399-
"video.height": video_stream_info["height"],
400-
"video.width": video_stream_info["width"],
401-
"video.channels": pixel_channels,
402-
"video.codec": video_stream_info["codec_name"],
403-
"video.pix_fmt": video_stream_info["pix_fmt"],
404-
"video.is_depth_map": False,
405-
**get_audio_info(video_path),
406-
}
398+
# Set logging level
399+
logging.getLogger("libav").setLevel(av.logging.ERROR)
400+
401+
# Getting video stream information
402+
video_info = {}
403+
with av.open(str(video_path), "r") as video_file:
404+
try:
405+
video_stream = video_file.streams.video[0]
406+
except IndexError:
407+
# Reset logging level
408+
av.logging.restore_default_callback()
409+
return {}
410+
411+
video_info["video.height"] = video_stream.height
412+
video_info["video.width"] = video_stream.width
413+
video_info["video.codec"] = video_stream.codec.canonical_name
414+
video_info["video.pix_fmt"] = video_stream.pix_fmt
415+
video_info["video.is_depth_map"] = False
416+
417+
# Calculate fps from r_frame_rate
418+
video_info["video.fps"] = int(video_stream.base_rate)
419+
420+
pixel_channels = get_video_pixel_channels(video_stream.pix_fmt)
421+
video_info["video.channels"] = pixel_channels
422+
423+
# Reset logging level
424+
av.logging.restore_default_callback()
425+
426+
# Adding audio stream information
427+
video_info.update(**get_audio_info(video_path))
407428

408429
return video_info
409430

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ dependencies = [
6262
"omegaconf>=2.3.0",
6363
"opencv-python-headless>=4.9.0",
6464
"packaging>=24.2",
65-
"av>=12.0.5",
65+
"av>=14.2.0",
6666
"pymunk>=6.6.0",
6767
"pynput>=1.7.7",
6868
"pyzmq>=26.2.1",

0 commit comments

Comments
 (0)