Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/video/run_video_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ def main(
"--vcodec",
type=str,
nargs="*",
default=["libx264", "libx265", "libsvtav1"],
default=["libx264", "hevc", "libsvtav1"],
help="Video codecs to be tested",
)
parser.add_argument(
Expand Down Expand Up @@ -446,7 +446,7 @@ def main(
# nargs="*",
# default=[0, 1],
# help="Use the fastdecode tuning option. 0 disables it. "
# "For libx264 and libx265, only 1 is possible. "
# "For libx264 and libx265/hevc, only 1 is possible. "
# "For libsvtav1, 1, 2 or 3 are possible values with a higher number meaning a faster decoding optimization",
# )
parser.add_argument(
Expand Down
221 changes: 121 additions & 100 deletions lerobot/common/datasets/video_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import importlib
import json
import logging
import subprocess
import warnings
from collections import OrderedDict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, ClassVar

import av
import pyarrow as pa
import torch
import torchvision
Expand Down Expand Up @@ -252,51 +251,83 @@ def encode_video_frames(
g: int | None = 2,
crf: int | None = 30,
fast_decode: int = 0,
log_level: str | None = "error",
log_level: int | None = av.logging.ERROR,
overwrite: bool = False,
) -> None:
"""More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
# Check encoder availability
if vcodec not in ["h264", "hevc", "libsvtav1"]:
raise ValueError(f"Unsupported video codec: {vcodec}. Supported codecs are: h264, hevc, libsvtav1.")

video_path = Path(video_path)
imgs_dir = Path(imgs_dir)
video_path.parent.mkdir(parents=True, exist_ok=True)

ffmpeg_args = OrderedDict(
[
("-f", "image2"),
("-r", str(fps)),
("-i", str(imgs_dir / "frame_%06d.png")),
("-vcodec", vcodec),
("-pix_fmt", pix_fmt),
]

video_path.parent.mkdir(parents=True, exist_ok=overwrite)

# Encoders/pixel formats incompatibility check
if (vcodec == "libsvtav1" or vcodec == "hevc") and pix_fmt == "yuv444p":
logging.warning(
f"Incompatible pixel format 'yuv444p' for codec {vcodec}, auto-selecting format 'yuv420p'"
)
pix_fmt = "yuv420p"

# Get input frames
template = "frame_" + ("[0-9]" * 6) + ".png"
input_list = sorted(
glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("_")[-1].split(".")[0])
)

# Define video output frame size (assuming all input frames are the same size)
if len(input_list) == 0:
raise FileNotFoundError(f"No images found in {imgs_dir}.")
dummy_image = Image.open(input_list[0])
width, height = dummy_image.size

# Define video codec options
video_options = {}

if g is not None:
ffmpeg_args["-g"] = str(g)
video_options["g"] = str(g)

if crf is not None:
ffmpeg_args["-crf"] = str(crf)
video_options["crf"] = str(crf)

if fast_decode:
key = "-svtav1-params" if vcodec == "libsvtav1" else "-tune"
key = "svtav1-params" if vcodec == "libsvtav1" else "tune"
value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
ffmpeg_args[key] = value
video_options[key] = value

# Set logging level
if log_level is not None:
ffmpeg_args["-loglevel"] = str(log_level)

ffmpeg_args = [item for pair in ffmpeg_args.items() for item in pair]
if overwrite:
ffmpeg_args.append("-y")

ffmpeg_cmd = ["ffmpeg"] + ffmpeg_args + [str(video_path)]
# redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal
subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL)
# "While less efficient, it is generally preferable to modify logging with Python’s logging"
logging.getLogger("libav").setLevel(log_level)

# Create and open output file (overwrite by default)
with av.open(str(video_path), "w") as output:
output_stream = output.add_stream(vcodec, fps, options=video_options)
output_stream.pix_fmt = pix_fmt
output_stream.width = width
output_stream.height = height

# Loop through input frames and encode them
for input_data in input_list:
input_image = Image.open(input_data).convert("RGB")
input_frame = av.VideoFrame.from_image(input_image)
packet = output_stream.encode(input_frame)
if packet:
output.mux(packet)

# Flush the encoder
packet = output_stream.encode()
if packet:
output.mux(packet)

# Reset logging level
if log_level is not None:
av.logging.restore_default_callback()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might inadvertently affect other parts of the codebase if PyAV is used elsewhere. For now this is not the case as this is the only file using it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we could agree on a logging level for PyAV to be set globally for all the files using it ? I don't know if this is something we might want.


if not video_path.exists():
raise OSError(
f"Video encoding did not work. File not found: {video_path}. "
f"Try running the command manually to debug: `{''.join(ffmpeg_cmd)}`"
)
raise OSError(f"Video encoding did not work. File not found: {video_path}.")


@dataclass
Expand Down Expand Up @@ -332,78 +363,68 @@ def __call__(self):


def get_audio_info(video_path: Path | str) -> dict:
ffprobe_audio_cmd = [
"ffprobe",
"-v",
"error",
"-select_streams",
"a:0",
"-show_entries",
"stream=channels,codec_name,bit_rate,sample_rate,bit_depth,channel_layout,duration",
"-of",
"json",
str(video_path),
]
result = subprocess.run(ffprobe_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
if result.returncode != 0:
raise RuntimeError(f"Error running ffprobe: {result.stderr}")

info = json.loads(result.stdout)
audio_stream_info = info["streams"][0] if info.get("streams") else None
if audio_stream_info is None:
return {"has_audio": False}

# Return the information, defaulting to None if no audio stream is present
return {
"has_audio": True,
"audio.channels": audio_stream_info.get("channels", None),
"audio.codec": audio_stream_info.get("codec_name", None),
"audio.bit_rate": int(audio_stream_info["bit_rate"]) if audio_stream_info.get("bit_rate") else None,
"audio.sample_rate": int(audio_stream_info["sample_rate"])
if audio_stream_info.get("sample_rate")
else None,
"audio.bit_depth": audio_stream_info.get("bit_depth", None),
"audio.channel_layout": audio_stream_info.get("channel_layout", None),
}
# Set logging level
logging.getLogger("libav").setLevel(av.logging.ERROR)

# Getting audio stream information
audio_info = {}
with av.open(str(video_path), "r") as audio_file:
try:
audio_stream = audio_file.streams.audio[0]
except IndexError:
# Reset logging level
av.logging.restore_default_callback()
return {"has_audio": False}

audio_info["audio.channels"] = audio_stream.channels
audio_info["audio.codec"] = audio_stream.codec.canonical_name
# In an ideal loseless case : bit depth x sample rate x channels = bit rate.
# In an actual compressed case, the bit rate is set according to the compression level : the lower the bit rate, the more compression is applied.
audio_info["audio.bit_rate"] = audio_stream.bit_rate
audio_info["audio.sample_rate"] = audio_stream.sample_rate # Number of samples per second
# In an ideal loseless case : fixed number of bits per sample.
# In an actual compressed case : variable number of bits per sample (often reduced to match a given depth rate).
audio_info["audio.bit_depth"] = audio_stream.format.bits
audio_info["audio.channel_layout"] = audio_stream.layout.name
audio_info["has_audio"] = True

# Reset logging level
av.logging.restore_default_callback()

return audio_info


def get_video_info(video_path: Path | str) -> dict:
ffprobe_video_cmd = [
"ffprobe",
"-v",
"error",
"-select_streams",
"v:0",
"-show_entries",
"stream=r_frame_rate,width,height,codec_name,nb_frames,duration,pix_fmt",
"-of",
"json",
str(video_path),
]
result = subprocess.run(ffprobe_video_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
if result.returncode != 0:
raise RuntimeError(f"Error running ffprobe: {result.stderr}")

info = json.loads(result.stdout)
video_stream_info = info["streams"][0]

# Calculate fps from r_frame_rate
r_frame_rate = video_stream_info["r_frame_rate"]
num, denom = map(int, r_frame_rate.split("/"))
fps = num / denom

pixel_channels = get_video_pixel_channels(video_stream_info["pix_fmt"])

video_info = {
"video.fps": fps,
"video.height": video_stream_info["height"],
"video.width": video_stream_info["width"],
"video.channels": pixel_channels,
"video.codec": video_stream_info["codec_name"],
"video.pix_fmt": video_stream_info["pix_fmt"],
"video.is_depth_map": False,
**get_audio_info(video_path),
}
# Set logging level
logging.getLogger("libav").setLevel(av.logging.ERROR)

# Getting video stream information
video_info = {}
with av.open(str(video_path), "r") as video_file:
try:
video_stream = video_file.streams.video[0]
except IndexError:
# Reset logging level
av.logging.restore_default_callback()
return {}

video_info["video.height"] = video_stream.height
video_info["video.width"] = video_stream.width
video_info["video.codec"] = video_stream.codec.canonical_name
video_info["video.pix_fmt"] = video_stream.pix_fmt
video_info["video.is_depth_map"] = False

# Calculate fps from r_frame_rate
video_info["video.fps"] = int(video_stream.base_rate)

pixel_channels = get_video_pixel_channels(video_stream.pix_fmt)
video_info["video.channels"] = pixel_channels

# Reset logging level
av.logging.restore_default_callback()

# Adding audio stream information
video_info.update(**get_audio_info(video_path))

return video_info

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ dependencies = [
"omegaconf>=2.3.0",
"opencv-python-headless>=4.9.0",
"packaging>=24.2",
"av>=12.0.5",
"av>=14.2.0",
"pymunk>=6.6.0",
"pynput>=1.7.7",
"pyzmq>=26.2.1",
Expand Down
Loading