|
13 | 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 | 14 | # See the License for the specific language governing permissions and
|
15 | 15 | # limitations under the License.
|
| 16 | +import glob |
16 | 17 | import importlib
|
17 |
| -import json |
18 | 18 | import logging
|
19 |
| -import subprocess |
20 | 19 | import warnings
|
21 |
| -from collections import OrderedDict |
22 | 20 | from dataclasses import dataclass, field
|
23 | 21 | from pathlib import Path
|
24 | 22 | from typing import Any, ClassVar
|
25 | 23 |
|
| 24 | +import av |
26 | 25 | import pyarrow as pa
|
27 | 26 | import torch
|
28 | 27 | import torchvision
|
@@ -252,51 +251,83 @@ def encode_video_frames(
|
252 | 251 | g: int | None = 2,
|
253 | 252 | crf: int | None = 30,
|
254 | 253 | fast_decode: int = 0,
|
255 |
| - log_level: str | None = "error", |
| 254 | + log_level: int | None = av.logging.ERROR, |
256 | 255 | overwrite: bool = False,
|
257 | 256 | ) -> None:
|
258 | 257 | """More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
|
| 258 | + # Check encoder availability |
| 259 | + if vcodec not in ["h264", "hevc", "libsvtav1"]: |
| 260 | + raise ValueError(f"Unsupported video codec: {vcodec}. Supported codecs are: h264, hevc, libsvtav1.") |
| 261 | + |
259 | 262 | video_path = Path(video_path)
|
260 | 263 | imgs_dir = Path(imgs_dir)
|
261 |
| - video_path.parent.mkdir(parents=True, exist_ok=True) |
262 |
| - |
263 |
| - ffmpeg_args = OrderedDict( |
264 |
| - [ |
265 |
| - ("-f", "image2"), |
266 |
| - ("-r", str(fps)), |
267 |
| - ("-i", str(imgs_dir / "frame_%06d.png")), |
268 |
| - ("-vcodec", vcodec), |
269 |
| - ("-pix_fmt", pix_fmt), |
270 |
| - ] |
| 264 | + |
| 265 | + video_path.parent.mkdir(parents=True, exist_ok=overwrite) |
| 266 | + |
| 267 | + # Encoders/pixel formats incompatibility check |
| 268 | + if (vcodec == "libsvtav1" or vcodec == "hevc") and pix_fmt == "yuv444p": |
| 269 | + logging.warning( |
| 270 | + f"Incompatible pixel format 'yuv444p' for codec {vcodec}, auto-selecting format 'yuv420p'" |
| 271 | + ) |
| 272 | + pix_fmt = "yuv420p" |
| 273 | + |
| 274 | + # Get input frames |
| 275 | + template = "frame_" + ("[0-9]" * 6) + ".png" |
| 276 | + input_list = sorted( |
| 277 | + glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("_")[-1].split(".")[0]) |
271 | 278 | )
|
272 | 279 |
|
| 280 | + # Define video output frame size (assuming all input frames are the same size) |
| 281 | + if len(input_list) == 0: |
| 282 | + raise FileNotFoundError(f"No images found in {imgs_dir}.") |
| 283 | + dummy_image = Image.open(input_list[0]) |
| 284 | + width, height = dummy_image.size |
| 285 | + |
| 286 | + # Define video codec options |
| 287 | + video_options = {} |
| 288 | + |
273 | 289 | if g is not None:
|
274 |
| - ffmpeg_args["-g"] = str(g) |
| 290 | + video_options["g"] = str(g) |
275 | 291 |
|
276 | 292 | if crf is not None:
|
277 |
| - ffmpeg_args["-crf"] = str(crf) |
| 293 | + video_options["crf"] = str(crf) |
278 | 294 |
|
279 | 295 | if fast_decode:
|
280 |
| - key = "-svtav1-params" if vcodec == "libsvtav1" else "-tune" |
| 296 | + key = "svtav1-params" if vcodec == "libsvtav1" else "tune" |
281 | 297 | value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
|
282 |
| - ffmpeg_args[key] = value |
| 298 | + video_options[key] = value |
283 | 299 |
|
| 300 | + # Set logging level |
284 | 301 | if log_level is not None:
|
285 |
| - ffmpeg_args["-loglevel"] = str(log_level) |
286 |
| - |
287 |
| - ffmpeg_args = [item for pair in ffmpeg_args.items() for item in pair] |
288 |
| - if overwrite: |
289 |
| - ffmpeg_args.append("-y") |
290 |
| - |
291 |
| - ffmpeg_cmd = ["ffmpeg"] + ffmpeg_args + [str(video_path)] |
292 |
| - # redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal |
293 |
| - subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL) |
| 302 | + # "While less efficient, it is generally preferable to modify logging with Python’s logging" |
| 303 | + logging.getLogger("libav").setLevel(log_level) |
| 304 | + |
| 305 | + # Create and open output file (overwrite by default) |
| 306 | + with av.open(str(video_path), "w") as output: |
| 307 | + output_stream = output.add_stream(vcodec, fps, options=video_options) |
| 308 | + output_stream.pix_fmt = pix_fmt |
| 309 | + output_stream.width = width |
| 310 | + output_stream.height = height |
| 311 | + |
| 312 | + # Loop through input frames and encode them |
| 313 | + for input_data in input_list: |
| 314 | + input_image = Image.open(input_data).convert("RGB") |
| 315 | + input_frame = av.VideoFrame.from_image(input_image) |
| 316 | + packet = output_stream.encode(input_frame) |
| 317 | + if packet: |
| 318 | + output.mux(packet) |
| 319 | + |
| 320 | + # Flush the encoder |
| 321 | + packet = output_stream.encode() |
| 322 | + if packet: |
| 323 | + output.mux(packet) |
| 324 | + |
| 325 | + # Reset logging level |
| 326 | + if log_level is not None: |
| 327 | + av.logging.restore_default_callback() |
294 | 328 |
|
295 | 329 | if not video_path.exists():
|
296 |
| - raise OSError( |
297 |
| - f"Video encoding did not work. File not found: {video_path}. " |
298 |
| - f"Try running the command manually to debug: `{''.join(ffmpeg_cmd)}`" |
299 |
| - ) |
| 330 | + raise OSError(f"Video encoding did not work. File not found: {video_path}.") |
300 | 331 |
|
301 | 332 |
|
302 | 333 | @dataclass
|
@@ -332,78 +363,68 @@ def __call__(self):
|
332 | 363 |
|
333 | 364 |
|
def get_audio_info(video_path: Path | str) -> dict:
    """Read the metadata of the first audio stream of a media file using PyAV.

    Args:
        video_path: Path of the media file to inspect.

    Returns:
        ``{"has_audio": False}`` when the file contains no audio stream. Otherwise a dict
        with ``has_audio`` set to ``True`` and the keys ``audio.channels``, ``audio.codec``,
        ``audio.bit_rate``, ``audio.sample_rate``, ``audio.bit_depth`` and
        ``audio.channel_layout``.
    """
    # Only let libav errors through while probing. "libav" is a regular Python logger, so
    # it expects Python logging levels (logging.ERROR); av.logging.ERROR lives on libav's
    # own numeric scale and would not filter at the intended threshold.
    libav_logger = logging.getLogger("libav")
    previous_level = libav_logger.level
    libav_logger.setLevel(logging.ERROR)

    try:
        with av.open(str(video_path), "r") as audio_file:
            try:
                audio_stream = audio_file.streams.audio[0]
            except IndexError:
                # The container has no audio stream at all.
                return {"has_audio": False}

            return {
                "audio.channels": audio_stream.channels,
                "audio.codec": audio_stream.codec.canonical_name,
                # In an ideal lossless case: bit depth x sample rate x channels = bit rate.
                # In an actual compressed case, the bit rate is set according to the
                # compression level: the lower the bit rate, the more compression is applied.
                "audio.bit_rate": audio_stream.bit_rate,
                # Number of samples per second.
                "audio.sample_rate": audio_stream.sample_rate,
                # In an ideal lossless case: fixed number of bits per sample.
                # In an actual compressed case: variable number of bits per sample
                # (often reduced to match a given depth rate).
                "audio.bit_depth": audio_stream.format.bits,
                "audio.channel_layout": audio_stream.layout.name,
                "has_audio": True,
            }
    finally:
        # Undo the logging changes on every exit path, including when opening or
        # reading the file raises.
        libav_logger.setLevel(previous_level)
        av.logging.restore_default_callback()
368 | 395 |
|
369 | 396 |
|
def get_video_info(video_path: Path | str) -> dict:
    """Read the metadata of the first video stream of a media file using PyAV.

    The result is completed with the audio metadata returned by ``get_audio_info``.

    Args:
        video_path: Path of the media file to inspect.

    Returns:
        An empty dict when the file contains no video stream. Otherwise a dict with the
        keys ``video.height``, ``video.width``, ``video.codec``, ``video.pix_fmt``,
        ``video.is_depth_map``, ``video.fps`` and ``video.channels``, plus every key
        returned by ``get_audio_info`` for the same file.
    """
    # Only let libav errors through while probing. "libav" is a regular Python logger, so
    # it expects Python logging levels (logging.ERROR); av.logging.ERROR lives on libav's
    # own numeric scale and would not filter at the intended threshold.
    libav_logger = logging.getLogger("libav")
    previous_level = libav_logger.level
    libav_logger.setLevel(logging.ERROR)

    try:
        with av.open(str(video_path), "r") as video_file:
            try:
                video_stream = video_file.streams.video[0]
            except IndexError:
                # The container has no video stream at all.
                return {}

            pix_fmt = video_stream.pix_fmt

            # Derive fps from the stream base rate. Whole rates stay integers, while
            # fractional rates (e.g. 30000/1001) are kept as floats instead of being
            # truncated. NOTE(review): base_rate is presumably a fractions.Fraction,
            # confirm against the PyAV version in use.
            base_rate = video_stream.base_rate
            whole_fps = int(base_rate)
            fps = whole_fps if base_rate == whole_fps else float(base_rate)

            video_info = {
                "video.height": video_stream.height,
                "video.width": video_stream.width,
                "video.codec": video_stream.codec.canonical_name,
                "video.pix_fmt": pix_fmt,
                "video.is_depth_map": False,
                "video.fps": fps,
                "video.channels": get_video_pixel_channels(pix_fmt),
            }
    finally:
        # Undo the logging changes on every exit path, including when opening or
        # reading the file raises.
        libav_logger.setLevel(previous_level)
        av.logging.restore_default_callback()

    # Adding audio stream information
    video_info.update(get_audio_info(video_path))

    return video_info
|
409 | 430 |
|
|
0 commit comments