[Core] Run garbage collector after CUDA graph capture to fix throughput regression (vllm-project#24128)

micah-wil · gshtras · web-flow · commit 1c63a16b653d · 2025-09-09T10:38:10.000-04:00
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -2885,6 +2885,7 @@ def freeze_gc():
             finally:
                 if should_freeze:
                     gc.unfreeze()
+                    gc.collect()
 
         # Trigger CUDA graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes