huggingface · ydshieh · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml
@@ -43,8 +43,12 @@ env:
   HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
   TF_FORCE_GPU_ALLOW_GROWTH: true
   CUDA_VISIBLE_DEVICES: 0,1
+  # K8S instance-sharing makes each runner report the full machine's CPU RAM (~750 GB). Cap it to the amount a
+  # dedicated A10 instance would see per accelerator (`conftest.py` scales it by the device count), so `device_map="auto"` produces a sane device map.
+  CI_CPU_MEMORY_LIMIT_GB: 60
   NO_COLOR: 1
 
+
 permissions:
   contents: read
 

diff --git a/conftest.py b/conftest.py
@@ -27,14 +27,31 @@
 from transformers.testing_utils import (
     HfDoctestModule,
     HfDocTestParser,
+    backend_device_count,
     is_torch_available,
+    patch_psutil_cpu_memory,
     patch_testing_methods_to_collect_info,
     patch_torch_compile_force_graph,
+    torch_device,
 )
 from transformers.utils import enable_tf32
 from transformers.utils.network_logging import register_network_debug_plugin
 
 
+# In K8S instance-sharing CI, each runner sees the full machine's CPU RAM (~750 GB) even though it only
+# owns a fraction. This causes `device_map="auto"` to overfill GPU+CPU with nothing offloaded to disk,
+# leading to GPU OOM at runtime. When CI_CPU_MEMORY_LIMIT_GB is set (to a non-blank value), cap
+# psutil.virtual_memory so the entire test session sees a realistic per-runner memory budget.
+# On multi-accelerator runners the budget scales linearly: each accelerator earns one full slot of CPU RAM
+# (e.g. 4 GPUs × 60 GB = 240 GB), because device_map="auto" can legitimately use more CPU RAM for
+# intermediate storage when more GPU devices are present.
+_cpu_memory_limit_gb = os.environ.get("CI_CPU_MEMORY_LIMIT_GB")
+if _cpu_memory_limit_gb and _cpu_memory_limit_gb.strip():  # unset/empty/blank all mean "no cap", not a crash
+    _limit_per_device = int(float(_cpu_memory_limit_gb) * 1024**3)
+    _num_accelerators = max(1, backend_device_count(torch_device)) if torch_device is not None else 1
+    patch_psutil_cpu_memory(_limit_per_device * _num_accelerators)
+
+
 NOT_DEVICE_TESTS = {
     "test_tokenization",
     "test_tokenization_mistral_common",

diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
@@ -3430,6 +3430,30 @@ def patched(*args, **kwargs):
         torch.compile = patched
 
 
+def patch_psutil_cpu_memory(limit_bytes: int):
+    """
+    Patch `psutil.virtual_memory` so the CPU memory it reports never exceeds `limit_bytes` (no restore hook).
+
+    In K8S instance-sharing CI, each runner sees the full machine's CPU RAM (~750 GB) even though it only
+    owns a fraction. This causes `device_map="auto"` to overfill GPU+CPU with nothing offloaded to disk,
+    leading to GPU OOM at runtime. Calling this function caps `total`, `available`, `used` and `free`, and
+    re-derives `percent` from the capped values, so the test session sees a coherent per-runner budget.
+    """
+    import psutil
+
+    _original_virtual_memory = psutil.virtual_memory
+
+    def _capped_virtual_memory():
+        mem = _original_virtual_memory()
+        total = min(mem.total, limit_bytes)
+        available = min(mem.available, total)
+        used, free = min(mem.used, total), min(mem.free, total)
+        percent = round(100 * (total - available) / total, 1) if total > 0 else 0.0  # psutil's own definition
+        return mem._replace(total=total, available=available, used=used, free=free, percent=percent)
+
+    psutil.virtual_memory = _capped_virtual_memory
+
+
 def _get_test_info():
     """
     Collect some information about the current test.

diff --git a/tests/models/phimoe/test_modeling_phimoe.py b/tests/models/phimoe/test_modeling_phimoe.py
@@ -14,6 +14,7 @@
 
 """Testing suite for the PyTorch PhiMoE model."""
 
+import tempfile
 import unittest
 
 from parameterized import parameterized
@@ -113,18 +114,27 @@ def test_model_rope_scaling_from_config(self, scaling_type):
 @require_torch
 class PhimoeIntegrationTest(unittest.TestCase):
     model = None
+    offload_dir = None
 
     @classmethod
     def get_model(cls):
         if cls.model is None:
+            cls.offload_dir = tempfile.TemporaryDirectory()
             cls.model = PhimoeForCausalLM.from_pretrained(
-                "microsoft/Phi-3.5-MoE-instruct", experts_implementation="eager", dtype="auto", device_map="auto"
+                "microsoft/Phi-3.5-MoE-instruct",
+                experts_implementation="eager",
+                dtype="auto",
+                device_map="auto",
+                offload_folder=cls.offload_dir.name,
             )
         return cls.model
 
     @classmethod
     def tearDownClass(cls):
         del cls.model
+        if cls.offload_dir is not None:
+            cls.offload_dir.cleanup()
+            cls.offload_dir = None
         cleanup(torch_device, gc_collect=True)
 
     def setUp(self):
@@ -144,8 +154,8 @@ def test_model_phimoe_instruct_logits(self):
 
         EXPECTED_OUTPUT = torch.tensor(
             [
-                    [-3.4844, -2.4531, -1.1719, 0.6055, -0.4922, -0.1001, 0.8086, -0.2422, 0.3477, -1.0078],
-                    [-0.9766, 0.1631, -0.5508, 2.3594, 0.7031, 3.1719, 0.4141, 0.2305, 0.6055, -2.1250],
+                [-3.5625, -2.4375, -1.3672, 0.3438, -0.7539, -0.4590, 0.6133, -0.4531, 0.2188, -1.2422],
+                [-0.9688, 0.3633, -0.4902, 2.3281, 0.6250, 3.1094, 0.3828, 0.1670, 0.5781, -2.1094],
             ]
         ).to(device=torch_device, dtype=output.dtype)  # fmt: skip