Commit fc82842

feat: cache capacity change option
1 parent ebb2b89 commit fc82842

6 files changed (+35 -5 lines)

.editorconfig

Lines changed: 3 additions & 0 deletions

@@ -8,6 +8,9 @@ indent_size = 2
 insert_final_newline = true
 trim_trailing_whitespace = true
 
+[*.py]
+indent_size = 4
+
 [*.{ts,js}]
 quote_type = single
 

docs/docs/install/environment-variables.md

Lines changed: 1 addition & 0 deletions

@@ -172,6 +172,7 @@ Redis (Sentinel) URL example JSON before encoding:
 | `MACHINE_LEARNING_RKNN` | Enable RKNN hardware acceleration if supported | `True` | machine learning |
 | `MACHINE_LEARNING_RKNN_THREADS` | How many threads of RKNN runtime should be spinned up while inferencing. | `1` | machine learning |
 | `MACHINE_LEARNING_MODEL_ARENA` | Pre-allocates CPU memory to avoid memory fragmentation | true | machine learning |
+| `MACHINE_LEARNING_OPENVINO_CACHE_CAPACITY` | The max number of image dimensions for which models have cached optimizations | `20` | machine learning |
 
 \*1: It is recommended to begin with this parameter when changing the concurrency levels of the machine learning service and then tune the other ones.
 

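For orientation, the new variable behaves like the other `MACHINE_LEARNING_` entries in this table: when it is unset, the service falls back to the documented default of `20`. Below is a minimal, purely illustrative Python check of that fallback; the variable name and default come from the row above, while the service itself resolves the value through its settings object rather than a direct lookup like this.

import os

# Hypothetical stand-alone check; the ML service reads this through its Settings class.
capacity = int(os.environ.get("MACHINE_LEARNING_OPENVINO_CACHE_CAPACITY", "20"))
print(capacity)  # 20 unless the variable is exported with another value
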
machine-learning/immich_ml/config.py

Lines changed: 1 addition & 0 deletions

@@ -70,6 +70,7 @@ class Settings(BaseSettings):
     rknn_threads: int = 1
     preload: PreloadModelData | None = None
     max_batch_size: MaxBatchSize | None = None
+    openvino_cache_capacity: int = 20
 
     @property
     def device_id(self) -> str:

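The new `openvino_cache_capacity` field is what the documented `MACHINE_LEARNING_OPENVINO_CACHE_CAPACITY` variable feeds into. The following is a rough sketch of that mapping, assuming `Settings` follows pydantic-settings semantics with a `MACHINE_LEARNING_` env prefix; that prefix is inferred from the variable names in the docs and is not shown in this hunk.

from pydantic_settings import BaseSettings, SettingsConfigDict

class DemoSettings(BaseSettings):
    # Mirrors the field added above; 20 applies when the variable is absent.
    openvino_cache_capacity: int = 20

    # Assumed prefix, matching the documented MACHINE_LEARNING_* variable names.
    model_config = SettingsConfigDict(env_prefix="MACHINE_LEARNING_", case_sensitive=False)

# With MACHINE_LEARNING_OPENVINO_CACHE_CAPACITY=40 exported, this prints 40; otherwise 20.
print(DemoSettings().openvino_cache_capacity)
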
machine-learning/immich_ml/sessions/ort.py

Lines changed: 10 additions & 1 deletion

@@ -5,6 +5,7 @@
 
 import numpy as np
 import onnxruntime as ort
+import orjson
 from numpy.typing import NDArray
 
 from immich_ml.models.constants import SUPPORTED_PROVIDERS
@@ -93,10 +94,18 @@ def _provider_options_default(self) -> list[dict[str, Any]]:
             case "CUDAExecutionProvider" | "ROCMExecutionProvider":
                 options = {"arena_extend_strategy": "kSameAsRequested", "device_id": settings.device_id}
             case "OpenVINOExecutionProvider":
+                openvino_dir = self.model_path.parent / "openvino"
+                openvino_dir.mkdir(parents=True, exist_ok=True)
+                device = f"GPU.{settings.device_id}"
                 options = {
                     "device_type": f"GPU.{settings.device_id}",
                     "precision": "FP32",
-                    "cache_dir": (self.model_path.parent / "openvino").as_posix(),
+                    "cache_dir": openvino_dir.as_posix(),
+                    "load_config": {
+                        device: {
+                            "CPU_RUNTIME_CACHE_CAPACITY": str(settings.openvino_cache_capacity)
+                        },
+                    },
                 }
             case "CoreMLExecutionProvider":
                 options = {

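To make the OpenVINO branch above concrete, here is a self-contained sketch of the per-provider options dictionary it builds, with illustrative values standing in for `settings.device_id`, `settings.openvino_cache_capacity`, and `self.model_path`. In `onnxruntime`, a list of such dictionaries is normally passed via the `provider_options` argument of `ort.InferenceSession` alongside `providers`.

from pathlib import Path

# Illustrative stand-ins for values the real code takes from settings and self.model_path.
model_path = Path("/cache/ViT-B-32__openai/model.onnx")
device_id = "0"       # settings.device_id
cache_capacity = 20   # settings.openvino_cache_capacity

# The real code also creates this directory: openvino_dir.mkdir(parents=True, exist_ok=True).
openvino_dir = model_path.parent / "openvino"
device = f"GPU.{device_id}"

options = {
    "device_type": device,
    "precision": "FP32",
    "cache_dir": openvino_dir.as_posix(),
    # Per the docs entry added in this commit, the capacity caps how many image
    # dimensions get cached model optimizations; it is forwarded as a string.
    "load_config": {device: {"CPU_RUNTIME_CACHE_CAPACITY": str(cache_capacity)}},
}
print(options)
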
machine-learning/test_main.py

Lines changed: 19 additions & 3 deletions

@@ -239,18 +239,31 @@ def test_sets_provider_kwarg(self) -> None:
         assert session.providers == providers
 
     @pytest.mark.ov_device_ids(["GPU.0", "CPU"])
-    def test_sets_default_provider_options(self, ov_device_ids: list[str]) -> None:
+    def test_sets_default_provider_options(self, ov_device_ids: list[str], mocker: MockerFixture) -> None:
         model_path = "/cache/ViT-B-32__openai/model.onnx"
+        mock_mkdir = mocker.patch.object(Path, "mkdir")
+        mock_write_bytes = mocker.patch.object(Path, "write_bytes")
+
         session = OrtSession(model_path, providers=["OpenVINOExecutionProvider", "CPUExecutionProvider"])
 
         assert session.provider_options == [
-            {"device_type": "GPU.0", "precision": "FP32", "cache_dir": "/cache/ViT-B-32__openai/openvino"},
+            {
+                "device_type": "GPU.0",
+                "precision": "FP32",
+                "cache_dir": "/cache/ViT-B-32__openai/openvino",
+                "load_config": "/cache/ViT-B-32__openai/textual/openvino/config.json",
+            },
             {"arena_extend_strategy": "kSameAsRequested"},
         ]
+        mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
+        mock_write_bytes.assert_called_once_with("""{"GPU.0":{"CPU_RUNTIME_CACHE_CAPACITY":"20"}}""".encode())
 
-    def test_sets_provider_options_for_openvino(self) -> None:
+    def test_sets_provider_options_for_openvino(self, mocker: MockerFixture) -> None:
         model_path = "/cache/ViT-B-32__openai/textual/model.onnx"
         os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1"
+        mock_mkdir = mocker.patch.object(Path, "mkdir")
+        mock_write_bytes = mocker.patch.object(Path, "write_bytes")
+        mocker.patch.object(settings, "openvino_cache_capacity", 10)
 
         session = OrtSession(model_path, providers=["OpenVINOExecutionProvider"])
 
@@ -259,8 +272,11 @@ def test_sets_provider_options_for_openvino(self) -> None:
                 "device_type": "GPU.1",
                 "precision": "FP32",
                 "cache_dir": "/cache/ViT-B-32__openai/textual/openvino",
+                "load_config": "/cache/ViT-B-32__openai/textual/openvino/config.json",
             }
         ]
+        mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
+        mock_write_bytes.assert_called_once_with("""{"GPU.1":{"CPU_RUNTIME_CACHE_CAPACITY":"10"}}""".encode())
 
     def test_sets_provider_options_for_cuda(self) -> None:
         os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1"

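The assertions above expect `mkdir` to be called for the cache directory and `write_bytes` to receive a compact JSON mapping of device to cache capacity, with `load_config` surfacing as a path to a `config.json` file. The visible `ort.py` hunk only shows `import orjson`, not the write itself, so the following is merely a sketch of how the asserted byte payload could be produced with `orjson`, under that assumption.

import orjson

payload = {"GPU.0": {"CPU_RUNTIME_CACHE_CAPACITY": "20"}}

# orjson.dumps returns compact bytes with no whitespace, which matches the
# b'{"GPU.0":{"CPU_RUNTIME_CACHE_CAPACITY":"20"}}' the first test asserts on.
assert orjson.dumps(payload) == """{"GPU.0":{"CPU_RUNTIME_CACHE_CAPACITY":"20"}}""".encode()
print(orjson.dumps(payload))
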
machine-learning/uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.
