From 60b13bb4297c9d16cdbf98269577c1d097c13598 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 14 Feb 2025 09:14:51 +0800 Subject: [PATCH 01/73] wip: download uv --- engine/services/engine_service.cc | 77 +++++++++++++++++++++++++++++++ engine/services/engine_service.h | 7 +++ engine/utils/engine_constants.h | 2 +- engine/utils/process/utils.cc | 12 ++--- 4 files changed, 91 insertions(+), 7 deletions(-) diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 93cd8605c..da603bbd2 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -223,10 +223,87 @@ cpp::result EngineService::UninstallEngineVariant( } } +cpp::result EngineService::DownloadPythonUv(const std::string& version) { + const std::string engine_name = kPythonEngine; + const std::string python_bin_path = file_manager_utils::GetEnginesContainerPath() / + engine_name / "bin"; + std::filesystem::create_directories(python_bin_path); + + const std::string uv_version = "0.5.30"; + + // NOTE: only works on MacOS and Linux + auto on_finished = [this, engine_name, python_bin_path, uv_version](const DownloadTask& finishedTask) { + // try to unzip the downloaded file + const std::string installer_path = finishedTask.items[0].localPath.string(); + CTL_INF("UV install script path: " << installer_path); + CTL_INF("Version: " << uv_version); + + // https://docs.astral.sh/uv/configuration/installer/ + // TODO: move env var mod logic to SpawnProcess() + // using env to set env vars + // should we download from here instead? https://github.com/astral-sh/uv/releases + std::vector command{"env", + "UV_UNMANAGED_INSTALL=" + python_bin_path, + "sh", + installer_path, + "-q"}; + const auto pid = cortex::process::SpawnProcess(command); + if (pid == -1) { + CTL_ERR("Failed to install uv"); + } + // wait for subprocess to finish + // TODO: need to check return status if successful + waitpid(pid, NULL, 0); + + std::filesystem::remove(installer_path); + + auto create_res = EngineService::UpsertEngine( + engine_name, + kLocal, "", "", uv_version, "", "Default", ""); + + if (create_res.has_value()) { + CTL_ERR("Failed to create engine entry: " << create_res->engine_name); + } else { + CTL_INF("Engine entry created successfully"); + } + + }; + + const std::string url = "https://astral.sh/uv/" + uv_version + "/install.sh"; + auto downloadTask = + DownloadTask{.id = "uv", + .type = DownloadType::Engine, + .items = {DownloadItem{ + .id = "uv", + .downloadUrl = url, + .localPath = python_bin_path + "/install.sh", + }}}; + + auto add_task_result = download_service_->AddTask(downloadTask, on_finished); + if (add_task_result.has_error()) { + return cpp::fail(add_task_result.error()); + } + return {}; +} + cpp::result EngineService::DownloadEngine( const std::string& engine, const std::string& version, const std::optional variant_name) { + if (engine == kLlamaRepo) { + return DownloadLlamaCpp(version, variant_name); + } else if (engine == kPythonEngine) { + return DownloadPythonUv(version); + } + // raise error here? + return {}; +} + +cpp::result EngineService::DownloadLlamaCpp( + const std::string& version, + const std::optional variant_name) { + + const std::string engine = kLlamaRepo; auto normalized_version = version == "latest" ? 
"latest" : string_utils::RemoveSubstring(version, "v"); diff --git a/engine/services/engine_service.h b/engine/services/engine_service.h index f98037bab..6cce1761b 100644 --- a/engine/services/engine_service.h +++ b/engine/services/engine_service.h @@ -164,6 +164,13 @@ class EngineService : public EngineServiceI { const std::string& engine, const std::string& version = "latest", const std::optional variant_name = std::nullopt); + cpp::result DownloadLlamaCpp( + const std::string& version = "latest", + const std::optional variant_name = std::nullopt); + + cpp::result DownloadPythonUv( + const std::string& version = "latest"); + cpp::result DownloadCuda(const std::string& engine, bool async = false); diff --git a/engine/utils/engine_constants.h b/engine/utils/engine_constants.h index 35368c519..3cad230bc 100644 --- a/engine/utils/engine_constants.h +++ b/engine/utils/engine_constants.h @@ -1,7 +1,7 @@ #pragma once constexpr const auto kLlamaEngine = "llama-cpp"; -constexpr const auto kPythonEngine = "python-engine"; +constexpr const auto kPythonEngine = "python"; constexpr const auto kOpenAiEngine = "openai"; constexpr const auto kAnthropicEngine = "anthropic"; diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index fef425803..1b80f856d 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -80,12 +80,12 @@ pid_t SpawnProcess(const std::vector& command) { auto argv = ConvertToArgv(command); // Use posix_spawn for cross-platform compatibility - auto spawn_result = posix_spawn(&pid, // pid output - command[0].c_str(), // executable path - NULL, // file actions - NULL, // spawn attributes - argv.data(), // argument vector - environ // environment (inherit) + auto spawn_result = posix_spawnp(&pid, // pid output + command[0].c_str(), // executable path + NULL, // file actions + NULL, // spawn attributes + argv.data(), // argument vector + environ // environment (inherit) ); if (spawn_result != 0) { From f9817c8833303b33eb2a4f406405e27718ee623a Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 14 Feb 2025 14:44:32 +0800 Subject: [PATCH 02/73] fix: has_value -> has_error --- engine/services/engine_service.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 42d61aab2..8d85f1079 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -261,7 +261,7 @@ cpp::result EngineService::DownloadPythonUv(const std::string engine_name, kLocal, "", "", uv_version, "", "Default", ""); - if (create_res.has_value()) { + if (create_res.has_error()) { CTL_ERR("Failed to create engine entry: " << create_res->engine_name); } else { CTL_INF("Engine entry created successfully"); From 2dbc29625712c236a4a6f2ef0f67291fb5b40406 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 18 Feb 2025 17:05:00 +0800 Subject: [PATCH 03/73] move uv stuff to python_engine. 
use uv to start process --- .../extensions/python-engine/python_engine.cc | 202 ++++++++++++------ .../extensions/python-engine/python_engine.h | 6 + engine/services/engine_service.cc | 73 +------ 3 files changed, 147 insertions(+), 134 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index d34f75c08..a1e8cec48 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -64,6 +64,63 @@ static size_t WriteCallback(char* ptr, size_t size, size_t nmemb, } // namespace +cpp::result DownloadUv(std::shared_ptr download_service) { + const std::string py_bin_path = file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; + std::filesystem::create_directories(py_bin_path); + + const std::string uv_version = "0.5.31"; + + // NOTE: only works on MacOS and Linux + auto on_finished = [py_bin_path, uv_version](const DownloadTask& finishedTask) { + // try to unzip the downloaded file + const std::string installer_path = finishedTask.items[0].localPath.string(); + CTL_INF("UV install script path: " << installer_path); + CTL_INF("Version: " << uv_version); + + // https://docs.astral.sh/uv/configuration/installer/ + // TODO: move env var mod logic to SpawnProcess() + // using env to set env vars + // should we download from here instead? https://github.com/astral-sh/uv/releases + std::vector command{"env", + "UV_UNMANAGED_INSTALL=" + py_bin_path, + "sh", + installer_path, + "-q"}; + const auto pid = cortex::process::SpawnProcess(command); + if (pid == -1) { + CTL_ERR("Failed to install uv"); + } + // wait for subprocess to finish + // TODO: need to check return status if successful + waitpid(pid, NULL, 0); + std::filesystem::remove(installer_path); + }; + + const std::string url = "https://astral.sh/uv/" + uv_version + "/install.sh"; + auto downloadTask = + DownloadTask{.id = "uv", + .type = DownloadType::Engine, + .items = {DownloadItem{ + .id = "uv", + .downloadUrl = url, + .localPath = py_bin_path + "/install.sh", + }}}; + + auto add_task_result = download_service->AddTask(downloadTask, on_finished); + if (add_task_result.has_error()) { + return cpp::fail(add_task_result.error()); + } + return {}; +} + +std::string GetUvPath() { + return file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; +} + +bool IsUvInstalled() { + return std::filesystem::exists(GetUvPath()); +} + PythonEngine::PythonEngine() : q_(4 /*n_parallel*/, "python_engine") {} PythonEngine::~PythonEngine() { @@ -237,74 +294,85 @@ void PythonEngine::LoadModel( return; } - if (!LoadModelConfig(model, model_path)) { - Json::Value error; - error["error"] = "Failed to load model configuration"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - callback(std::move(status), std::move(error)); - return; - } - auto model_config = models_[model]; - auto model_folder_path = model_config.files[0]; - auto data_folder_path = - std::filesystem::path(model_folder_path) / std::filesystem::path("venv"); + // loads yaml into models_ + // if (!LoadModelConfig(model, model_path)) { + // Json::Value error; + // error["error"] = "Failed to load model configuration"; + // Json::Value status; + // status["is_done"] = true; + // status["has_error"] = true; + // status["is_stream"] = false; + // status["status_code"] = k500InternalServerError; + // 
callback(std::move(status), std::move(error)); + // return; + // } + // auto model_config = models_[model]; + // auto model_folder_path = model_config.files[0]; + // CTL_INF(__func__ << ": model_folder_path=" << model_folder_path); + + // auto data_folder_path = + // std::filesystem::path(model_folder_path) / std::filesystem::path("venv"); try { -#if defined(_WIN32) - auto executable = std::filesystem::path(data_folder_path) / - std::filesystem::path("Scripts"); -#else - auto executable = - std::filesystem::path(data_folder_path) / std::filesystem::path("bin"); -#endif - - auto executable_str = - (executable / std::filesystem::path(model_config.command[0])).string(); - auto command = model_config.command; - command[0] = executable_str; - command.push_back((std::filesystem::path(model_folder_path) / - std::filesystem::path(model_config.script)) - .string()); - std::list args{"--port", - model_config.port, - "--log_path", - (file_manager_utils::GetCortexLogPath() / - std::filesystem::path(model_config.log_path)) - .string(), - "--log_level", - model_config.log_level}; - if (!model_config.extra_params.isNull() && - model_config.extra_params.isObject()) { - for (const auto& key : model_config.extra_params.getMemberNames()) { - const Json::Value& value = model_config.extra_params[key]; - - // Convert key to string with -- prefix - std::string param_key = "--" + key; - - // Handle different JSON value types - if (value.isString()) { - args.emplace_back(param_key); - args.emplace_back(value.asString()); - } else if (value.isInt()) { - args.emplace_back(param_key); - args.emplace_back(std::to_string(value.asInt())); - } else if (value.isDouble()) { - args.emplace_back(param_key); - args.emplace_back(std::to_string(value.asDouble())); - } else if (value.isBool()) { - // For boolean, only add the flag if true - if (value.asBool()) { - args.emplace_back(param_key); - } - } - } - } - - // Add the parsed arguments to the command - command.insert(command.end(), args.begin(), args.end()); +// #if defined(_WIN32) +// auto executable = std::filesystem::path(data_folder_path) / +// std::filesystem::path("Scripts"); +// #else +// auto executable = +// std::filesystem::path(data_folder_path) / std::filesystem::path("bin"); +// #endif + +// auto executable_str = +// (executable / std::filesystem::path(model_config.command[0])).string(); +// auto command = model_config.command; +// command[0] = executable_str; +// command.push_back((std::filesystem::path(model_folder_path) / +// std::filesystem::path(model_config.script)) +// .string()); +// std::list args{"--port", +// model_config.port, +// "--log_path", +// (file_manager_utils::GetCortexLogPath() / +// std::filesystem::path(model_config.log_path)) +// .string(), +// "--log_level", +// model_config.log_level}; +// if (!model_config.extra_params.isNull() && +// model_config.extra_params.isObject()) { +// for (const auto& key : model_config.extra_params.getMemberNames()) { +// const Json::Value& value = model_config.extra_params[key]; + +// // Convert key to string with -- prefix +// std::string param_key = "--" + key; + +// // Handle different JSON value types +// if (value.isString()) { +// args.emplace_back(param_key); +// args.emplace_back(value.asString()); +// } else if (value.isInt()) { +// args.emplace_back(param_key); +// args.emplace_back(std::to_string(value.asInt())); +// } else if (value.isDouble()) { +// args.emplace_back(param_key); +// args.emplace_back(std::to_string(value.asDouble())); +// } else if (value.isBool()) { +// // For boolean, only 
add the flag if true +// if (value.asBool()) { +// args.emplace_back(param_key); +// } +// } +// } +// } + + // // Add the parsed arguments to the command + // command.insert(command.end(), args.begin(), args.end()); + + std::string uv_path = GetUvPath(); + std::string entrypoint_path = std::filesystem::path(model_path).parent_path() / "main.py"; + std::vector command{uv_path, "run", entrypoint_path}; + + // TODO: what happens if the process exits? + // what should be expected from the subprocess + // TODO: stdout/stderr of subprocess pid = cortex::process::SpawnProcess(command); process_map_[model] = pid; if (pid == -1) { diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 70a9b9829..76d82c961 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -17,6 +17,7 @@ #include "utils/process_status_utils.h" #include "utils/curl_utils.h" #include "utils/process/utils.h" +#include "services/download_service.h" // Helper for CURL response namespace python_engine { @@ -31,6 +32,11 @@ struct CurlResponse { std::string error_message; }; +// UV-related functions +cpp::result DownloadUv(std::shared_ptr download_service); +std::string GetUvPath(); +bool IsUvInstalled(); + class PythonEngine : public EngineI { private: // Model configuration diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 8d85f1079..56c52c14f 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -223,69 +223,6 @@ cpp::result EngineService::UninstallEngineVariant( } } -cpp::result EngineService::DownloadPythonUv(const std::string& version) { - const std::string engine_name = kPythonEngine; - const std::string python_bin_path = file_manager_utils::GetEnginesContainerPath() / - engine_name / "bin"; - std::filesystem::create_directories(python_bin_path); - - const std::string uv_version = "0.5.30"; - - // NOTE: only works on MacOS and Linux - auto on_finished = [this, engine_name, python_bin_path, uv_version](const DownloadTask& finishedTask) { - // try to unzip the downloaded file - const std::string installer_path = finishedTask.items[0].localPath.string(); - CTL_INF("UV install script path: " << installer_path); - CTL_INF("Version: " << uv_version); - - // https://docs.astral.sh/uv/configuration/installer/ - // TODO: move env var mod logic to SpawnProcess() - // using env to set env vars - // should we download from here instead? 
https://github.com/astral-sh/uv/releases - std::vector command{"env", - "UV_UNMANAGED_INSTALL=" + python_bin_path, - "sh", - installer_path, - "-q"}; - const auto pid = cortex::process::SpawnProcess(command); - if (pid == -1) { - CTL_ERR("Failed to install uv"); - } - // wait for subprocess to finish - // TODO: need to check return status if successful - waitpid(pid, NULL, 0); - - std::filesystem::remove(installer_path); - - auto create_res = EngineService::UpsertEngine( - engine_name, - kLocal, "", "", uv_version, "", "Default", ""); - - if (create_res.has_error()) { - CTL_ERR("Failed to create engine entry: " << create_res->engine_name); - } else { - CTL_INF("Engine entry created successfully"); - } - - }; - - const std::string url = "https://astral.sh/uv/" + uv_version + "/install.sh"; - auto downloadTask = - DownloadTask{.id = "uv", - .type = DownloadType::Engine, - .items = {DownloadItem{ - .id = "uv", - .downloadUrl = url, - .localPath = python_bin_path + "/install.sh", - }}}; - - auto add_task_result = download_service_->AddTask(downloadTask, on_finished); - if (add_task_result.has_error()) { - return cpp::fail(add_task_result.error()); - } - return {}; -} - cpp::result EngineService::DownloadEngine( const std::string& engine, const std::string& version, const std::optional variant_name) { @@ -293,10 +230,10 @@ cpp::result EngineService::DownloadEngine( if (engine == kLlamaRepo) { return DownloadLlamaCpp(version, variant_name); } else if (engine == kPythonEngine) { - return DownloadPythonUv(version); + // ignore version and variant_name + return python_engine::DownloadUv(download_service_); } - // raise error here? - return {}; + return cpp::fail("Unknown engine " + engine); } cpp::result EngineService::DownloadLlamaCpp( @@ -988,9 +925,11 @@ cpp::result EngineService::IsEngineReady( return true; } - // End hard code // Check for python engine if (engine == kPythonEngine) { + if (!python_engine::IsUvInstalled()) { + return cpp::fail("Python engine is not ready. 
Please run `cortex engines install python`"); + } return true; } From eec24bd101f89b2b9593f0182fc64117588242d2 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 18 Feb 2025 18:31:53 +0800 Subject: [PATCH 04/73] redirect stdout/stderr --- .../extensions/python-engine/python_engine.cc | 62 ++++--------------- engine/utils/process/utils.cc | 39 +++++++++++- engine/utils/process/utils.h | 6 +- 3 files changed, 52 insertions(+), 55 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index a1e8cec48..3e01ab26e 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -321,59 +321,19 @@ void PythonEngine::LoadModel( // std::filesystem::path(data_folder_path) / std::filesystem::path("bin"); // #endif -// auto executable_str = -// (executable / std::filesystem::path(model_config.command[0])).string(); -// auto command = model_config.command; -// command[0] = executable_str; -// command.push_back((std::filesystem::path(model_folder_path) / -// std::filesystem::path(model_config.script)) -// .string()); -// std::list args{"--port", -// model_config.port, -// "--log_path", -// (file_manager_utils::GetCortexLogPath() / -// std::filesystem::path(model_config.log_path)) -// .string(), -// "--log_level", -// model_config.log_level}; -// if (!model_config.extra_params.isNull() && -// model_config.extra_params.isObject()) { -// for (const auto& key : model_config.extra_params.getMemberNames()) { -// const Json::Value& value = model_config.extra_params[key]; - -// // Convert key to string with -- prefix -// std::string param_key = "--" + key; - -// // Handle different JSON value types -// if (value.isString()) { -// args.emplace_back(param_key); -// args.emplace_back(value.asString()); -// } else if (value.isInt()) { -// args.emplace_back(param_key); -// args.emplace_back(std::to_string(value.asInt())); -// } else if (value.isDouble()) { -// args.emplace_back(param_key); -// args.emplace_back(std::to_string(value.asDouble())); -// } else if (value.isBool()) { -// // For boolean, only add the flag if true -// if (value.asBool()) { -// args.emplace_back(param_key); -// } -// } -// } -// } - - // // Add the parsed arguments to the command - // command.insert(command.end(), args.begin(), args.end()); - - std::string uv_path = GetUvPath(); - std::string entrypoint_path = std::filesystem::path(model_path).parent_path() / "main.py"; - std::vector command{uv_path, "run", entrypoint_path}; + const std::filesystem::path model_dir = std::filesystem::path(model_path).parent_path(); + std::vector command{GetUvPath(), "run", model_dir / "main.py"}; // TODO: what happens if the process exits? 
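One way to address the TODO above (an aside, not part of this patch — just a sketch): the spawned uv/Python server can be polled for liveness with a non-blocking waitpid(), or with the repo's existing process_status_utils::IsProcessRunning() helper that python_engine.h already includes. Names below (`pid`, `model`) follow the surrounding hunk.

    // Sketch: non-blocking check whether the spawned server has already exited.
    int child_status = 0;
    pid_t ret = waitpid(pid, &child_status, WNOHANG);
    if (ret == pid) {
      // Child is gone; WIFEXITED/WEXITSTATUS expose how it terminated.
      if (WIFEXITED(child_status)) {
        CTL_ERR("Python server for " << model << " exited with code "
                                     << WEXITSTATUS(child_status));
      }
    } else if (ret == 0) {
      // Still running.
    }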
- // what should be expected from the subprocess - // TODO: stdout/stderr of subprocess - pid = cortex::process::SpawnProcess(command); + const std::string stdout_path = model_dir / "stdout.txt"; + const std::string stderr_path = model_dir / "stderr.txt"; + + // create empty stdout.txt and stderr.txt for redirection + if (!std::filesystem::exists(stdout_path)) std::ofstream(stdout_path).flush(); + if (!std::filesystem::exists(stderr_path)) std::ofstream(stderr_path).flush(); + + pid = cortex::process::SpawnProcess(command, stdout_path, stderr_path); + process_map_[model] = pid; if (pid == -1) { std::unique_lock lock(models_mutex_); diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 1b80f856d..94433367b 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -1,8 +1,10 @@ #include "utils/process/utils.h" #include "utils/logging_utils.h" +#include #if defined(__APPLE__) || defined(__linux__) extern char **environ; // environment variables +#include #endif namespace cortex::process { @@ -34,7 +36,9 @@ std::vector ConvertToArgv(const std::vector& args) { return argv; } -pid_t SpawnProcess(const std::vector& command) { +pid_t SpawnProcess(const std::vector& command, + const std::optional stdout_file, + const std::optional stderr_file) { try { #if defined(_WIN32) // Windows process creation @@ -79,15 +83,46 @@ pid_t SpawnProcess(const std::vector& command) { // Convert command vector to char*[] auto argv = ConvertToArgv(command); + // redirect stdout and stderr + // caller should make sure the redirect files exist. + posix_spawn_file_actions_t *action_ptr = NULL; + + if (stdout_file.has_value() || stderr_file.has_value()) { + posix_spawn_file_actions_t action; + posix_spawn_file_actions_init(&action); + action_ptr = &action; + + if (stdout_file.has_value()) { + std::string stdout_file_val = stdout_file.value(); + if (std::filesystem::exists(stdout_file_val)) { + posix_spawn_file_actions_addopen(&action, STDOUT_FILENO, + stdout_file_val.data(), + O_WRONLY | O_APPEND, 0); + } + } + + if (stderr_file.has_value()) { + std::string stderr_file_val = stderr_file.value(); + if (std::filesystem::exists(stderr_file_val)) { + posix_spawn_file_actions_addopen(&action, STDERR_FILENO, + stderr_file_val.data(), + O_WRONLY | O_APPEND, 0); + } + } + } + // Use posix_spawn for cross-platform compatibility auto spawn_result = posix_spawnp(&pid, // pid output command[0].c_str(), // executable path - NULL, // file actions + action_ptr, // file actions NULL, // spawn attributes argv.data(), // argument vector environ // environment (inherit) ); + // NOTE: only destroy this when process ends? 
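On the NOTE above, a sketch of the usual lifetime (an aside, not part of this patch): POSIX only reads the file-actions object while posix_spawnp() sets up the child, so it can be destroyed as soon as the call returns; it just needs to be declared at function scope so that `action_ptr` still points at live storage when the spawn happens. The names below reuse the hunk above.

    posix_spawn_file_actions_t action;             // function scope, valid at spawn time
    posix_spawn_file_actions_t* action_ptr = NULL;
    if (stdout_file.has_value() || stderr_file.has_value()) {
      posix_spawn_file_actions_init(&action);
      action_ptr = &action;
      // ... posix_spawn_file_actions_addopen() calls for STDOUT_FILENO / STDERR_FILENO ...
    }

    auto spawn_result = posix_spawnp(&pid, command[0].c_str(), action_ptr, NULL,
                                     argv.data(), environ);
    if (action_ptr != NULL) {
      posix_spawn_file_actions_destroy(action_ptr);  // safe once posix_spawnp() has returned
    }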
+ // posix_spawn_file_actions_destroy(action_pr); + if (spawn_result != 0) { throw std::runtime_error("Failed to spawn process"); } diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index 9332607e9..54f34e919 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -20,6 +20,8 @@ std::string ConstructWindowsCommandLine(const std::vector& args); std::vector ConvertToArgv(const std::vector& args); -pid_t SpawnProcess(const std::vector& command); +pid_t SpawnProcess(const std::vector& command, + const std::optional stdout_file = {}, + const std::optional stderr_file = {}); -} \ No newline at end of file +} From 26fdbd399ec7fe43d6f23b4877916d5774c4ff99 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 18 Feb 2025 20:05:26 +0800 Subject: [PATCH 05/73] simplify code --- .../extensions/python-engine/python_engine.cc | 41 +++++-------------- engine/services/model_service.cc | 37 ++--------------- 2 files changed, 14 insertions(+), 64 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 3e01ab26e..8c6f6a7b7 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -266,11 +266,10 @@ void PythonEngine::GetModels( void PythonEngine::LoadModel( std::shared_ptr json_body, std::function&& callback) { - // TODO: handle a case that can spawn process but the process spawn fail. - pid_t pid; - if (!json_body->isMember("model") || !json_body->isMember("model_path")) { + + if (!json_body->isMember("model") || !json_body->isMember("model_dir")) { Json::Value error; - error["error"] = "Missing required fields: model or model_path"; + error["error"] = "Missing required fields: model or model_dir"; Json::Value status; status["is_done"] = true; status["has_error"] = true; @@ -280,8 +279,11 @@ void PythonEngine::LoadModel( return; } + namespace fs = std::filesystem; + const std::string& model = (*json_body)["model"].asString(); - const std::string& model_path = (*json_body)["model_path"].asString(); + const fs::path model_dir = (*json_body)["model_dir"].asString(); + if (models_.find(model) != models_.end()) { Json::Value error; error["error"] = "Model already loaded!"; @@ -294,6 +296,9 @@ void PythonEngine::LoadModel( return; } + // TODO: handle a case that can spawn process but the process spawn fail. + pid_t pid; + // loads yaml into models_ // if (!LoadModelConfig(model, model_path)) { // Json::Value error; @@ -310,18 +315,7 @@ void PythonEngine::LoadModel( // auto model_folder_path = model_config.files[0]; // CTL_INF(__func__ << ": model_folder_path=" << model_folder_path); - // auto data_folder_path = - // std::filesystem::path(model_folder_path) / std::filesystem::path("venv"); try { -// #if defined(_WIN32) -// auto executable = std::filesystem::path(data_folder_path) / -// std::filesystem::path("Scripts"); -// #else -// auto executable = -// std::filesystem::path(data_folder_path) / std::filesystem::path("bin"); -// #endif - - const std::filesystem::path model_dir = std::filesystem::path(model_path).parent_path(); std::vector command{GetUvPath(), "run", model_dir / "main.py"}; // TODO: what happens if the process exits? 
@@ -336,20 +330,7 @@ void PythonEngine::LoadModel( process_map_[model] = pid; if (pid == -1) { - std::unique_lock lock(models_mutex_); - if (models_.find(model) != models_.end()) { - models_.erase(model); - } - - Json::Value error; - error["error"] = "Fail to spawn process with pid -1"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - callback(std::move(status), std::move(error)); - return; + throw std::runtime_error("Fail to spawn process with pid -1"); } } catch (const std::exception& e) { std::unique_lock lock(models_mutex_); diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 6dc1642fb..15d5a8dc6 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -805,38 +805,13 @@ cpp::result ModelService::StartModel( // Check if Python model first if (mc.engine == kPythonEngine) { - - config::PythonModelConfig python_model_config; - python_model_config.ReadFromYaml( - - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string()); - // Start all depends model - auto depends = python_model_config.depends; - for (auto& depend : depends) { - Json::Value temp; - auto res = StartModel(depend, temp, false); - if (res.has_error()) { - CTL_WRN("Error: " + res.error()); - for (auto& depend : depends) { - if (depend != model_handle) { - StopModel(depend); - } - } - return cpp::fail("Model failed to start dependency '" + depend + - "' : " + res.error()); - } - } + const std::string model_yaml_path = model_entry.value().path_to_model_yaml; json_data["model"] = model_handle; - json_data["model_path"] = - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string(); + json_data["model_dir"] = fmu::ToAbsoluteCortexDataPath( + fs::path(model_yaml_path).parent_path()).string(); json_data["engine"] = mc.engine; assert(!!inference_svc_); - // Check if python engine auto ir = inference_svc_->LoadModel(std::make_shared(json_data)); @@ -848,12 +823,6 @@ cpp::result ModelService::StartModel( } else if (status == drogon::k409Conflict) { CTL_INF("Model '" + model_handle + "' is already loaded"); return StartModelResult{.success = true, .warning = ""}; - } else { - // only report to user the error - for (auto& depend : depends) { - - StopModel(depend); - } } CTL_ERR("Model failed to start with status code: " << status); return cpp::fail("Model failed to start: " + From 3ba79942dfdf9596b5d345b9288e2ef14965b33d Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 09:08:39 +0800 Subject: [PATCH 06/73] rename python engine interface --- engine/cortex-common/{cortexpythoni.h => python_enginei.h} | 7 +++---- engine/services/engine_service.h | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) rename engine/cortex-common/{cortexpythoni.h => python_enginei.h} (87%) diff --git a/engine/cortex-common/cortexpythoni.h b/engine/cortex-common/python_enginei.h similarity index 87% rename from engine/cortex-common/cortexpythoni.h rename to engine/cortex-common/python_enginei.h index 06a79838f..54e79bf2a 100644 --- a/engine/cortex-common/cortexpythoni.h +++ b/engine/cortex-common/python_enginei.h @@ -5,9 +5,9 @@ #include "json/value.h" -class CortexPythonEngineI { +class PythonEngineI { public: - virtual ~CortexPythonEngineI() {} + virtual ~PythonEngineI() {} virtual bool IsSupported(const std::string& f) = 0; @@ -17,6 +17,5 @@ class CortexPythonEngineI { virtual void 
HandlePythonFileExecutionRequest( std::shared_ptr json_body, - std::function&& callback) = 0; + std::function&& callback) = 0; }; - diff --git a/engine/services/engine_service.h b/engine/services/engine_service.h index 6cce1761b..a8d5415a0 100644 --- a/engine/services/engine_service.h +++ b/engine/services/engine_service.h @@ -9,7 +9,7 @@ #include "common/engine_servicei.h" #include "cortex-common/EngineI.h" -#include "cortex-common/cortexpythoni.h" +#include "cortex-common/python_enginei.h" #include "cortex-common/remote_enginei.h" #include "database/engines.h" #include "services/database_service.h" @@ -37,7 +37,7 @@ struct EngineUpdateResult { } }; -using EngineV = std::variant; +using EngineV = std::variant; class EngineService : public EngineServiceI { private: From 5e7125f09afb7d0cad4aadff5346a1300c51d396 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 09:31:24 +0800 Subject: [PATCH 07/73] use PythonEngineI --- engine/cortex-common/python_enginei.h | 20 +- .../extensions/python-engine/python_engine.cc | 717 +----------------- .../extensions/python-engine/python_engine.h | 69 +- engine/services/inference_service.cc | 74 +- engine/services/inference_service.h | 4 + 5 files changed, 69 insertions(+), 815 deletions(-) diff --git a/engine/cortex-common/python_enginei.h b/engine/cortex-common/python_enginei.h index 54e79bf2a..31dc76c80 100644 --- a/engine/cortex-common/python_enginei.h +++ b/engine/cortex-common/python_enginei.h @@ -9,13 +9,23 @@ class PythonEngineI { public: virtual ~PythonEngineI() {} - virtual bool IsSupported(const std::string& f) = 0; + // virtual bool IsSupported(const std::string& f) = 0; - virtual void ExecutePythonFile(std::string binary_execute_path, - std::string file_execution_path, - std::string python_library_path) = 0; + // virtual void ExecutePythonFile(std::string binary_execute_path, + // std::string file_execution_path, + // std::string python_library_path) = 0; - virtual void HandlePythonFileExecutionRequest( + // virtual void HandlePythonFileExecutionRequest( + // std::shared_ptr json_body, + // std::function&& callback) = 0; + + virtual void LoadModel( + std::shared_ptr json_body, + std::function&& callback) = 0; + + virtual void HandleRequest( + const std::string& model, + const std::vector& path_parts, std::shared_ptr json_body, std::function&& callback) = 0; }; diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 8c6f6a7b7..7ab970127 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -64,7 +64,7 @@ static size_t WriteCallback(char* ptr, size_t size, size_t nmemb, } // namespace -cpp::result DownloadUv(std::shared_ptr download_service) { +cpp::result DownloadUv(std::shared_ptr& download_service) { const std::string py_bin_path = file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; std::filesystem::create_directories(py_bin_path); @@ -127,142 +127,6 @@ PythonEngine::~PythonEngine() { curl_global_cleanup(); } -config::PythonModelConfig* PythonEngine::GetModelConfig( - const std::string& model) { - std::shared_lock lock(models_mutex_); - auto it = models_.find(model); - if (it != models_.end()) { - return &it->second; - } - return nullptr; -} - -bool PythonEngine::TerminateModelProcess(const std::string& model) { - auto it = process_map_.find(model); - if (it == process_map_.end()) { - LOG_ERROR << "No process found for model: " << model - << ", removing from list running models."; - 
models_.erase(model); - return false; - } - -#if defined(_WIN32) - HANDLE hProcess = OpenProcess(PROCESS_TERMINATE, FALSE, it->second); - if (hProcess == NULL) { - LOG_ERROR << "Failed to open process"; - return false; - } - - bool terminated = TerminateProcess(hProcess, 0) == TRUE; - CloseHandle(hProcess); - - if (terminated) { - process_map_.erase(it); - return true; - } - -#elif defined(__APPLE__) || defined(__linux__) - int result = kill(it->second, SIGTERM); - if (result == 0) { - process_map_.erase(it); - return true; - } -#endif - - return false; -} - -CurlResponse PythonEngine::MakeGetRequest(const std::string& model, - const std::string& path) { - auto const& config = models_[model]; - std::string full_url = "http://localhost:" + config.port + path; - CurlResponse response; - - auto result = curl_utils::SimpleRequest(full_url, RequestType::GET); - if (result.has_error()) { - response.error = true; - response.error_message = result.error(); - } else { - response.body = result.value(); - } - return response; -} - -CurlResponse PythonEngine::MakeDeleteRequest(const std::string& model, - const std::string& path) { - auto const& config = models_[model]; - std::string full_url = "http://localhost:" + config.port + path; - CurlResponse response; - - auto result = curl_utils::SimpleRequest(full_url, RequestType::DEL); - - if (result.has_error()) { - response.error = true; - response.error_message = result.error(); - } else { - response.body = result.value(); - } - - return response; -} - -CurlResponse PythonEngine::MakePostRequest(const std::string& model, - const std::string& path, - const std::string& body) { - auto const& config = models_[model]; - std::string full_url = "http://localhost:" + config.port + path; - - CurlResponse response; - auto result = curl_utils::SimpleRequest(full_url, RequestType::POST, body); - - if (result.has_error()) { - response.error = true; - response.error_message = result.error(); - } else { - response.body = result.value(); - } - return response; -} - -bool PythonEngine::LoadModelConfig(const std::string& model, - const std::string& yaml_path) { - try { - config::PythonModelConfig config; - config.ReadFromYaml(yaml_path); - std::unique_lock lock(models_mutex_); - models_[model] = config; - } catch (const std::exception& e) { - LOG_ERROR << "Failed to load model config: " << e.what(); - return false; - } - - return true; -} - -void PythonEngine::GetModels( - std::shared_ptr json_body, - std::function&& callback) { - - Json::Value response_json; - Json::Value model_array(Json::arrayValue); - - for (const auto& pair : models_) { - auto val = pair.second.ToJson(); - model_array.append(val); - } - - response_json["object"] = "list"; - response_json["data"] = model_array; - - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(response_json)); -} - void PythonEngine::LoadModel( std::shared_ptr json_body, std::function&& callback) { @@ -296,25 +160,8 @@ void PythonEngine::LoadModel( return; } - // TODO: handle a case that can spawn process but the process spawn fail. 
pid_t pid; - // loads yaml into models_ - // if (!LoadModelConfig(model, model_path)) { - // Json::Value error; - // error["error"] = "Failed to load model configuration"; - // Json::Value status; - // status["is_done"] = true; - // status["has_error"] = true; - // status["is_stream"] = false; - // status["status_code"] = k500InternalServerError; - // callback(std::move(status), std::move(error)); - // return; - // } - // auto model_config = models_[model]; - // auto model_folder_path = model_config.files[0]; - // CTL_INF(__func__ << ": model_folder_path=" << model_folder_path); - try { std::vector command{GetUvPath(), "run", model_dir / "main.py"}; @@ -360,568 +207,24 @@ void PythonEngine::LoadModel( callback(std::move(status), std::move(response)); } -void PythonEngine::UnloadModel( - std::shared_ptr json_body, - std::function&& callback) { - if (!json_body->isMember("model")) { - Json::Value error; - error["error"] = "Missing required field: model"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - auto model = (*json_body)["model"].asString(); +void PythonEngine::HandleRequest( + const std::string& model, + const std::vector& path_parts, + std::shared_ptr json_body, + std::function&& callback) { - { - if (TerminateModelProcess(model)) { - std::unique_lock lock(models_mutex_); - models_.erase(model); - } else { - Json::Value error; - error["error"] = "Fail to terminate process with id: " + - std::to_string(process_map_[model]); - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - } - - Json::Value response; - response["status"] = "Model unloaded successfully"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - callback(std::move(status), std::move(response)); -} - -void PythonEngine::HandleChatCompletion( - std::shared_ptr json_body, - std::function&& callback) { - LOG_WARN << "Does not support yet!"; -} - -CurlResponse PythonEngine::MakeStreamPostRequest( - const std::string& model, const std::string& path, const std::string& body, - const std::function& callback) { - auto const& config = models_[model]; - CURL* curl = curl_easy_init(); - CurlResponse response; - - if (!curl) { - response.error = true; - response.error_message = "Failed to initialize CURL"; - return response; - } - - std::string full_url = "http://localhost:" + config.port + path; - - struct curl_slist* headers = nullptr; - headers = curl_slist_append(headers, "Content-Type: application/json"); - headers = curl_slist_append(headers, "Accept: text/event-stream"); - headers = curl_slist_append(headers, "Cache-Control: no-cache"); - headers = curl_slist_append(headers, "Connection: keep-alive"); - - StreamContext context{ - std::make_shared>( - callback), - ""}; - - curl_easy_setopt(curl, CURLOPT_URL, full_url.c_str()); - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - curl_easy_setopt(curl, CURLOPT_POST, 1L); - curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str()); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, StreamWriteCallback); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, &context); - curl_easy_setopt(curl, CURLOPT_TRANSFER_ENCODING, 1L); - - CURLcode res = curl_easy_perform(curl); 
- - if (res != CURLE_OK) { - response.error = true; - response.error_message = curl_easy_strerror(res); - - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = true; - status["status_code"] = 500; - - Json::Value error; - error["error"] = response.error_message; - callback(std::move(status), std::move(error)); - } - - curl_slist_free_all(headers); - curl_easy_cleanup(curl); - return response; -} - -void PythonEngine::HandleInference( - std::shared_ptr json_body, - std::function&& callback) { - if (json_body && !json_body->isMember("model")) { - Json::Value error; - error["error"] = "Missing required field: model is required!"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - std::string method = "post"; - std::string path = "/inference"; - auto transform_request = (*json_body).get("transform_request", "").asString(); - auto transform_response = - (*json_body).get("transform_response", "").asString(); - auto model = (*json_body)["model"].asString(); - auto& body = (*json_body)["body"]; - - if (models_.find(model) == models_.end()) { - Json::Value error; - error["error"] = "Model '" + model + "' is not loaded!"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - // Transform Request - std::string transformed_request; - if (!transform_request.empty()) { - - try { - // Validate JSON body - if (!body || body.isNull()) { - throw std::runtime_error("Invalid or null JSON body"); - } - - // Render with error handling - try { - transformed_request = renderer_.Render(transform_request, body); - - } catch (const std::exception& e) { - throw std::runtime_error("Template rendering error: " + - std::string(e.what())); - } - } catch (const std::exception& e) { - // Log error and potentially rethrow or handle accordingly - LOG_WARN << "Error in TransformRequest: " << e.what(); - LOG_WARN << "Using original request body"; - transformed_request = body.toStyledString(); - } - } else { - transformed_request = body.toStyledString(); - } - - // End Transform request - - CurlResponse response; - if (method == "post") { - if (body.isMember("stream") && body["stream"].asBool()) { - q_.runTaskInQueue( - [this, model, path, transformed_request, cb = std::move(callback)] { - MakeStreamPostRequest(model, path, transformed_request, cb); - }); - - return; - } else { - response = MakePostRequest(model, path, transformed_request); - } - - } else if (method == "get") { - response = MakeGetRequest(model, path); - } else if (method == "delete") { - response = MakeDeleteRequest(model, path); - } else { - Json::Value error; - error["error"] = - "method not supported! 
Supported methods are: post, get, delete"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - if (response.error) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - Json::Value error; - error["error"] = response.error_message; - callback(std::move(status), std::move(error)); - return; - } + // get port Json::Value response_json; - Json::Reader reader; - if (!reader.parse(response.body, response_json)) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - Json::Value error; - error["error"] = "Failed to parse response"; - callback(std::move(status), std::move(error)); - return; - } - - if (!transform_response.empty()) { - // Transform Response - std::string response_str; - try { - // Validate JSON body - if (!response_json || response_json.isNull()) { - throw std::runtime_error("Invalid or null JSON body"); - } - // Render with error handling - try { - response_str = renderer_.Render(transform_response, response_json); - } catch (const std::exception& e) { - throw std::runtime_error("Template rendering error: " + - std::string(e.what())); - } - } catch (const std::exception& e) { - // Log error and potentially rethrow or handle accordingly - LOG_WARN << "Error in TransformRequest: " << e.what(); - LOG_WARN << "Using original request body"; - response_str = response_json.toStyledString(); - } - - Json::Reader reader_final; - Json::Value response_json_final; - if (!reader_final.parse(response_str, response_json_final)) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - Json::Value error; - error["error"] = "Failed to parse response"; - callback(std::move(status), std::move(error)); - return; - } - - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(response_json_final)); - } else { - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(response_json)); - } -} - -Json::Value PythonEngine::GetRemoteModels() { - return Json::Value(); -} - -void PythonEngine::StopInferencing(const std::string& model_id) {} - -void PythonEngine::HandleRouteRequest( - std::shared_ptr json_body, - std::function&& callback) { - if (!json_body->isMember("model") || !json_body->isMember("method") || - !json_body->isMember("path")) { - Json::Value error; - error["error"] = - "Missing required field: model, method and path are required!"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - auto method = (*json_body)["method"].asString(); - auto path = (*json_body)["path"].asString(); - auto transform_request = (*json_body).get("transform_request", "").asString(); - auto transform_response = - (*json_body).get("transform_response", "").asString(); - auto model = (*json_body)["model"].asString(); - auto& 
body = (*json_body)["body"]; - - if (models_.find(model) == models_.end()) { - Json::Value error; - error["error"] = "Model '" + model + "' is not loaded!"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - // Transform Request - std::string transformed_request; - if (!transform_request.empty()) { - - try { - // Validate JSON body - if (!body || body.isNull()) { - throw std::runtime_error("Invalid or null JSON body"); - } - - // Render with error handling - try { - transformed_request = renderer_.Render(transform_request, *json_body); - } catch (const std::exception& e) { - throw std::runtime_error("Template rendering error: " + - std::string(e.what())); - } - } catch (const std::exception& e) { - // Log error and potentially rethrow or handle accordingly - LOG_WARN << "Error in TransformRequest: " << e.what(); - LOG_WARN << "Using original request body"; - transformed_request = body.toStyledString(); - } - } else { - transformed_request = body.toStyledString(); - } - - // End Transform request - - CurlResponse response; - if (method == "post") { - response = MakePostRequest(model, path, transformed_request); - } else if (method == "get") { - response = MakeGetRequest(model, path); - } else if (method == "delete") { - response = MakeDeleteRequest(model, path); - } else { - Json::Value error; - error["error"] = - "method not supported! Supported methods are: post, get, delete"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - if (response.error) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - Json::Value error; - error["error"] = response.error_message; - callback(std::move(status), std::move(error)); - return; - } - - Json::Value response_json; - Json::Reader reader; - if (!reader.parse(response.body, response_json)) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - Json::Value error; - error["error"] = "Failed to parse response"; - callback(std::move(status), std::move(error)); - return; - } - - if (!transform_response.empty()) { - // Transform Response - std::string response_str; - try { - // Validate JSON body - if (!response_json || response_json.isNull()) { - throw std::runtime_error("Invalid or null JSON body"); - } - // Render with error handling - try { - response_str = renderer_.Render(transform_response, response_json); - } catch (const std::exception& e) { - throw std::runtime_error("Template rendering error: " + - std::string(e.what())); - } - } catch (const std::exception& e) { - // Log error and potentially rethrow or handle accordingly - LOG_WARN << "Error in TransformRequest: " << e.what(); - LOG_WARN << "Using original request body"; - response_str = response_json.toStyledString(); - } - - Json::Reader reader_final; - Json::Value response_json_final; - if (!reader_final.parse(response_str, response_json_final)) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - Json::Value error; - error["error"] = 
"Failed to parse response"; - callback(std::move(status), std::move(error)); - return; - } - - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(response_json_final)); - } else { - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(response_json)); - } -} - -void PythonEngine::GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) { - if (!json_body->isMember("model")) { - Json::Value error; - error["error"] = "Missing required field: model"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - auto model = json_body->get("model", "").asString(); - auto model_config = models_[model]; - auto health_endpoint = model_config.heath_check; - auto pid = process_map_[model]; - auto is_process_live = process_status_utils::IsProcessRunning(pid); - auto response_health = MakeGetRequest(model, health_endpoint.path); - - if (response_health.error && is_process_live) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - Json::Value message; - message["message"] = "model '"+model+"' is loading"; - callback(std::move(status), std::move(message)); - return; - } - else if(response_health.error && !is_process_live){ - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - Json::Value message; - message["message"] = response_health.error_message; - callback(std::move(status), std::move(message)); - return; - } - - Json::Value response; - response["model"] = model; - response["model_loaded"] = true; - response["model_data"] = model_config.ToJson(); + response_json["object"] = "list"; Json::Value status; status["is_done"] = true; status["has_error"] = false; status["is_stream"] = false; status["status_code"] = k200OK; - callback(std::move(status), std::move(response)); -} -// Implement remaining virtual functions -void PythonEngine::HandleEmbedding( - std::shared_ptr, - std::function&& callback) { - callback(Json::Value(), Json::Value()); -} - -bool PythonEngine::IsSupported(const std::string& f) { - if (f == "HandleChatCompletion" || f == "LoadModel" || f == "UnloadModel" || - f == "GetModelStatus" || f == "GetModels" || f == "SetFileLogger" || - f == "SetLogLevel") { - return true; - } - return false; -} - -bool PythonEngine::SetFileLogger(int max_log_lines, - const std::string& log_path) { - if (!async_file_logger_) { - async_file_logger_ = std::make_unique(); - } - - async_file_logger_->setFileName(log_path); - async_file_logger_->setMaxLines(max_log_lines); // Keep last 100000 lines - async_file_logger_->startLogging(); - trantor::Logger::setOutputFunction( - [&](const char* msg, const uint64_t len) { - if (async_file_logger_) - async_file_logger_->output_(msg, len); - }, - [&]() { - if (async_file_logger_) - async_file_logger_->flush(); - }); - freopen(log_path.c_str(), "w", stderr); - freopen(log_path.c_str(), "w", stdout); - return true; -} - -void PythonEngine::SetLogLevel(trantor::Logger::LogLevel log_level) { - trantor::Logger::setLogLevel(log_level); + 
callback(std::move(status), std::move(response_json)); } -void PythonEngine::Load(EngineLoadOption opts) { - // Develop register model here on loading engine -}; - -void PythonEngine::Unload(EngineUnloadOption opts) { - for (const auto& pair : models_) { - TerminateModelProcess(pair.first); - } -}; - -} // namespace python_engine \ No newline at end of file +} // namespace python_engine diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 76d82c961..bf993bcbe 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -10,7 +10,7 @@ #include "config/model_config.h" #include "trantor/utils/ConcurrentTaskQueue.h" -#include "cortex-common/EngineI.h" +#include "cortex-common/python_enginei.h" #include "extensions/template_renderer.h" #include "utils/file_logger.h" #include "utils/file_manager_utils.h" @@ -33,11 +33,11 @@ struct CurlResponse { }; // UV-related functions -cpp::result DownloadUv(std::shared_ptr download_service); +cpp::result DownloadUv(std::shared_ptr& download_service); std::string GetUvPath(); bool IsUvInstalled(); -class PythonEngine : public EngineI { +class PythonEngine : public PythonEngineI { private: // Model configuration @@ -49,69 +49,18 @@ class PythonEngine : public EngineI { std::unordered_map process_map_; trantor::ConcurrentTaskQueue q_; - // Helper functions - CurlResponse MakePostRequest(const std::string& model, - const std::string& path, - const std::string& body); - CurlResponse MakeGetRequest(const std::string& model, - const std::string& path); - CurlResponse MakeDeleteRequest(const std::string& model, - const std::string& path); - CurlResponse MakeStreamPostRequest( - const std::string& model, const std::string& path, - const std::string& body, - const std::function& callback); - - // Process manager functions - bool TerminateModelProcess(const std::string& model); - - // Internal model management - bool LoadModelConfig(const std::string& model, const std::string& yaml_path); - config::PythonModelConfig* GetModelConfig(const std::string& model); - public: PythonEngine(); ~PythonEngine(); - void Load(EngineLoadOption opts) override; - - void Unload(EngineUnloadOption opts) override; - - // Main interface implementations - void GetModels( - std::shared_ptr json_body, - std::function&& callback) override; - - void HandleChatCompletion( - std::shared_ptr json_body, - std::function&& callback) override; - void LoadModel( - std::shared_ptr json_body, - std::function&& callback) override; - - void UnloadModel( - std::shared_ptr json_body, - std::function&& callback) override; + std::shared_ptr json_body, + std::function&& callback) override; - void GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) override; - - // Other required virtual functions - void HandleEmbedding( - std::shared_ptr json_body, - std::function&& callback) override; - bool IsSupported(const std::string& feature) override; - bool SetFileLogger(int max_log_lines, const std::string& log_path) override; - void SetLogLevel(trantor::Logger::LogLevel logLevel) override; - void HandleRouteRequest( - std::shared_ptr json_body, - std::function&& callback) override; - void HandleInference( + void HandleRequest( + const std::string& model, + const std::vector& path_parts, std::shared_ptr json_body, std::function&& callback) override; - Json::Value GetRemoteModels() override; - void StopInferencing(const std::string& model_id) override; }; -} // namespace 
python_engine \ No newline at end of file +} // namespace python_engine diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index 4ea9ebdfd..07bd3a306 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -182,6 +182,30 @@ cpp::result InferenceService::HandleRouteRequest( return {}; } +InferResult InferenceService::HandlePython( + const std::string& model, const std::vector& path_parts, + std::shared_ptr json_body) { + + Json::Value stt, res; + + auto engine_result = engine_service_->GetLoadedEngine(kPythonEngine); + if (engine_result.has_error()) { + res["message"] = "Python engine is not loaded yet"; + stt["status_code"] = drogon::k400BadRequest; + LOG_WARN << "Python engine is not loaded yet"; + return std::make_pair(stt, res); + } + + auto cb = [&stt, &res](Json::Value s, Json::Value r) { + stt = s; + res = r; + }; + std::get(engine_result.value()) + ->HandleRequest(model, path_parts, json_body, cb); + + return std::make_pair(stt, res); +} + InferResult InferenceService::LoadModel( std::shared_ptr json_body) { std::string engine_type; @@ -204,17 +228,20 @@ InferResult InferenceService::LoadModel( } // might need mutex here - auto engine_result = engine_service_->GetLoadedEngine(engine_type); + auto engine = engine_service_->GetLoadedEngine(engine_type).value(); auto cb = [&stt, &r](Json::Value status, Json::Value res) { stt = status; r = res; }; - if (std::holds_alternative(engine_result.value())) { - std::get(engine_result.value()) + if (std::holds_alternative(engine)) { + std::get(engine) + ->LoadModel(json_body, std::move(cb)); + } else if (std::holds_alternative(engine)) { + std::get(engine) ->LoadModel(json_body, std::move(cb)); } else { - std::get(engine_result.value()) + std::get(engine) ->LoadModel(json_body, std::move(cb)); } if (!engine_service_->IsRemoteEngine(engine_type)) { @@ -340,47 +367,8 @@ InferResult InferenceService::FineTuning( Json::Value r; Json::Value stt; - // TODO: namh refactor this - // if (engines_.find(ne) == engines_.end()) { - // try { - // std::string abs_path = - // (getenv("ENGINE_PATH") - // ? 
getenv("ENGINE_PATH") - // : file_manager_utils::GetCortexDataPath().string()) + - // kPythonRuntimeLibPath; - // engines_[ne].dl = std::make_unique(abs_path, "engine"); - // } catch (const cortex_cpp::dylib::load_error& e) { - // - // LOG_ERROR << "Could not load engine: " << e.what(); - // engines_.erase(ne); - // - // Json::Value res; - // r["message"] = "Could not load engine " + ne; - // stt["status_code"] = drogon::k500InternalServerError; - // return std::make_pair(stt, r); - // } - // - // auto func = - // engines_[ne].dl->get_function("get_engine"); - // engines_[ne].engine = func(); - // LOG_INFO << "Loaded engine: " << ne; - // } - // - // LOG_TRACE << "Start to fine-tuning"; - // auto& en = std::get(engines_[ne].engine); - // if (en->IsSupported("HandlePythonFileExecutionRequest")) { - // en->HandlePythonFileExecutionRequest( - // json_body, [&r, &stt](Json::Value status, Json::Value res) { - // r = res; - // stt = status; - // }); - // } else { - // LOG_WARN << "Method is not supported yet"; r["message"] = "Method is not supported yet"; stt["status_code"] = drogon::k500InternalServerError; - // return std::make_pair(stt, r); - // } - // LOG_TRACE << "Done fine-tuning"; return std::make_pair(stt, r); } diff --git a/engine/services/inference_service.h b/engine/services/inference_service.h index 726275bba..874ce8c85 100644 --- a/engine/services/inference_service.h +++ b/engine/services/inference_service.h @@ -48,6 +48,10 @@ class InferenceService { cpp::result HandleRouteRequest( std::shared_ptr q, std::shared_ptr json_body); + InferResult HandlePython( + const std::string& model, const std::vector& path_parts, + std::shared_ptr json_body); + InferResult LoadModel(std::shared_ptr json_body); InferResult UnloadModel(const std::string& engine, From c5da0ee70e61e36a30d5018c5001f43ce95ac9ff Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 10:04:12 +0800 Subject: [PATCH 08/73] more checks to match all EngineV variants --- .../extensions/python-engine/python_engine.cc | 21 ++++++++++ .../extensions/python-engine/python_engine.h | 17 ++++++-- engine/services/inference_service.cc | 39 +++++++++++++++---- 3 files changed, 66 insertions(+), 11 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 7ab970127..0b8efa6e9 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -227,4 +227,25 @@ void PythonEngine::HandleRequest( callback(std::move(status), std::move(response_json)); } +void PythonEngine::UnloadModel( + std::shared_ptr json_body, + std::function&& callback) { + + assert(false && "Not implemented"); +} + +void PythonEngine::GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) { + + assert(false && "Not implemented"); +} + +void PythonEngine::GetModels( + std::shared_ptr jsonBody, + std::function&& callback) { + + assert(false && "Not implemented"); +} + } // namespace python_engine diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index bf993bcbe..717bb1b4e 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -56,11 +56,20 @@ class PythonEngine : public PythonEngineI { void LoadModel( std::shared_ptr json_body, std::function&& callback) override; + void UnloadModel( + std::shared_ptr json_body, + std::function&& callback) override; + void GetModelStatus( + std::shared_ptr json_body, 
+ std::function&& callback) override; + void GetModels( + std::shared_ptr jsonBody, + std::function&& callback) override; void HandleRequest( - const std::string& model, - const std::vector& path_parts, - std::shared_ptr json_body, - std::function&& callback) override; + const std::string& model, + const std::vector& path_parts, + std::shared_ptr json_body, + std::function&& callback) override; }; } // namespace python_engine diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index 07bd3a306..aac314399 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -4,6 +4,14 @@ #include "utils/function_calling/common.h" #include "utils/jinja_utils.h" +static InferResult GetUnsupportedResponse(const std::string& msg) { + Json::Value res, stt; + res["message"] = msg; + stt["status_code"] = drogon::k400BadRequest; + LOG_WARN << msg; + return std::make_pair(stt, res); +} + cpp::result InferenceService::HandleChatCompletion( std::shared_ptr q, std::shared_ptr json_body) { std::string engine_type; @@ -38,7 +46,7 @@ cpp::result InferenceService::HandleChatCompletion( LOG_WARN << "Engine is not loaded yet"; return cpp::fail(std::make_pair(stt, res)); } - + if (!model_id.empty()) { if (auto model_service = model_service_.lock()) { auto metadata_ptr = model_service->GetCachedModelMetadata(model_id); @@ -84,6 +92,9 @@ cpp::result InferenceService::HandleChatCompletion( if (std::holds_alternative(engine_result.value())) { std::get(engine_result.value()) ->HandleChatCompletion(json_body, std::move(cb)); + } else if (std::holds_alternative(engine_result.value())) { + return cpp::fail(GetUnsupportedResponse( + "Python engine does not support Chat completion")); } else { std::get(engine_result.value()) ->HandleChatCompletion(json_body, std::move(cb)); @@ -117,6 +128,9 @@ cpp::result InferenceService::HandleEmbedding( if (std::holds_alternative(engine_result.value())) { std::get(engine_result.value()) ->HandleEmbedding(json_body, std::move(cb)); + } else if (std::holds_alternative(engine_result.value())) { + return cpp::fail(GetUnsupportedResponse( + "Python engine does not support Embedding")); } else { std::get(engine_result.value()) ->HandleEmbedding(json_body, std::move(cb)); @@ -274,11 +288,15 @@ InferResult InferenceService::UnloadModel(const std::string& engine_name, stt = status; r = res; }; - if (std::holds_alternative(engine_result.value())) { - std::get(engine_result.value()) + auto engine = engine_result.value(); + if (std::holds_alternative(engine)) { + std::get(engine) + ->UnloadModel(std::make_shared(json_body), std::move(cb)); + } else if (std::holds_alternative(engine)) { + std::get(engine) ->UnloadModel(std::make_shared(json_body), std::move(cb)); } else { - std::get(engine_result.value()) + std::get(engine) ->UnloadModel(std::make_shared(json_body), std::move(cb)); } @@ -312,11 +330,15 @@ InferResult InferenceService::GetModelStatus( stt = status; r = res; }; - if (std::holds_alternative(engine_result.value())) { - std::get(engine_result.value()) + auto engine = engine_result.value(); + if (std::holds_alternative(engine)) { + std::get(engine) + ->GetModelStatus(json_body, std::move(cb)); + } else if (std::holds_alternative(engine)) { + std::get(engine) ->GetModelStatus(json_body, std::move(cb)); } else { - std::get(engine_result.value()) + std::get(engine) ->GetModelStatus(json_body, std::move(cb)); } @@ -348,6 +370,9 @@ InferResult InferenceService::GetModels( if (e->IsSupported("GetModels")) { 
e->GetModels(json_body, std::move(cb)); } + } else if (std::holds_alternative(loaded_engine)) { + std::get(loaded_engine) + ->GetModels(json_body, std::move(cb)); } else { std::get(loaded_engine) ->GetModels(json_body, std::move(cb)); From 3c097fbdecd25e0ca40a9b0f8c045eba1ecda670 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 11:30:12 +0800 Subject: [PATCH 09/73] improve Python load model --- engine/cortex-common/python_enginei.h | 18 +- .../extensions/python-engine/python_engine.cc | 177 +++++++----------- .../extensions/python-engine/python_engine.h | 16 +- 3 files changed, 84 insertions(+), 127 deletions(-) diff --git a/engine/cortex-common/python_enginei.h b/engine/cortex-common/python_enginei.h index 31dc76c80..481b6b146 100644 --- a/engine/cortex-common/python_enginei.h +++ b/engine/cortex-common/python_enginei.h @@ -11,17 +11,19 @@ class PythonEngineI { // virtual bool IsSupported(const std::string& f) = 0; - // virtual void ExecutePythonFile(std::string binary_execute_path, - // std::string file_execution_path, - // std::string python_library_path) = 0; - - // virtual void HandlePythonFileExecutionRequest( - // std::shared_ptr json_body, - // std::function&& callback) = 0; - + // model management virtual void LoadModel( std::shared_ptr json_body, std::function&& callback) = 0; + virtual void UnloadModel( + std::shared_ptr json_body, + std::function&& callback) = 0; + virtual void GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) = 0; + virtual void GetModels( + std::shared_ptr jsonBody, + std::function&& callback) = 0; virtual void HandleRequest( const std::string& model, diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 0b8efa6e9..49b2835a5 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -11,57 +11,6 @@ constexpr const int k400BadRequest = 400; constexpr const int k409Conflict = 409; constexpr const int k500InternalServerError = 500; constexpr const int kFileLoggerOption = 0; - -size_t StreamWriteCallback(char* ptr, size_t size, size_t nmemb, - void* userdata) { - auto* context = static_cast(userdata); - std::string chunk(ptr, size * nmemb); - - context->buffer += chunk; - - // Process complete lines - size_t pos; - while ((pos = context->buffer.find('\n')) != std::string::npos) { - std::string line = context->buffer.substr(0, pos); - context->buffer = context->buffer.substr(pos + 1); - LOG_DEBUG << "line: " << line; - - // Skip empty lines - if (line.empty() || line == "\r") - continue; - - if (line == "data: [DONE]") { - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = true; - status["status_code"] = 200; - (*context->callback)(std::move(status), Json::Value()); - break; - } - - // Parse the JSON - Json::Value chunk_json; - chunk_json["data"] = line + "\n\n"; - Json::Reader reader; - - Json::Value status; - status["is_done"] = false; - status["has_error"] = false; - status["is_stream"] = true; - status["status_code"] = 200; - (*context->callback)(std::move(status), std::move(chunk_json)); - } - - return size * nmemb; -} - -static size_t WriteCallback(char* ptr, size_t size, size_t nmemb, - std::string* data) { - data->append(ptr, size * nmemb); - return size * nmemb; -} - } // namespace cpp::result DownloadUv(std::shared_ptr& download_service) { @@ -127,18 +76,35 @@ PythonEngine::~PythonEngine() { curl_global_cleanup(); } +static std::pair 
CreateResponse( + const std::string& msg, int code) { + + Json::Value status, res; + const bool has_error = code != k200OK; + + status["is_done"] = true; + status["has_error"] = has_error; + status["is_stream"] = false; + status["status_code"] = code; + + if (has_error) { + CTL_ERR(msg); + res["error"] = msg; + } + else { + res["status"] = msg; + } + + return {status, res}; +} + void PythonEngine::LoadModel( std::shared_ptr json_body, std::function&& callback) { if (!json_body->isMember("model") || !json_body->isMember("model_dir")) { - Json::Value error; - error["error"] = "Missing required fields: model or model_dir"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; + auto [status, error] = CreateResponse( + "Missing required fields: model or model_dir", k400BadRequest); callback(std::move(status), std::move(error)); return; } @@ -148,24 +114,34 @@ void PythonEngine::LoadModel( const std::string& model = (*json_body)["model"].asString(); const fs::path model_dir = (*json_body)["model_dir"].asString(); - if (models_.find(model) != models_.end()) { - Json::Value error; - error["error"] = "Model already loaded!"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k409Conflict; + if (model_process_map.find(model) != model_process_map.end()) { + auto [status, error] = CreateResponse( + "Model already loaded!", k409Conflict); callback(std::move(status), std::move(error)); return; } pid_t pid; - try { - std::vector command{GetUvPath(), "run", model_dir / "main.py"}; + auto model_config = YAML::LoadFile(model_dir / "model.yml"); + if (!model_config["entrypoint"]) + throw std::runtime_error("`entrypoint` is not defined in model.yml"); + if (!model_config["port"]) + throw std::runtime_error("`port` is not defined in model.yaml"); + + const std::string entrypoint = model_config["entrypoint"].as(); + const int port = model_config["port"].as(); + + // NOTE: model_dir / entrypoint assumes a Python script + // TODO: figure out if we can support arbitrary CLI (but still launch by uv) + std::vector command{GetUvPath(), "run", model_dir / entrypoint}; + + auto extra_args_node = model_config["extra_args"]; + if (extra_args_node && extra_args_node.IsSequence()) { + for (int i = 0; i < extra_args_node.size(); i++) + command.push_back(extra_args_node[i].as()); + } - // TODO: what happens if the process exits? const std::string stdout_path = model_dir / "stdout.txt"; const std::string stderr_path = model_dir / "stderr.txt"; @@ -173,58 +149,25 @@ void PythonEngine::LoadModel( if (!std::filesystem::exists(stdout_path)) std::ofstream(stdout_path).flush(); if (!std::filesystem::exists(stderr_path)) std::ofstream(stderr_path).flush(); + // TODO: what happens if the process starts, but exits? 
pid = cortex::process::SpawnProcess(command, stdout_path, stderr_path); - - process_map_[model] = pid; if (pid == -1) { throw std::runtime_error("Fail to spawn process with pid -1"); } - } catch (const std::exception& e) { - std::unique_lock lock(models_mutex_); - if (models_.find(model) != models_.end()) { - models_.erase(model); - } + std::unique_lock write_lock(mutex); + model_process_map[model] = {pid, port}; - Json::Value error; - error["error"] = e.what(); - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; + } catch (const std::exception& e) { + auto e_msg = e.what(); + auto [status, error] = CreateResponse(e_msg, k500InternalServerError); callback(std::move(status), std::move(error)); return; } - Json::Value response; - response["status"] = - "Model loaded successfully with pid: " + std::to_string(pid); - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - callback(std::move(status), std::move(response)); -} - -void PythonEngine::HandleRequest( - const std::string& model, - const std::vector& path_parts, - std::shared_ptr json_body, - std::function&& callback) { - - // get port - - Json::Value response_json; - response_json["object"] = "list"; - - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(response_json)); + auto [status, res] = CreateResponse( + "Model loaded successfully with pid: " + std::to_string(pid), + k200OK); + callback(std::move(status), std::move(res)); } void PythonEngine::UnloadModel( @@ -248,4 +191,14 @@ void PythonEngine::GetModels( assert(false && "Not implemented"); } +void PythonEngine::HandleRequest( + const std::string& model, + const std::vector& path_parts, + std::shared_ptr json_body, + std::function&& callback) { + + assert(false && "Not implemented"); + // get port +} + } // namespace python_engine diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 717bb1b4e..553f49b9b 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -39,14 +39,16 @@ bool IsUvInstalled(); class PythonEngine : public PythonEngineI { private: - // Model configuration + // extensions::TemplateRenderer renderer_; + // std::unique_ptr async_file_logger_; - // Thread-safe model config storage - mutable std::shared_mutex models_mutex_; - std::unordered_map models_; - extensions::TemplateRenderer renderer_; - std::unique_ptr async_file_logger_; - std::unordered_map process_map_; + struct PythonSubprocess { + pid_t pid; + int port; + }; + + mutable std::shared_mutex mutex; + std::unordered_map model_process_map; trantor::ConcurrentTaskQueue q_; public: From 84db8b0857857bad11d66fe751c8ab09f401b3e5 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 14:40:15 +0800 Subject: [PATCH 10/73] consolidate process-related functions --- engine/utils/process/utils.cc | 69 ++++++++++++++++++++++++++++++++++- engine/utils/process/utils.h | 2 + 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 94433367b..624b62262 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -2,8 +2,11 @@ #include "utils/logging_utils.h" 
#include -#if defined(__APPLE__) || defined(__linux__) +#ifdef _WIN32 +#include +#elif defined(__APPLE__) || defined(__linux__) extern char **environ; // environment variables +#include #include #endif @@ -138,4 +141,66 @@ pid_t SpawnProcess(const std::vector& command, } } -} // namespace cortex::process \ No newline at end of file +bool IsProcessAlive(pid_t pid) { +#ifdef _WIN32 + // Windows implementation + HANDLE snapshot = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0); + if (snapshot == INVALID_HANDLE_VALUE) { + return false; + } + + PROCESSENTRY32 processEntry = {0}; + processEntry.dwSize = sizeof(processEntry); + + if (Process32First(snapshot, &processEntry)) { + do { + if (processEntry.th32ProcessID == pid) { + CloseHandle(snapshot); + return true; + } + } while (Process32Next(snapshot, &processEntry)); + } + + CloseHandle(snapshot); + return false; + +#elif defined(__APPLE__) || defined(__linux__) + // Unix-like systems (Linux and macOS) implementation + if (pid <= 0) { + return false; + } + + // Try to send signal 0 to the process + // This doesn't actually send a signal but checks if we can send signals to the process + int result = kill(pid, 0); + + if (result == 0) { + return true; // Process exists and we have permission to send it signals + } + + return errno != ESRCH; // ESRCH means "no such process" +#else +#error "Unsupported platform" +#endif +} + +bool KillProcess(pid_t pid) { +#if defined(_WIN32) + HANDLE hProcess = OpenProcess(PROCESS_TERMINATE, FALSE, pid); + if (hProcess == NULL) { + LOG_ERROR << "Failed to open process"; + return false; + } + + bool is_success = TerminateProcess(hProcess, 0) == TRUE; + CloseHandle(hProcess); + return is_success; +#elif defined(__APPLE__) || defined(__linux__) + // NOTE: should we use SIGKILL here to be consistent with Windows? + return kill(pid, SIGTERM) == 0; +#else +#error "Unsupported platform" +#endif +} + +} // namespace cortex::process diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index 54f34e919..813d53750 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -23,5 +23,7 @@ std::vector ConvertToArgv(const std::vector& args); pid_t SpawnProcess(const std::vector& command, const std::optional stdout_file = {}, const std::optional stderr_file = {}); +bool IsProcessAlive(pid_t pid); +bool KillProcess(pid_t pid); } From 8ee815c8804b61cd7b350e278a635ec608f8c57f Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 14:41:10 +0800 Subject: [PATCH 11/73] update PythonModelConfig. 
add UnloadModel --- engine/config/model_config.h | 347 ++++-------------- engine/controllers/models.cc | 2 +- .../extensions/python-engine/python_engine.cc | 98 ++++- .../extensions/python-engine/python_engine.h | 3 + engine/services/model_service.cc | 66 +--- 5 files changed, 155 insertions(+), 361 deletions(-) diff --git a/engine/config/model_config.h b/engine/config/model_config.h index 1d51cfb01..85335c37b 100644 --- a/engine/config/model_config.h +++ b/engine/config/model_config.h @@ -478,108 +478,41 @@ struct Endpoint { struct PythonModelConfig { // General Metadata - std::string id; - std::string model; std::string name; int version; - // Inference Parameters - Endpoint load_model; - Endpoint destroy; - Endpoint inference; - Endpoint heath_check; - std::vector extra_endpoints; - // Model Load Parameters - std::string port; - std::string script; - std::string log_path; - std::string log_level; - std::string environment; - std::vector command; // New command field - std::vector files; - std::vector depends; std::string engine; - Json::Value extra_params; // Accept dynamic extra parameters + std::string entrypoint; + int port; + std::vector extra_args; // Method to convert C++ struct to YAML void ToYaml(const std::string& filepath) const { YAML::Emitter out; out << YAML::BeginMap; - out << YAML::Key << "id" << YAML::Value << id; - out << YAML::Key << "model" << YAML::Value << model; + // General Metadata out << YAML::Key << "name" << YAML::Value << name; out << YAML::Key << "version" << YAML::Value << version; - // Inference Parameters - out << YAML::Key << "load_model" << YAML::Value << YAML::BeginMap; - out << YAML::Key << "method" << YAML::Value << load_model.method; - out << YAML::Key << "path" << YAML::Value << load_model.path; - out << YAML::Key << "transform_request" << YAML::Value - << load_model.transform_request; - out << YAML::Key << "transform_response" << YAML::Value - << load_model.transform_response; - out << YAML::EndMap; - - out << YAML::Key << "destroy" << YAML::Value << YAML::BeginMap; - out << YAML::Key << "method" << YAML::Value << destroy.method; - out << YAML::Key << "path" << YAML::Value << destroy.path; - out << YAML::EndMap; - - out << YAML::Key << "inference" << YAML::Value << YAML::BeginMap; - out << YAML::Key << "method" << YAML::Value << inference.method; - out << YAML::Key << "path" << YAML::Value << inference.path; - out << YAML::EndMap; - - out << YAML::Key << "extra_endpoints" << YAML::Value << YAML::BeginSeq; - for (const auto& endpoint : extra_endpoints) { - out << YAML::BeginMap; - out << YAML::Key << "method" << YAML::Value << endpoint.method; - out << YAML::Key << "path" << YAML::Value << endpoint.path; - out << YAML::EndMap; - } - out << YAML::EndSeq; - // Model Load Parameters + out << YAML::Key << "engine" << YAML::Value << engine; + out << YAML::Key << "entrypoint" << YAML::Value << entrypoint; out << YAML::Key << "port" << YAML::Value << port; - out << YAML::Key << "script" << YAML::Value << script; - out << YAML::Key << "log_path" << YAML::Value << log_path; - out << YAML::Key << "log_level" << YAML::Value << log_level; - out << YAML::Key << "environment" << YAML::Value << environment; - - // Serialize command as YAML list - out << YAML::Key << "command" << YAML::Value << YAML::BeginSeq; - for (const auto& cmd : command) { - out << cmd; - } - out << YAML::EndSeq; - // Serialize files as YAML list - out << YAML::Key << "files" << YAML::Value << YAML::BeginSeq; - for (const auto& file : files) { - out << file; - } - out << YAML::EndSeq; - - // 
Serialize command as YAML list - out << YAML::Key << "depends" << YAML::Value << YAML::BeginSeq; - for (const auto& depend : depends) { - out << depend; + // Extra Arguments + if (!extra_args.empty()) { + out << YAML::Key << "extra_args" << YAML::Value << YAML::BeginSeq; + for (const auto& arg : extra_args) { + out << arg; + } + out << YAML::EndSeq; } - out << YAML::EndSeq; - - out << YAML::Key << "engine" << YAML::Value << engine; - // Serialize extra_params as YAML - out << YAML::Key << "extra_params" << YAML::Value << YAML::BeginMap; - for (Json::ValueConstIterator iter = extra_params.begin(); - iter != extra_params.end(); ++iter) { - out << YAML::Key << iter.key().asString() << YAML::Value - << iter->asString(); - } out << YAML::EndMap; + // Write to file std::ofstream fout(filepath); if (!fout.is_open()) { throw std::runtime_error("Failed to open file for writing: " + filepath); @@ -589,218 +522,82 @@ struct PythonModelConfig { // Method to populate struct from YAML file void ReadFromYaml(const std::string& filePath) { - YAML::Node config = YAML::LoadFile(filePath); - - if (config["id"]) - id = config["id"].as(); - if (config["model"]) - model = config["model"].as(); - if (config["name"]) - name = config["name"].as(); - if (config["version"]) - version = config["version"].as(); - - // Inference Parameters - - auto ip = config; - if (ip["load_model"]) { - load_model.method = ip["load_model"]["method"].as(); - load_model.path = ip["load_model"]["path"].as(); - load_model.transform_request = - ip["load_model"]["transform_request"].as(); - load_model.transform_response = - ip["load_model"]["transform_response"].as(); - } - if (ip["destroy"]) { - destroy.method = ip["destroy"]["method"].as(); - destroy.path = ip["destroy"]["path"].as(); - } - if (ip["inference"]) { - inference.method = ip["inference"]["method"].as(); - inference.path = ip["inference"]["path"].as(); - } - if (ip["extra_endpoints"] && ip["extra_endpoints"].IsSequence()) { - for (const auto& endpoint : ip["extra_endpoints"]) { - Endpoint e; - e.method = endpoint["method"].as(); - e.path = endpoint["path"].as(); - extra_endpoints.push_back(e); - } - } - - // Model Load Parameters - - auto mlp = config; - if (mlp["port"]) - port = mlp["port"].as(); - if (mlp["script"]) - script = mlp["script"].as(); - if (mlp["log_path"]) - log_path = mlp["log_path"].as(); - if (mlp["log_level"]) - log_level = mlp["log_level"].as(); - if (mlp["environment"]) - environment = mlp["environment"].as(); - if (mlp["engine"]) - engine = mlp["engine"].as(); - - if (mlp["command"] && mlp["command"].IsSequence()) { - for (const auto& cmd : mlp["command"]) { - command.push_back(cmd.as()); + try { + YAML::Node config = YAML::LoadFile(filePath); + + // General Metadata + if (config["name"]) name = config["name"].as(); + if (config["version"]) version = config["version"].as(); + + // Model Load Parameters + if (config["engine"]) engine = config["engine"].as(); + if (config["entrypoint"]) entrypoint = config["entrypoint"].as(); + if (config["port"]) port = config["port"].as(); + + // Extra Arguments + if (config["extra_args"] && config["extra_args"].IsSequence()) { + extra_args.clear(); + for (const auto& arg : config["extra_args"]) { + extra_args.push_back(arg.as()); + } } } - - if (mlp["files"] && mlp["files"].IsSequence()) { - for (const auto& file : mlp["files"]) { - files.push_back(file.as()); - } + catch (const YAML::Exception& e) { + throw std::runtime_error("Error parsing YAML file: " + std::string(e.what())); } - - if (mlp["depends"] && 
mlp["depends"].IsSequence()) { - for (const auto& depend : mlp["depends"]) { - depends.push_back(depend.as()); - } - } - - if (mlp["extra_params"]) { - for (YAML::const_iterator it = mlp["extra_params"].begin(); - it != mlp["extra_params"].end(); ++it) { - extra_params[it->first.as()] = - it->second.as(); - } + catch (const std::exception& e) { + throw std::runtime_error("Error reading YAML file: " + std::string(e.what())); } } // Method to convert the struct to JSON Json::Value ToJson() const { - Json::Value root; - - root["id"] = id; - root["model"] = model; - root["name"] = name; - root["version"] = version; - - // Inference Parameters - root["load_model"]["method"] = load_model.method; - root["load_model"]["path"] = load_model.path; - root["load_model"]["transform_request"] = load_model.transform_request; - root["load_model"]["transform_response"] = load_model.transform_response; - - root["destroy"]["method"] = destroy.method; - root["destroy"]["path"] = destroy.path; - - root["inference"]["method"] = inference.method; - root["inference"]["path"] = inference.path; - - for (const auto& endpoint : extra_endpoints) { - Json::Value e; - e["method"] = endpoint.method; - e["path"] = endpoint.path; - root["extra_endpoints"].append(e); - } - - // Model Load Parameters - root["port"] = port; - root["log_path"] = log_path; - root["log_level"] = log_level; - root["environment"] = environment; - root["script"] = script; - - // Serialize command as JSON array - for (const auto& cmd : command) { - root["command"].append(cmd); - } + Json::Value json; - for (const auto& file : files) { - root["files"].append(file); - } + // Add basic string fields + json["name"] = name; + json["version"] = version; + json["engine"] = engine; + json["entrypoint"] = entrypoint; + json["port"] = port; - for (const auto& depend : depends) { - root["depends"].append(depend); + // Add extra_args array + if (!extra_args.empty()) { + Json::Value args(Json::arrayValue); + for (const auto& arg : extra_args) { + args.append(arg); + } + json["extra_args"] = args; } - root["engine"] = engine; - root["extra_params"] = extra_params; // Serialize the JSON value directly - - return root; + return json; } // Method to populate struct from JSON void FromJson(const Json::Value& root) { - - if (root.isMember("id")) - id = root["id"].asString(); - if (root.isMember("model")) - model = root["model"].asString(); - if (root.isMember("name")) - name = root["name"].asString(); - if (root.isMember("version")) - version = root["version"].asInt(); - - // Inference Parameters - - const Json::Value& ip = root; - if (ip.isMember("load_model")) { - load_model.method = ip["load_model"]["method"].asString(); - load_model.path = ip["load_model"]["path"].asString(); - load_model.transform_request = - ip["load_model"]["transform_request"].asString(); - load_model.transform_response = - ip["load_model"]["transform_response"].asString(); - } - if (ip.isMember("destroy")) { - destroy.method = ip["destroy"]["method"].asString(); - destroy.path = ip["destroy"]["path"].asString(); - } - if (ip.isMember("inference")) { - inference.method = ip["inference"]["method"].asString(); - inference.path = ip["inference"]["path"].asString(); - } - if (ip.isMember("extra_endpoints")) { - for (const auto& endpoint : ip["extra_endpoints"]) { - Endpoint e; - e.method = endpoint["method"].asString(); - e.path = endpoint["path"].asString(); - extra_endpoints.push_back(e); - } - } - - // Model Load Parameters - - const Json::Value& mlp = root; - if (mlp.isMember("port")) - port = 
mlp["port"].asString(); - if (mlp.isMember("log_path")) - log_path = mlp["log_path"].asString(); - if (mlp.isMember("log_level")) - log_level = mlp["log_level"].asString(); - if (mlp.isMember("environment")) - environment = mlp["environment"].asString(); - if (mlp.isMember("engine")) - engine = mlp["engine"].asString(); - if (mlp.isMember("script")) - script = mlp["script"].asString(); - - if (mlp.isMember("command")) { - for (const auto& cmd : mlp["command"]) { - command.push_back(cmd.asString()); - } - } - - if (mlp.isMember("files")) { - for (const auto& file : mlp["files"]) { - files.push_back(file.asString()); - } + if (!root.isObject()) { + throw std::runtime_error("Input JSON must be an object"); } - - if (mlp.isMember("depends")) { - for (const auto& depend : mlp["depends"]) { - depends.push_back(depend.asString()); + try { + // Basic fields + name = root.get("name", name).asString(); + version = root.get("version", version).asInt(); + engine = root.get("engine", engine).asString(); + entrypoint = root.get("entrypoint", entrypoint).asString(); + port = root.get("port", port).asInt(); + + // Extra args array + extra_args.clear(); + const Json::Value& args = root["extra_args"]; + if (args.isArray()) { + for (const auto& arg : args) { + extra_args.push_back(arg.asString()); + } } - } - - if (mlp.isMember("extra_params")) { - extra_params = mlp["extra_params"]; // Directly assign the JSON value + } catch (const Json::Exception& e) { + throw std::runtime_error("Error parsing JSON: " + std::string(e.what())); + } catch (const std::exception& e) { + throw std::runtime_error("Error processing JSON data: " + std::string(e.what())); } } }; diff --git a/engine/controllers/models.cc b/engine/controllers/models.cc index ac1f55d8f..1f0bb38ce 100644 --- a/engine/controllers/models.cc +++ b/engine/controllers/models.cc @@ -309,7 +309,7 @@ void Models::GetModel(const HttpRequestPtr& req, fs::path(model_entry.value().path_to_model_yaml)) .string()); ret = python_model_config.ToJson(); - ret["id"] = python_model_config.model; + ret["id"] = python_model_config.name; ret["object"] = "model"; ret["result"] = "OK"; auto resp = cortex_utils::CreateCortexHttpJsonResponse(ret); diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 49b2835a5..51d047310 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -65,11 +65,17 @@ cpp::result DownloadUv(std::shared_ptr& down std::string GetUvPath() { return file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; } - bool IsUvInstalled() { return std::filesystem::exists(GetUvPath()); } +bool PythonEngine::PythonSubprocess::IsAlive() { + return cortex::process::IsProcessAlive(pid); +} +bool PythonEngine::PythonSubprocess::Kill() { + return cortex::process::KillProcess(pid); +} + PythonEngine::PythonEngine() : q_(4 /*n_parallel*/, "python_engine") {} PythonEngine::~PythonEngine() { @@ -111,7 +117,7 @@ void PythonEngine::LoadModel( namespace fs = std::filesystem; - const std::string& model = (*json_body)["model"].asString(); + const std::string model = (*json_body)["model"].asString(); const fs::path model_dir = (*json_body)["model_dir"].asString(); if (model_process_map.find(model) != model_process_map.end()) { @@ -123,24 +129,14 @@ void PythonEngine::LoadModel( pid_t pid; try { - auto model_config = YAML::LoadFile(model_dir / "model.yml"); - if (!model_config["entrypoint"]) - throw std::runtime_error("`entrypoint` is not 
defined in model.yml"); - if (!model_config["port"]) - throw std::runtime_error("`port` is not defined in model.yaml"); - - const std::string entrypoint = model_config["entrypoint"].as(); - const int port = model_config["port"].as(); + config::PythonModelConfig py_cfg; + py_cfg.ReadFromYaml(model_dir / "model.yml"); // NOTE: model_dir / entrypoint assumes a Python script // TODO: figure out if we can support arbitrary CLI (but still launch by uv) - std::vector command{GetUvPath(), "run", model_dir / entrypoint}; - - auto extra_args_node = model_config["extra_args"]; - if (extra_args_node && extra_args_node.IsSequence()) { - for (int i = 0; i < extra_args_node.size(); i++) - command.push_back(extra_args_node[i].as()); - } + std::vector command{GetUvPath(), "run", model_dir / py_cfg.entrypoint}; + for (const auto& item : py_cfg.extra_args) + command.push_back(item); const std::string stdout_path = model_dir / "stdout.txt"; const std::string stderr_path = model_dir / "stderr.txt"; @@ -155,7 +151,7 @@ void PythonEngine::LoadModel( throw std::runtime_error("Fail to spawn process with pid -1"); } std::unique_lock write_lock(mutex); - model_process_map[model] = {pid, port}; + model_process_map[model] = {pid, py_cfg.port}; } catch (const std::exception& e) { auto e_msg = e.what(); @@ -174,7 +170,56 @@ void PythonEngine::UnloadModel( std::shared_ptr json_body, std::function&& callback) { - assert(false && "Not implemented"); + if (!json_body->isMember("model")) { + auto [status, error] = CreateResponse("Missing required field: model", k400BadRequest); + callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + + // check if model has started + { + std::shared_lock read_lock(mutex); + + if (model_process_map.find(model) == model_process_map.end()) { + const std::string msg = "Model " + model + " has not been loaded yet."; + auto [status, error] = CreateResponse(msg, k400BadRequest); + callback(std::move(status), std::move(error)); + return; + } + } + + // we know that model has started + { + std::unique_lock write_lock(mutex); + + // check if subprocess is still alive + if (!model_process_map[model].IsAlive()) { + const std::string msg = "Model " + model + " stopped running."; + auto [status, error] = CreateResponse(msg, k400BadRequest); + + // NOTE: do we need to do any other cleanup for subprocesses? + model_process_map.erase(model); + + callback(std::move(status), std::move(error)); + return; + } + + // subprocess is alive. we kill it here. + if (!model_process_map[model].Kill()) { + const std::string msg = "Unable to kill process of model " + model; + auto [status, error] = CreateResponse(msg, k500InternalServerError); + callback(std::move(status), std::move(error)); + return; + } + + // NOTE: do we need to do any other cleanup for subprocesses? 
+ model_process_map.erase(model); + } + + auto [status, res] = CreateResponse("Unload model successfully", k200OK); + callback(std::move(status), std::move(res)); } void PythonEngine::GetModelStatus( @@ -188,7 +233,20 @@ void PythonEngine::GetModels( std::shared_ptr jsonBody, std::function&& callback) { - assert(false && "Not implemented"); + Json::Value res, model_list(Json::arrayValue), status; + for (const auto& item : model_process_map) { + model_list.append(Json::Value{item.first}); + } + + res["object"] = "list"; + res["data"] = model_list; + + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = k200OK; + + callback(std::move(status), std::move(res)); } void PythonEngine::HandleRequest( diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 553f49b9b..71fd170e7 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -45,6 +45,9 @@ class PythonEngine : public PythonEngineI { struct PythonSubprocess { pid_t pid; int port; + + bool IsAlive(); + bool Kill(); }; mutable std::shared_mutex mutex; diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 15d5a8dc6..00bb464dc 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -506,57 +506,8 @@ ModelService::DownloadModelFromCortexsoAsync( config::YamlHandler yaml_handler; yaml_handler.ModelConfigFromFile(model_yml_item->localPath.string()); auto mc = yaml_handler.GetModelConfig(); - if (mc.engine == kPythonEngine) { // process for Python engine - config::PythonModelConfig python_model_config; - python_model_config.ReadFromYaml(model_yml_item->localPath.string()); - python_model_config.files.push_back( - model_yml_item->localPath.parent_path().string()); - python_model_config.ToYaml(model_yml_item->localPath.string()); - // unzip venv.zip - auto model_folder = model_yml_item->localPath.parent_path(); - auto venv_path = model_folder / std::filesystem::path("venv"); - if (!std::filesystem::exists(venv_path)) { - std::filesystem::create_directories(venv_path); - } - auto venv_zip = model_folder / std::filesystem::path("venv.zip"); - if (std::filesystem::exists(venv_zip)) { - if (archive_utils::ExtractArchive(venv_zip.string(), - venv_path.string())) { - std::filesystem::remove_all(venv_zip); - CTL_INF("Successfully extract venv.zip"); - // If extract success create pyvenv.cfg - std::ofstream pyvenv_cfg(venv_path / - std::filesystem::path("pyvenv.cfg")); -#ifdef _WIN32 - pyvenv_cfg << "home = " - << (venv_path / std::filesystem::path("Scripts")).string() - << std::endl; - pyvenv_cfg << "executable = " - << (venv_path / std::filesystem::path("Scripts") / - std::filesystem::path("python.exe")) - .string() - << std::endl; -#else - pyvenv_cfg << "home = " - << (venv_path / std::filesystem::path("bin/")).string() - << std::endl; - pyvenv_cfg - << "executable = " - << (venv_path / std::filesystem::path("bin/python")).string() - << std::endl; -#endif - // Close the file - pyvenv_cfg.close(); - // Add executable permission to python - set_permission_utils::SetExecutePermissionsRecursive(venv_path); - } else { - CTL_ERR("Failed to extract venv.zip"); - }; - } else { - CTL_ERR( - "venv.zip not found in model folder: " << model_folder.string()); - } + if (mc.engine == kPythonEngine) { // process for Python engine } else { mc.model = unique_model_id; @@ -986,21 +937,6 @@ cpp::result ModelService::StopModel( 
engine_name = kLlamaEngine; } - // Update for python engine - if (engine_name == kPythonEngine) { - auto model_entry = db_service_->GetModelInfo(model_handle); - config::PythonModelConfig python_model_config; - python_model_config.ReadFromYaml( - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string()); - // Stop all depends model - auto depends = python_model_config.depends; - for (auto& depend : depends) { - StopModel(depend); - } - } - // assert(inference_svc_); auto ir = inference_svc_->UnloadModel(engine_name, model_handle); From 29f53444a4ce4b00624c8a2d2e14c0e9f04a2e0d Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 15:09:22 +0800 Subject: [PATCH 12/73] implement PythonEngine::GetModels --- engine/extensions/python-engine/python_engine.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 51d047310..d15355ed3 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -234,8 +234,16 @@ void PythonEngine::GetModels( std::function&& callback) { Json::Value res, model_list(Json::arrayValue), status; - for (const auto& item : model_process_map) { - model_list.append(Json::Value{item.first}); + { + std::shared_lock read_lock(mutex); + for (const auto& [model_name, py_proc] : model_process_map) { + Json::Value val; + val["id"] = model_name; + val["engine"] = kPythonEngine; + val["port"] = py_proc.port; + val["object"] = "model"; + model_list.append(val); + } } res["object"] = "list"; From 7949dccd9816353f96ec185c8e2ea86e8409cbb8 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 15:46:38 +0800 Subject: [PATCH 13/73] implement getModelStatus. 
add some notes --- .../extensions/python-engine/python_engine.cc | 59 ++++++++++++++++--- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index d15355ed3..113b68bf7 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -120,11 +120,15 @@ void PythonEngine::LoadModel( const std::string model = (*json_body)["model"].asString(); const fs::path model_dir = (*json_body)["model_dir"].asString(); - if (model_process_map.find(model) != model_process_map.end()) { - auto [status, error] = CreateResponse( - "Model already loaded!", k409Conflict); - callback(std::move(status), std::move(error)); - return; + // TODO: check if model is still alive + { + std::shared_lock read_lock(mutex); + if (model_process_map.find(model) != model_process_map.end()) { + auto [status, error] = CreateResponse( + "Model already loaded!", k409Conflict); + callback(std::move(status), std::move(error)); + return; + } } pid_t pid; @@ -181,7 +185,6 @@ void PythonEngine::UnloadModel( // check if model has started { std::shared_lock read_lock(mutex); - if (model_process_map.find(model) == model_process_map.end()) { const std::string msg = "Model " + model + " has not been loaded yet."; auto [status, error] = CreateResponse(msg, k400BadRequest); @@ -226,7 +229,48 @@ void PythonEngine::GetModelStatus( std::shared_ptr json_body, std::function&& callback) { - assert(false && "Not implemented"); + if (!json_body->isMember("model")) { + auto [status, error] = CreateResponse("Missing required field: model", k400BadRequest); + callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + Json::Value res, status; + + // check if model has started + { + std::shared_lock read_lock(mutex); + if (model_process_map.find(model) == model_process_map.end()) { + const std::string msg = "Model " + model + " has not been loaded yet."; + auto [status, error] = CreateResponse(msg, k400BadRequest); + callback(std::move(status), std::move(error)); + return; + } + } + + // we know that model has started + { + std::unique_lock write_lock(mutex); + + // check if subprocess is still alive + if (!model_process_map[model].IsAlive()) { + const std::string msg = "Model " + model + " stopped running."; + auto [status, error] = CreateResponse(msg, k400BadRequest); + + // NOTE: do we need to do any other cleanup for subprocesses? 
+ model_process_map.erase(model); + + callback(std::move(status), std::move(error)); + return; + } + } + + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = k200OK; + callback(std::move(status), std::move(res)); } void PythonEngine::GetModels( @@ -237,6 +281,7 @@ void PythonEngine::GetModels( { std::shared_lock read_lock(mutex); for (const auto& [model_name, py_proc] : model_process_map) { + // TODO: check if py_proc is still alive Json::Value val; val["id"] = model_name; val["engine"] = kPythonEngine; From e2f0323988a34a43611411e6ed7544a7e0db919a Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 17:27:38 +0800 Subject: [PATCH 14/73] add router for python --- engine/controllers/server.cc | 50 +++++++++++++++++++ engine/controllers/server.h | 6 +++ engine/cortex-common/python_enginei.h | 7 +-- .../extensions/python-engine/python_engine.cc | 29 ++++++++--- .../extensions/python-engine/python_engine.h | 6 +-- engine/services/inference_service.cc | 21 ++------ engine/services/inference_service.h | 4 +- 7 files changed, 85 insertions(+), 38 deletions(-) diff --git a/engine/controllers/server.cc b/engine/controllers/server.cc index a8cff2166..374ca40b8 100644 --- a/engine/controllers/server.cc +++ b/engine/controllers/server.cc @@ -210,6 +210,56 @@ void server::RouteRequest( } } +void server::Python( + const HttpRequestPtr& req, + std::function&& callback, + const std::string& model) { + + const std::string& full_path = req->getPath(); + + const std::string prefix = "/v1/python/"; + if (full_path.substr(0, prefix.size()) != prefix) { + auto resp = cortex_utils::CreateCortexHttpJsonResponse( + Json::Value("Invalid path: must start with " + prefix)); + resp->setStatusCode(k400BadRequest); + callback(resp); + return; + } + + // convert /v1/python/{model}/remaining/path -> /remaning/path + const std::string path = full_path.substr(prefix.size() + model.size()); + + auto port_result = inference_svc_->GetPythonPort(model); + if (port_result.has_error()) { + auto resp = cortex_utils::CreateCortexHttpJsonResponse( + Json::Value(port_result.error())); + resp->setStatusCode(k400BadRequest); + callback(resp); + return; + } + + // route request. localhost might not work? 
+ const int port = port_result.value(); + const std::string host = "http://127.0.0.1:" + std::to_string(port); + auto client = HttpClient::newHttpClient(host); + + auto new_req = HttpRequest::newHttpRequest(); + new_req->setMethod(req->method()); + new_req->setPath(path); + new_req->setBody(std::string{req->body()}); + new_req->setContentTypeCode(req->getContentType()); + + for (const auto& [field, value] : req->headers()) { + new_req->addHeader(field, value); + } + + CTL_INF("Route request to " << host << path); + auto cb = [callback](ReqResult result, const HttpResponsePtr& response) { + callback(response); + }; + client->sendRequest(new_req, cb); +} + void server::LoadModel(const HttpRequestPtr& req, std::function&& callback) { auto ir = inference_svc_->LoadModel(req->getJsonObject()); diff --git a/engine/controllers/server.h b/engine/controllers/server.h index 42214a641..99b545d0b 100644 --- a/engine/controllers/server.h +++ b/engine/controllers/server.h @@ -49,6 +49,8 @@ class server : public drogon::HttpController, ADD_METHOD_TO(server::Inference, "/v1/inference", Options, Post); ADD_METHOD_TO(server::RouteRequest, "/v1/route/request", Options, Post); + ADD_METHOD_TO(server::Python, "/v1/python/{1}/.*", Options, Get, Post); + METHOD_LIST_END void ChatCompletion( @@ -76,6 +78,10 @@ class server : public drogon::HttpController, std::function&& callback); void RouteRequest(const HttpRequestPtr& req, std::function&& callback); + void Python( + const HttpRequestPtr& req, + std::function&& callback, + const std::string& model); private: void ProcessStreamRes(std::function cb, diff --git a/engine/cortex-common/python_enginei.h b/engine/cortex-common/python_enginei.h index 481b6b146..b0a02d8cc 100644 --- a/engine/cortex-common/python_enginei.h +++ b/engine/cortex-common/python_enginei.h @@ -4,6 +4,7 @@ #include #include "json/value.h" +#include "utils/result.hpp" class PythonEngineI { public: @@ -25,9 +26,5 @@ class PythonEngineI { std::shared_ptr jsonBody, std::function&& callback) = 0; - virtual void HandleRequest( - const std::string& model, - const std::vector& path_parts, - std::shared_ptr json_body, - std::function&& callback) = 0; + virtual cpp::result GetPort(const std::string& model) = 0; }; diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 113b68bf7..396483013 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -302,14 +302,29 @@ void PythonEngine::GetModels( callback(std::move(status), std::move(res)); } -void PythonEngine::HandleRequest( - const std::string& model, - const std::vector& path_parts, - std::shared_ptr json_body, - std::function&& callback) { +cpp::result PythonEngine::GetPort(const std::string& model) { + int port; + + // check if model has started + { + std::shared_lock read_lock(mutex); + if (model_process_map.find(model) == model_process_map.end()) { + return cpp::fail("Model " + model + " has not been loaded yet."); + } + port = model_process_map[model].port; + } + + // check if subprocess is still alive + { + std::unique_lock write_lock(mutex); + if (!model_process_map[model].IsAlive()) { + // NOTE: do we need to do any other cleanup for subprocesses? 
+ model_process_map.erase(model); + return cpp::fail("Model " + model + " stopped running."); + } + } - assert(false && "Not implemented"); - // get port + return port; } } // namespace python_engine diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 71fd170e7..a79b3cedc 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -71,10 +71,6 @@ class PythonEngine : public PythonEngineI { std::shared_ptr jsonBody, std::function&& callback) override; - void HandleRequest( - const std::string& model, - const std::vector& path_parts, - std::shared_ptr json_body, - std::function&& callback) override; + cpp::result GetPort(const std::string& model) override; }; } // namespace python_engine diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index b33374ad3..431df8941 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -199,28 +199,13 @@ cpp::result InferenceService::HandleRouteRequest( return {}; } -InferResult InferenceService::HandlePython( - const std::string& model, const std::vector& path_parts, - std::shared_ptr json_body) { - - Json::Value stt, res; - +cpp::result InferenceService::GetPythonPort(const std::string& model) { auto engine_result = engine_service_->GetLoadedEngine(kPythonEngine); if (engine_result.has_error()) { - res["message"] = "Python engine is not loaded yet"; - stt["status_code"] = drogon::k400BadRequest; - LOG_WARN << "Python engine is not loaded yet"; - return std::make_pair(stt, res); + return cpp::fail("Python engine is not loaded yet"); } - auto cb = [&stt, &res](Json::Value s, Json::Value r) { - stt = s; - res = r; - }; - std::get(engine_result.value()) - ->HandleRequest(model, path_parts, json_body, cb); - - return std::make_pair(stt, res); + return std::get(engine_result.value())->GetPort(model); } InferResult InferenceService::LoadModel( diff --git a/engine/services/inference_service.h b/engine/services/inference_service.h index 874ce8c85..e71fbc7e7 100644 --- a/engine/services/inference_service.h +++ b/engine/services/inference_service.h @@ -48,9 +48,7 @@ class InferenceService { cpp::result HandleRouteRequest( std::shared_ptr q, std::shared_ptr json_body); - InferResult HandlePython( - const std::string& model, const std::vector& path_parts, - std::shared_ptr json_body); + cpp::result GetPythonPort(const std::string& model); InferResult LoadModel(std::shared_ptr json_body); From 607d2cbb8d873a19e22a4eb8ed447ece6a90c69d Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 17:58:40 +0800 Subject: [PATCH 15/73] call PythonEngine destructor --- engine/extensions/python-engine/python_engine.cc | 8 ++++++-- engine/extensions/python-engine/python_engine.h | 4 ---- engine/services/engine_service.cc | 2 ++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 396483013..65b4d53c1 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -76,10 +76,14 @@ bool PythonEngine::PythonSubprocess::Kill() { return cortex::process::KillProcess(pid); } -PythonEngine::PythonEngine() : q_(4 /*n_parallel*/, "python_engine") {} +PythonEngine::PythonEngine() {} PythonEngine::~PythonEngine() { - curl_global_cleanup(); + // NOTE: what happens if we can't kill subprocess? 
+ std::unique_lock write_lock(mutex); + for (auto& [model_name, py_proc] : model_process_map) { + if (py_proc.IsAlive()) py_proc.Kill(); + } } static std::pair CreateResponse( diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index a79b3cedc..0da0c8412 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -39,9 +39,6 @@ bool IsUvInstalled(); class PythonEngine : public PythonEngineI { private: - // extensions::TemplateRenderer renderer_; - // std::unique_ptr async_file_logger_; - struct PythonSubprocess { pid_t pid; int port; @@ -52,7 +49,6 @@ class PythonEngine : public PythonEngineI { mutable std::shared_mutex mutex; std::unordered_map model_process_map; - trantor::ConcurrentTaskQueue q_; public: PythonEngine(); diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 247e195be..9666c93ad 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -884,6 +884,8 @@ cpp::result EngineService::UnloadEngine( auto unload_opts = EngineI::EngineUnloadOption{}; e->Unload(unload_opts); delete e; + } else if (std::holds_alternative(engines_[ne].engine)) { + delete std::get(engines_[ne].engine); } else { delete std::get(engines_[ne].engine); } From f58b77327e91d39b2b93186ec046b5fee7316c05 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 18:06:57 +0800 Subject: [PATCH 16/73] remove unused method --- engine/services/engine_service.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/engine/services/engine_service.h b/engine/services/engine_service.h index a8d5415a0..18631c279 100644 --- a/engine/services/engine_service.h +++ b/engine/services/engine_service.h @@ -168,9 +168,6 @@ class EngineService : public EngineServiceI { const std::string& version = "latest", const std::optional variant_name = std::nullopt); - cpp::result DownloadPythonUv( - const std::string& version = "latest"); - cpp::result DownloadCuda(const std::string& engine, bool async = false); From bf23c9f01070ce0dbe1c7ccecacf6b9a7be8db33 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 19:44:14 +0800 Subject: [PATCH 17/73] remove unnecessary headers --- .../extensions/python-engine/python_engine.cc | 8 +++---- .../extensions/python-engine/python_engine.h | 22 ------------------- 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 65b4d53c1..f61414bca 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -1,8 +1,9 @@ #include "python_engine.h" #include -#include -#include -#include + +#include "config/model_config.h" +#include "utils/file_manager_utils.h" +#include "utils/process/utils.h" namespace python_engine { namespace { @@ -10,7 +11,6 @@ constexpr const int k200OK = 200; constexpr const int k400BadRequest = 400; constexpr const int k409Conflict = 409; constexpr const int k500InternalServerError = 500; -constexpr const int kFileLoggerOption = 0; } // namespace cpp::result DownloadUv(std::shared_ptr& download_service) { diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 0da0c8412..988ccf9a1 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -1,36 +1,14 @@ #pragma once -#include #include -#include -#include 
#include #include #include -#include "config/model_config.h" -#include "trantor/utils/ConcurrentTaskQueue.h" #include "cortex-common/python_enginei.h" -#include "extensions/template_renderer.h" -#include "utils/file_logger.h" -#include "utils/file_manager_utils.h" -#include "utils/process_status_utils.h" -#include "utils/curl_utils.h" -#include "utils/process/utils.h" #include "services/download_service.h" -// Helper for CURL response namespace python_engine { -struct StreamContext { - std::shared_ptr> callback; - std::string buffer; -}; - -struct CurlResponse { - std::string body; - bool error{false}; - std::string error_message; -}; // UV-related functions cpp::result DownloadUv(std::shared_ptr& download_service); From 8ebee7cf9bbeeda224a0fdd50eb4946a38f252b0 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Feb 2025 08:31:08 +0800 Subject: [PATCH 18/73] remove unused stuff --- engine/cortex-common/python_enginei.h | 3 --- engine/services/inference_service.cc | 2 +- engine/services/model_service.cc | 18 +++++++----------- engine/utils/config_yaml_utils.h | 3 +-- engine/utils/engine_constants.h | 3 --- 5 files changed, 9 insertions(+), 20 deletions(-) diff --git a/engine/cortex-common/python_enginei.h b/engine/cortex-common/python_enginei.h index b0a02d8cc..ffde3d41b 100644 --- a/engine/cortex-common/python_enginei.h +++ b/engine/cortex-common/python_enginei.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include "json/value.h" #include "utils/result.hpp" @@ -10,8 +9,6 @@ class PythonEngineI { public: virtual ~PythonEngineI() {} - // virtual bool IsSupported(const std::string& f) = 0; - // model management virtual void LoadModel( std::shared_ptr json_body, diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index 431df8941..2b241692a 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -375,7 +375,7 @@ InferResult InferenceService::GetModels( InferResult InferenceService::FineTuning( std::shared_ptr json_body) { - std::string ne = kPythonRuntimeRepo; + std::string ne = kPythonEngine; Json::Value r; Json::Value stt; diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 9d02038b4..f0c6e760b 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -507,19 +507,15 @@ ModelService::DownloadModelFromCortexsoAsync( yaml_handler.ModelConfigFromFile(model_yml_item->localPath.string()); auto mc = yaml_handler.GetModelConfig(); - if (mc.engine == kPythonEngine) { // process for Python engine + mc.model = unique_model_id; - } else { - mc.model = unique_model_id; - - uint64_t model_size = 0; - for (const auto& item : finishedTask.items) { - model_size = model_size + item.bytes.value_or(0); - } - mc.size = model_size; - yaml_handler.UpdateModelConfig(mc); - yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); + uint64_t model_size = 0; + for (const auto& item : finishedTask.items) { + model_size = model_size + item.bytes.value_or(0); } + mc.size = model_size; + yaml_handler.UpdateModelConfig(mc); + yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); auto rel = file_manager_utils::ToRelativeCortexDataPath(model_yml_item->localPath); diff --git a/engine/utils/config_yaml_utils.h b/engine/utils/config_yaml_utils.h index 1749cd2d0..bb0f8f2d0 100644 --- a/engine/utils/config_yaml_utils.h +++ b/engine/utils/config_yaml_utils.h @@ -24,8 +24,7 @@ constexpr const auto kDefaultCorsEnabled = true; const std::vector kDefaultEnabledOrigins{ 
"http://localhost:39281", "http://127.0.0.1:39281", "http://0.0.0.0:39281"}; constexpr const auto kDefaultNoProxy = "example.com,::1,localhost,127.0.0.1"; -const std::vector kDefaultSupportedEngines{kLlamaEngine, - kPythonEngine}; +const std::vector kDefaultSupportedEngines{kLlamaEngine}; struct CortexConfig { std::string logFolderPath; diff --git a/engine/utils/engine_constants.h b/engine/utils/engine_constants.h index 3cad230bc..10d19b160 100644 --- a/engine/utils/engine_constants.h +++ b/engine/utils/engine_constants.h @@ -12,10 +12,7 @@ constexpr const auto kLocal = "local"; constexpr const auto kLlamaRepo = "cortex.llamacpp"; -constexpr const auto kPythonRuntimeRepo = "cortex.python"; - constexpr const auto kLlamaLibPath = "./engines/cortex.llamacpp"; -constexpr const auto kPythonRuntimeLibPath = "/engines/cortex.python"; // other constants constexpr auto static kHuggingFaceHost = "huggingface.co"; From 8f36adcddb6ccf18216ddd1b015861854a14cabc Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Feb 2025 09:16:05 +0800 Subject: [PATCH 19/73] download uv directly from github release --- .../extensions/python-engine/python_engine.cc | 71 ++++++++++--------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index f61414bca..5805931bf 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -4,6 +4,9 @@ #include "config/model_config.h" #include "utils/file_manager_utils.h" #include "utils/process/utils.h" +#include "utils/system_info_utils.h" +#include "utils/archive_utils.h" +#include "utils/set_permission_utils.h" namespace python_engine { namespace { @@ -14,46 +17,47 @@ constexpr const int k500InternalServerError = 500; } // namespace cpp::result DownloadUv(std::shared_ptr& download_service) { - const std::string py_bin_path = file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; + const auto py_bin_path = file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; std::filesystem::create_directories(py_bin_path); - const std::string uv_version = "0.5.31"; + // NOTE: do we need a mechanism to update uv, or just pin uv version with cortex release? 
+ const std::string uv_version = "0.6.2"; + + // build download url based on system info + std::stringstream fname_stream; + fname_stream << "uv-"; + + auto system_info = system_info_utils::GetSystemInfo(); + if (system_info->arch == "amd64") fname_stream << "x86_64"; + else if (system_info->arch == "arm64") fname_stream << "aarch64"; + + // NOTE: there is also a musl linux version + if (system_info->os == kMacOs) fname_stream << "-apple-darwin.tar.gz"; + else if (system_info->os == kWindowsOs) fname_stream << "-pc-windows-msvc.zip"; + else if (system_info->os == kLinuxOs) fname_stream << "-unknown-linux-gnu.tar.gz"; + + const std::string fname = fname_stream.str(); + const std::string base_url = "https://github.com/astral-sh/uv/releases/download/"; + const std::string url = (std::stringstream{} << base_url << uv_version << "/" << fname).str(); + CTL_INF("Download uv from " << url); - // NOTE: only works on MacOS and Linux auto on_finished = [py_bin_path, uv_version](const DownloadTask& finishedTask) { // try to unzip the downloaded file - const std::string installer_path = finishedTask.items[0].localPath.string(); - CTL_INF("UV install script path: " << installer_path); - CTL_INF("Version: " << uv_version); - - // https://docs.astral.sh/uv/configuration/installer/ - // TODO: move env var mod logic to SpawnProcess() - // using env to set env vars - // should we download from here instead? https://github.com/astral-sh/uv/releases - std::vector command{"env", - "UV_UNMANAGED_INSTALL=" + py_bin_path, - "sh", - installer_path, - "-q"}; - const auto pid = cortex::process::SpawnProcess(command); - if (pid == -1) { - CTL_ERR("Failed to install uv"); - } - // wait for subprocess to finish - // TODO: need to check return status if successful - waitpid(pid, NULL, 0); - std::filesystem::remove(installer_path); + const std::string download_path = finishedTask.items[0].localPath.string(); + + archive_utils::ExtractArchive(download_path, py_bin_path, true); + set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); + std::filesystem::remove(download_path); }; - const std::string url = "https://astral.sh/uv/" + uv_version + "/install.sh"; - auto downloadTask = - DownloadTask{.id = "uv", - .type = DownloadType::Engine, - .items = {DownloadItem{ - .id = "uv", - .downloadUrl = url, - .localPath = py_bin_path + "/install.sh", - }}}; + auto downloadTask = DownloadTask{.id = "uv", + .type = DownloadType::Engine, + .items = { + DownloadItem{ + .id = "uv", + .downloadUrl = url, + .localPath = py_bin_path / fname, + }}}; auto add_task_result = download_service->AddTask(downloadTask, on_finished); if (add_task_result.has_error()) { @@ -63,6 +67,7 @@ cpp::result DownloadUv(std::shared_ptr& down } std::string GetUvPath() { + // NOTE: do I need to add .exe for windows? 
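// On Windows the extracted binary is named uv.exe, so the path returned below needs the .exe suffix on that platform.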
return file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; } bool IsUvInstalled() { From 5ebfbb73e85fb41d10e98b4baa3557ba0a82ecf0 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Feb 2025 17:42:34 +0800 Subject: [PATCH 20/73] check for entrypoint --- engine/extensions/python-engine/python_engine.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 5805931bf..dc4b52ef8 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -145,6 +145,10 @@ void PythonEngine::LoadModel( config::PythonModelConfig py_cfg; py_cfg.ReadFromYaml(model_dir / "model.yml"); + if (py_cfg.entrypoint == "") { + throw std::runtime_error("Missing entrypoint in model.yml"); + } + // NOTE: model_dir / entrypoint assumes a Python script // TODO: figure out if we can support arbitrary CLI (but still launch by uv) std::vector command{GetUvPath(), "run", model_dir / py_cfg.entrypoint}; @@ -158,7 +162,8 @@ void PythonEngine::LoadModel( if (!std::filesystem::exists(stdout_path)) std::ofstream(stdout_path).flush(); if (!std::filesystem::exists(stderr_path)) std::ofstream(stderr_path).flush(); - // TODO: what happens if the process starts, but exits? + // NOTE: process may start, but exits/crashes later + // TODO: wait for a few seconds, then check if process is alive pid = cortex::process::SpawnProcess(command, stdout_path, stderr_path); if (pid == -1) { throw std::runtime_error("Fail to spawn process with pid -1"); From 5d310d121b7ecdfdf49debfd3d13255e2894db39 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Feb 2025 19:42:18 +0800 Subject: [PATCH 21/73] only record model size for llama.cpp --- engine/services/model_service.cc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index f0c6e760b..142933ff6 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -507,15 +507,17 @@ ModelService::DownloadModelFromCortexsoAsync( yaml_handler.ModelConfigFromFile(model_yml_item->localPath.string()); auto mc = yaml_handler.GetModelConfig(); - mc.model = unique_model_id; + if (mc.engine == kLlamaEngine) { + mc.model = unique_model_id; - uint64_t model_size = 0; - for (const auto& item : finishedTask.items) { - model_size = model_size + item.bytes.value_or(0); + uint64_t model_size = 0; + for (const auto& item : finishedTask.items) { + model_size = model_size + item.bytes.value_or(0); + } + mc.size = model_size; + yaml_handler.UpdateModelConfig(mc); + yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); } - mc.size = model_size; - yaml_handler.UpdateModelConfig(mc); - yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); auto rel = file_manager_utils::ToRelativeCortexDataPath(model_yml_item->localPath); From c4c622cb0f70c0bc86f86fbd938fcf2dabb7e4f0 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Feb 2025 20:17:08 +0800 Subject: [PATCH 22/73] don't include headers --- engine/controllers/server.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/engine/controllers/server.cc b/engine/controllers/server.cc index 374ca40b8..0b01d06e1 100644 --- a/engine/controllers/server.cc +++ b/engine/controllers/server.cc @@ -249,9 +249,10 @@ void server::Python( new_req->setBody(std::string{req->body()}); 
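One likely reason the header copy below was disabled: headers such as Host and Content-Length from the original request no longer match the proxied one, and strict upstream servers can reject it. A middle ground, sketched here and not part of the patch, is to forward only a small allow-list (this assumes <unordered_set> is available and lower-case header keys; adjust to how the framework actually stores them):

    // sketch only: forward an allow-list of headers instead of all or none
    static const std::unordered_set<std::string> kForwardHeaders{"authorization", "accept"};
    for (const auto& [field, value] : req->headers()) {
      if (kForwardHeaders.count(field) > 0)
        new_req->addHeader(field, value);
    }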
new_req->setContentTypeCode(req->getContentType()); - for (const auto& [field, value] : req->headers()) { - new_req->addHeader(field, value); - } + // including headers may make FastAPI reqject the request... + // for (const auto& [field, value] : req->headers()) { + // new_req->addHeader(field, value); + // } CTL_INF("Route request to " << host << path); auto cb = [callback](ReqResult result, const HttpResponsePtr& response) { From 6b59878399e6566741a8af107c02e931c4b2ec5c Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 09:23:36 +0800 Subject: [PATCH 23/73] don't use std::optional to support < c++17 --- engine/utils/process/utils.cc | 20 +++++++++----------- engine/utils/process/utils.h | 4 ++-- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 624b62262..da61661a6 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -40,8 +40,8 @@ std::vector ConvertToArgv(const std::vector& args) { } pid_t SpawnProcess(const std::vector& command, - const std::optional stdout_file, - const std::optional stderr_file) { + const std::string stdout_file, + const std::string stderr_file) { try { #if defined(_WIN32) // Windows process creation @@ -90,25 +90,23 @@ pid_t SpawnProcess(const std::vector& command, // caller should make sure the redirect files exist. posix_spawn_file_actions_t *action_ptr = NULL; - if (stdout_file.has_value() || stderr_file.has_value()) { + if (!stdout_file.empty() || !stderr_file.empty()) { posix_spawn_file_actions_t action; posix_spawn_file_actions_init(&action); action_ptr = &action; - if (stdout_file.has_value()) { - std::string stdout_file_val = stdout_file.value(); - if (std::filesystem::exists(stdout_file_val)) { + if (!stdout_file.empty()) { + if (std::filesystem::exists(stdout_file)) { posix_spawn_file_actions_addopen(&action, STDOUT_FILENO, - stdout_file_val.data(), + stdout_file.data(), O_WRONLY | O_APPEND, 0); } } - if (stderr_file.has_value()) { - std::string stderr_file_val = stderr_file.value(); - if (std::filesystem::exists(stderr_file_val)) { + if (!stderr_file.empty()) { + if (std::filesystem::exists(stderr_file)) { posix_spawn_file_actions_addopen(&action, STDERR_FILENO, - stderr_file_val.data(), + stderr_file.data(), O_WRONLY | O_APPEND, 0); } } diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index 813d53750..d59e50103 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -21,8 +21,8 @@ std::string ConstructWindowsCommandLine(const std::vector& args); std::vector ConvertToArgv(const std::vector& args); pid_t SpawnProcess(const std::vector& command, - const std::optional stdout_file = {}, - const std::optional stderr_file = {}); + const std::string stdout_file = "", + const std::string stderr_file = ""); bool IsProcessAlive(pid_t pid); bool KillProcess(pid_t pid); From 250a2ac8682080dce9e57bf425b99a6f8baadc5b Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 09:24:55 +0800 Subject: [PATCH 24/73] fix stringstream usage --- engine/extensions/python-engine/python_engine.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index dc4b52ef8..b58a64eff 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -38,7 +38,10 @@ cpp::result DownloadUv(std::shared_ptr& down const std::string fname = 
fname_stream.str(); const std::string base_url = "https://github.com/astral-sh/uv/releases/download/"; - const std::string url = (std::stringstream{} << base_url << uv_version << "/" << fname).str(); + + std::stringstream url_stream; + url_stream << base_url << uv_version << "/" << fname; + const std::string url = url_stream.str(); CTL_INF("Download uv from " << url); auto on_finished = [py_bin_path, uv_version](const DownloadTask& finishedTask) { From bb38a563c64491f480f8b09bc5fa812c4f1cb35f Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 09:43:39 +0800 Subject: [PATCH 25/73] define pid_t for windows --- engine/extensions/python-engine/python_engine.cc | 1 - engine/extensions/python-engine/python_engine.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index b58a64eff..c380a45be 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -3,7 +3,6 @@ #include "config/model_config.h" #include "utils/file_manager_utils.h" -#include "utils/process/utils.h" #include "utils/system_info_utils.h" #include "utils/archive_utils.h" #include "utils/set_permission_utils.h" diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 988ccf9a1..c1b5ec0dd 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -7,6 +7,7 @@ #include "cortex-common/python_enginei.h" #include "services/download_service.h" +#include "utils/process/utils.h" namespace python_engine { From 723c5db5646a23d2fc11d7f40e7a09ce41554ab7 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 10:09:04 +0800 Subject: [PATCH 26/73] explicit call .string() on filesystem::path to support windows --- engine/extensions/python-engine/python_engine.cc | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index c380a45be..a414f1cd5 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -47,7 +47,7 @@ cpp::result DownloadUv(std::shared_ptr& down // try to unzip the downloaded file const std::string download_path = finishedTask.items[0].localPath.string(); - archive_utils::ExtractArchive(download_path, py_bin_path, true); + archive_utils::ExtractArchive(download_path, py_bin_path.string(), true); set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); std::filesystem::remove(download_path); }; @@ -70,7 +70,9 @@ cpp::result DownloadUv(std::shared_ptr& down std::string GetUvPath() { // NOTE: do I need to add .exe for windows? 
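// Note on the .string() calls added below: on Windows std::filesystem::path stores wide characters, so it does not convert implicitly to std::string; the explicit conversions are what keep the Windows build working.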
- return file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; + const auto path = file_manager_utils::GetCortexDataPath() + / "python_engine" / "bin" / "uv"; + return path.string(); } bool IsUvInstalled() { return std::filesystem::exists(GetUvPath()); @@ -145,7 +147,7 @@ void PythonEngine::LoadModel( pid_t pid; try { config::PythonModelConfig py_cfg; - py_cfg.ReadFromYaml(model_dir / "model.yml"); + py_cfg.ReadFromYaml((model_dir / "model.yml").string()); if (py_cfg.entrypoint == "") { throw std::runtime_error("Missing entrypoint in model.yml"); @@ -153,12 +155,13 @@ void PythonEngine::LoadModel( // NOTE: model_dir / entrypoint assumes a Python script // TODO: figure out if we can support arbitrary CLI (but still launch by uv) - std::vector command{GetUvPath(), "run", model_dir / py_cfg.entrypoint}; + const std::string entrypoint = (model_dir / py_cfg.entrypoint).string(); + std::vector command{GetUvPath(), "run", entrypoint}; for (const auto& item : py_cfg.extra_args) command.push_back(item); - const std::string stdout_path = model_dir / "stdout.txt"; - const std::string stderr_path = model_dir / "stderr.txt"; + const std::string stdout_path = (model_dir / "stdout.txt").string(); + const std::string stderr_path = (model_dir / "stderr.txt").string(); // create empty stdout.txt and stderr.txt for redirection if (!std::filesystem::exists(stdout_path)) std::ofstream(stdout_path).flush(); From 26ec20a29182eaf482fe42df9216b3d3a7009270 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 10:45:40 +0800 Subject: [PATCH 27/73] include extra_args in entrypoint --- engine/config/model_config.h | 62 +++++++------------ .../extensions/python-engine/python_engine.cc | 13 ++-- 2 files changed, 31 insertions(+), 44 deletions(-) diff --git a/engine/config/model_config.h b/engine/config/model_config.h index 85335c37b..c7abf75f1 100644 --- a/engine/config/model_config.h +++ b/engine/config/model_config.h @@ -477,40 +477,33 @@ struct Endpoint { }; struct PythonModelConfig { - // General Metadata std::string name; int version; - - // Model Load Parameters std::string engine; - std::string entrypoint; + + std::vector entrypoint; int port; - std::vector extra_args; // Method to convert C++ struct to YAML void ToYaml(const std::string& filepath) const { YAML::Emitter out; out << YAML::BeginMap; - // General Metadata out << YAML::Key << "name" << YAML::Value << name; out << YAML::Key << "version" << YAML::Value << version; - - // Model Load Parameters out << YAML::Key << "engine" << YAML::Value << engine; - out << YAML::Key << "entrypoint" << YAML::Value << entrypoint; - out << YAML::Key << "port" << YAML::Value << port; - // Extra Arguments - if (!extra_args.empty()) { - out << YAML::Key << "extra_args" << YAML::Value << YAML::BeginSeq; - for (const auto& arg : extra_args) { + // entrypoint + if (!entrypoint.empty()) { + out << YAML::Key << "entrypoint" << YAML::Value << YAML::BeginSeq; + for (const auto& arg : entrypoint) { out << arg; } out << YAML::EndSeq; } out << YAML::EndMap; + out << YAML::Key << "port" << YAML::Value << port; // Write to file std::ofstream fout(filepath); @@ -525,22 +518,18 @@ struct PythonModelConfig { try { YAML::Node config = YAML::LoadFile(filePath); - // General Metadata if (config["name"]) name = config["name"].as(); if (config["version"]) version = config["version"].as(); - - // Model Load Parameters if (config["engine"]) engine = config["engine"].as(); - if (config["entrypoint"]) entrypoint = config["entrypoint"].as(); - if (config["port"]) port = 
config["port"].as(); - // Extra Arguments - if (config["extra_args"] && config["extra_args"].IsSequence()) { - extra_args.clear(); - for (const auto& arg : config["extra_args"]) { - extra_args.push_back(arg.as()); + // entrypoint + if (config["entrypoint"] && config["entrypoint"].IsSequence()) { + entrypoint.clear(); + for (const auto& arg : config["entrypoint"]) { + entrypoint.push_back(arg.as()); } } + if (config["port"]) port = config["port"].as(); } catch (const YAML::Exception& e) { throw std::runtime_error("Error parsing YAML file: " + std::string(e.what())); @@ -554,21 +543,19 @@ struct PythonModelConfig { Json::Value ToJson() const { Json::Value json; - // Add basic string fields json["name"] = name; json["version"] = version; json["engine"] = engine; - json["entrypoint"] = entrypoint; - json["port"] = port; - // Add extra_args array - if (!extra_args.empty()) { + // entrypoint + if (!entrypoint.empty()) { Json::Value args(Json::arrayValue); - for (const auto& arg : extra_args) { + for (const auto& arg : entrypoint) { args.append(arg); } - json["extra_args"] = args; + json["entrypoint"] = args; } + json["port"] = port; return json; } @@ -579,21 +566,20 @@ struct PythonModelConfig { throw std::runtime_error("Input JSON must be an object"); } try { - // Basic fields name = root.get("name", name).asString(); version = root.get("version", version).asInt(); engine = root.get("engine", engine).asString(); - entrypoint = root.get("entrypoint", entrypoint).asString(); - port = root.get("port", port).asInt(); - // Extra args array - extra_args.clear(); - const Json::Value& args = root["extra_args"]; + // entrypoint + entrypoint.clear(); + const Json::Value& args = root["entrypoint"]; if (args.isArray()) { for (const auto& arg : args) { - extra_args.push_back(arg.asString()); + entrypoint.push_back(arg.asString()); } } + port = root.get("port", port).asInt(); + } catch (const Json::Exception& e) { throw std::runtime_error("Error parsing JSON: " + std::string(e.what())); } catch (const std::exception& e) { diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index a414f1cd5..ffe873b71 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -149,15 +149,16 @@ void PythonEngine::LoadModel( config::PythonModelConfig py_cfg; py_cfg.ReadFromYaml((model_dir / "model.yml").string()); - if (py_cfg.entrypoint == "") { + if (py_cfg.entrypoint.empty()) { throw std::runtime_error("Missing entrypoint in model.yml"); } - // NOTE: model_dir / entrypoint assumes a Python script - // TODO: figure out if we can support arbitrary CLI (but still launch by uv) - const std::string entrypoint = (model_dir / py_cfg.entrypoint).string(); - std::vector command{GetUvPath(), "run", entrypoint}; - for (const auto& item : py_cfg.extra_args) + // https://docs.astral.sh/uv/reference/cli/#uv-run + std::vector command{GetUvPath(), + "run", + "--directory", + model_dir.string()}; + for (const auto& item : py_cfg.entrypoint) command.push_back(item); const std::string stdout_path = (model_dir / "stdout.txt").string(); From 376deeb760599150e415cc4026978b440419983e Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 11:47:35 +0800 Subject: [PATCH 28/73] add python engine install test --- engine/e2e-test/test_api_engine.py | 16 +++++++++++++--- .../e2e-test/test_api_engine_install_nightly.py | 4 ++++ engine/e2e-test/test_cli_engine_install.py | 11 +++++++++++ 
.../e2e-test/test_cli_engine_install_nightly.py | 11 +++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/engine/e2e-test/test_api_engine.py b/engine/e2e-test/test_api_engine.py index e652e4495..e94b85f51 100644 --- a/engine/e2e-test/test_api_engine.py +++ b/engine/e2e-test/test_api_engine.py @@ -20,12 +20,12 @@ def setup_and_teardown(self): # Teardown stop_server() - + # engines get def test_engines_get_llamacpp_should_be_successful(self): response = requests.get("http://localhost:3928/engines/llama-cpp") assert response.status_code == 200 - + # engines install def test_engines_install_llamacpp_specific_version_and_variant(self): data = {"version": "v0.1.40-b4354", "variant": "linux-amd64-avx-cuda-11-7"} @@ -40,7 +40,7 @@ def test_engines_install_llamacpp_specific_version_and_null_variant(self): "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) assert response.status_code == 200 - + # engines uninstall @pytest.mark.asyncio async def test_engines_install_uninstall_llamacpp_should_be_successful(self): @@ -52,6 +52,16 @@ async def test_engines_install_uninstall_llamacpp_should_be_successful(self): response = requests.delete("http://localhost:3928/v1/engines/llama-cpp/install") assert response.status_code == 200 + @pytest.mark.asyncio + async def test_engines_install_uninstall_python_should_be_successful(self): + response = requests.post("http://localhost:3928/v1/engines/python/install") + assert response.status_code == 200 + await wait_for_websocket_download_success_event(timeout=None) + time.sleep(30) + + response = requests.delete("http://localhost:3928/v1/engines/python/install") + assert response.status_code == 200 + @pytest.mark.asyncio async def test_engines_install_uninstall_llamacpp_with_only_version_should_be_failed(self): # install first diff --git a/engine/e2e-test/test_api_engine_install_nightly.py b/engine/e2e-test/test_api_engine_install_nightly.py index de4914c28..4f13d95c8 100644 --- a/engine/e2e-test/test_api_engine_install_nightly.py +++ b/engine/e2e-test/test_api_engine_install_nightly.py @@ -22,6 +22,10 @@ def test_engines_install_llamacpp_should_be_successful(self): response = requests.post("http://localhost:3928/v1/engines/llama-cpp/install") assert response.status_code == 200 + def test_engines_install_python_should_be_successful(self): + response = requests.post("http://localhost:3928/v1/engines/python/install") + assert response.status_code == 200 + def test_engines_install_llamacpp_specific_version_and_variant(self): data = {"version": latest_pre_release_tag, "variant": "linux-amd64-avx-cuda-11-7"} response = requests.post( diff --git a/engine/e2e-test/test_cli_engine_install.py b/engine/e2e-test/test_cli_engine_install.py index aeeabd64d..2a23c8866 100644 --- a/engine/e2e-test/test_cli_engine_install.py +++ b/engine/e2e-test/test_cli_engine_install.py @@ -31,6 +31,17 @@ def test_engines_install_llamacpp_should_be_successfully(self): assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" + def test_engines_install_python_should_be_successfully(self): + exit_code, output, error = run( + "Install Engine", + ["engines", "install", "python"], + timeout=None, + capture=False, + ) + response = requests.get("http://127.0.0.1:3928/v1/engines/python") + assert len(response.json()) > 0 + assert exit_code == 0, f"Install engine failed with error: {error}" + @pytest.mark.skipif(reason="Ignore onnx-runtime test") def test_engines_install_onnx_on_macos_should_be_failed(self): exit_code, output, 
error = run( diff --git a/engine/e2e-test/test_cli_engine_install_nightly.py b/engine/e2e-test/test_cli_engine_install_nightly.py index 80490ab55..09af3ab2f 100644 --- a/engine/e2e-test/test_cli_engine_install_nightly.py +++ b/engine/e2e-test/test_cli_engine_install_nightly.py @@ -31,6 +31,17 @@ def test_engines_install_llamacpp_should_be_successfully(self): assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" + def test_engines_install_python_should_be_successfully(self): + exit_code, output, error = run( + "Install Engine", + ["engines", "install", "python"], + timeout=None, + capture=False, + ) + response = requests.get("http://127.0.0.1:3928/v1/engines/python") + assert len(response.json()) > 0 + assert exit_code == 0, f"Install engine failed with error: {error}" + @pytest.mark.skipif(reason="Ignore onnx-runtime test") def test_engines_install_onnx_on_macos_should_be_failed(self): exit_code, output, error = run( From a9ed820cdf01dda8746767196374d61bfe991b9c Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 12:00:52 +0800 Subject: [PATCH 29/73] add start time --- engine/extensions/python-engine/python_engine.cc | 8 +++++++- engine/extensions/python-engine/python_engine.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index ffe873b71..eb09eec98 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -174,8 +174,10 @@ void PythonEngine::LoadModel( if (pid == -1) { throw std::runtime_error("Fail to spawn process with pid -1"); } + const uint64_t start_time = std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); std::unique_lock write_lock(mutex); - model_process_map[model] = {pid, py_cfg.port}; + model_process_map[model] = {pid, py_cfg.port, start_time}; } catch (const std::exception& e) { auto e_msg = e.what(); @@ -305,8 +307,12 @@ void PythonEngine::GetModels( Json::Value val; val["id"] = model_name; val["engine"] = kPythonEngine; + val["start_time"] = py_proc.start_time; val["port"] = py_proc.port; val["object"] = "model"; + // TODO + // val["ram"]; + // val["vram"]; model_list.append(val); } } diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index c1b5ec0dd..6189da05f 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -21,6 +21,7 @@ class PythonEngine : public PythonEngineI { struct PythonSubprocess { pid_t pid; int port; + uint64_t start_time; bool IsAlive(); bool Kill(); From db8213438ae04c2a79a87943ff3a09ef693ddbf8 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 16:26:24 +0800 Subject: [PATCH 30/73] add back python engine to default supported engine so that cortex engines install work --- engine/utils/config_yaml_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/utils/config_yaml_utils.h b/engine/utils/config_yaml_utils.h index bb0f8f2d0..502fdc166 100644 --- a/engine/utils/config_yaml_utils.h +++ b/engine/utils/config_yaml_utils.h @@ -24,7 +24,7 @@ constexpr const auto kDefaultCorsEnabled = true; const std::vector kDefaultEnabledOrigins{ "http://localhost:39281", "http://127.0.0.1:39281", "http://0.0.0.0:39281"}; constexpr const auto kDefaultNoProxy = "example.com,::1,localhost,127.0.0.1"; -const std::vector 
kDefaultSupportedEngines{kLlamaEngine}; +const std::vector kDefaultSupportedEngines{kLlamaEngine, kPythonEngine}; struct CortexConfig { std::string logFolderPath; From 79464a230e25c40bad726846ed50a16e56bf8f2b Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 25 Feb 2025 12:18:50 +0800 Subject: [PATCH 31/73] format --- engine/config/model_config.h | 37 ++++---- engine/controllers/server.cc | 11 +-- engine/controllers/server.h | 7 +- engine/cortex-common/python_enginei.h | 16 ++-- .../extensions/python-engine/python_engine.cc | 94 ++++++++++--------- .../extensions/python-engine/python_engine.h | 19 ++-- engine/services/engine_service.cc | 7 +- engine/services/engine_service.h | 1 - engine/services/inference_service.cc | 37 ++++---- engine/services/model_service.cc | 6 +- engine/utils/process/utils.cc | 6 +- engine/utils/process/utils.h | 4 +- 12 files changed, 128 insertions(+), 117 deletions(-) diff --git a/engine/config/model_config.h b/engine/config/model_config.h index c7abf75f1..80a5f5df8 100644 --- a/engine/config/model_config.h +++ b/engine/config/model_config.h @@ -518,9 +518,12 @@ struct PythonModelConfig { try { YAML::Node config = YAML::LoadFile(filePath); - if (config["name"]) name = config["name"].as(); - if (config["version"]) version = config["version"].as(); - if (config["engine"]) engine = config["engine"].as(); + if (config["name"]) + name = config["name"].as(); + if (config["version"]) + version = config["version"].as(); + if (config["engine"]) + engine = config["engine"].as(); // entrypoint if (config["entrypoint"] && config["entrypoint"].IsSequence()) { @@ -529,13 +532,14 @@ struct PythonModelConfig { entrypoint.push_back(arg.as()); } } - if (config["port"]) port = config["port"].as(); - } - catch (const YAML::Exception& e) { - throw std::runtime_error("Error parsing YAML file: " + std::string(e.what())); - } - catch (const std::exception& e) { - throw std::runtime_error("Error reading YAML file: " + std::string(e.what())); + if (config["port"]) + port = config["port"].as(); + } catch (const YAML::Exception& e) { + throw std::runtime_error("Error parsing YAML file: " + + std::string(e.what())); + } catch (const std::exception& e) { + throw std::runtime_error("Error reading YAML file: " + + std::string(e.what())); } } @@ -549,11 +553,11 @@ struct PythonModelConfig { // entrypoint if (!entrypoint.empty()) { - Json::Value args(Json::arrayValue); - for (const auto& arg : entrypoint) { - args.append(arg); - } - json["entrypoint"] = args; + Json::Value args(Json::arrayValue); + for (const auto& arg : entrypoint) { + args.append(arg); + } + json["entrypoint"] = args; } json["port"] = port; @@ -583,7 +587,8 @@ struct PythonModelConfig { } catch (const Json::Exception& e) { throw std::runtime_error("Error parsing JSON: " + std::string(e.what())); } catch (const std::exception& e) { - throw std::runtime_error("Error processing JSON data: " + std::string(e.what())); + throw std::runtime_error("Error processing JSON data: " + + std::string(e.what())); } } }; diff --git a/engine/controllers/server.cc b/engine/controllers/server.cc index 0b01d06e1..ebc8639de 100644 --- a/engine/controllers/server.cc +++ b/engine/controllers/server.cc @@ -210,17 +210,16 @@ void server::RouteRequest( } } -void server::Python( - const HttpRequestPtr& req, - std::function&& callback, - const std::string& model) { +void server::Python(const HttpRequestPtr& req, + std::function&& callback, + const std::string& model) { const std::string& full_path = req->getPath(); const std::string prefix = 
"/v1/python/"; if (full_path.substr(0, prefix.size()) != prefix) { auto resp = cortex_utils::CreateCortexHttpJsonResponse( - Json::Value("Invalid path: must start with " + prefix)); + Json::Value("Invalid path: must start with " + prefix)); resp->setStatusCode(k400BadRequest); callback(resp); return; @@ -232,7 +231,7 @@ void server::Python( auto port_result = inference_svc_->GetPythonPort(model); if (port_result.has_error()) { auto resp = cortex_utils::CreateCortexHttpJsonResponse( - Json::Value(port_result.error())); + Json::Value(port_result.error())); resp->setStatusCode(k400BadRequest); callback(resp); return; diff --git a/engine/controllers/server.h b/engine/controllers/server.h index 99b545d0b..e0e083213 100644 --- a/engine/controllers/server.h +++ b/engine/controllers/server.h @@ -78,10 +78,9 @@ class server : public drogon::HttpController, std::function&& callback); void RouteRequest(const HttpRequestPtr& req, std::function&& callback); - void Python( - const HttpRequestPtr& req, - std::function&& callback, - const std::string& model); + void Python(const HttpRequestPtr& req, + std::function&& callback, + const std::string& model); private: void ProcessStreamRes(std::function cb, diff --git a/engine/cortex-common/python_enginei.h b/engine/cortex-common/python_enginei.h index ffde3d41b..35470f008 100644 --- a/engine/cortex-common/python_enginei.h +++ b/engine/cortex-common/python_enginei.h @@ -11,17 +11,17 @@ class PythonEngineI { // model management virtual void LoadModel( - std::shared_ptr json_body, - std::function&& callback) = 0; + std::shared_ptr json_body, + std::function&& callback) = 0; virtual void UnloadModel( - std::shared_ptr json_body, - std::function&& callback) = 0; + std::shared_ptr json_body, + std::function&& callback) = 0; virtual void GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) = 0; + std::shared_ptr json_body, + std::function&& callback) = 0; virtual void GetModels( - std::shared_ptr jsonBody, - std::function&& callback) = 0; + std::shared_ptr jsonBody, + std::function&& callback) = 0; virtual cpp::result GetPort(const std::string& model) = 0; }; diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index eb09eec98..635e35a78 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -2,10 +2,10 @@ #include #include "config/model_config.h" -#include "utils/file_manager_utils.h" -#include "utils/system_info_utils.h" #include "utils/archive_utils.h" +#include "utils/file_manager_utils.h" #include "utils/set_permission_utils.h" +#include "utils/system_info_utils.h" namespace python_engine { namespace { @@ -15,8 +15,10 @@ constexpr const int k409Conflict = 409; constexpr const int k500InternalServerError = 500; } // namespace -cpp::result DownloadUv(std::shared_ptr& download_service) { - const auto py_bin_path = file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; +cpp::result DownloadUv( + std::shared_ptr& download_service) { + const auto py_bin_path = + file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; std::filesystem::create_directories(py_bin_path); // NOTE: do we need a mechanism to update uv, or just pin uv version with cortex release? 
@@ -27,23 +29,30 @@ cpp::result DownloadUv(std::shared_ptr& down fname_stream << "uv-"; auto system_info = system_info_utils::GetSystemInfo(); - if (system_info->arch == "amd64") fname_stream << "x86_64"; - else if (system_info->arch == "arm64") fname_stream << "aarch64"; + if (system_info->arch == "amd64") + fname_stream << "x86_64"; + else if (system_info->arch == "arm64") + fname_stream << "aarch64"; // NOTE: there is also a musl linux version - if (system_info->os == kMacOs) fname_stream << "-apple-darwin.tar.gz"; - else if (system_info->os == kWindowsOs) fname_stream << "-pc-windows-msvc.zip"; - else if (system_info->os == kLinuxOs) fname_stream << "-unknown-linux-gnu.tar.gz"; + if (system_info->os == kMacOs) + fname_stream << "-apple-darwin.tar.gz"; + else if (system_info->os == kWindowsOs) + fname_stream << "-pc-windows-msvc.zip"; + else if (system_info->os == kLinuxOs) + fname_stream << "-unknown-linux-gnu.tar.gz"; const std::string fname = fname_stream.str(); - const std::string base_url = "https://github.com/astral-sh/uv/releases/download/"; + const std::string base_url = + "https://github.com/astral-sh/uv/releases/download/"; std::stringstream url_stream; url_stream << base_url << uv_version << "/" << fname; const std::string url = url_stream.str(); CTL_INF("Download uv from " << url); - auto on_finished = [py_bin_path, uv_version](const DownloadTask& finishedTask) { + auto on_finished = [py_bin_path, + uv_version](const DownloadTask& finishedTask) { // try to unzip the downloaded file const std::string download_path = finishedTask.items[0].localPath.string(); @@ -54,12 +63,11 @@ cpp::result DownloadUv(std::shared_ptr& down auto downloadTask = DownloadTask{.id = "uv", .type = DownloadType::Engine, - .items = { - DownloadItem{ - .id = "uv", - .downloadUrl = url, - .localPath = py_bin_path / fname, - }}}; + .items = {DownloadItem{ + .id = "uv", + .downloadUrl = url, + .localPath = py_bin_path / fname, + }}}; auto add_task_result = download_service->AddTask(downloadTask, on_finished); if (add_task_result.has_error()) { @@ -70,8 +78,8 @@ cpp::result DownloadUv(std::shared_ptr& down std::string GetUvPath() { // NOTE: do I need to add .exe for windows? - const auto path = file_manager_utils::GetCortexDataPath() - / "python_engine" / "bin" / "uv"; + const auto path = + file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; return path.string(); } bool IsUvInstalled() { @@ -91,7 +99,8 @@ PythonEngine::~PythonEngine() { // NOTE: what happens if we can't kill subprocess? 
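For orientation, the load path further below consumes a model.yml shaped roughly like this; a hypothetical example with invented values, field names taken from PythonModelConfig:

    # model.yml for a python-engine model (illustrative only)
    name: my-python-model
    version: 1
    engine: python
    entrypoint: ["main.py", "--port", "8080"]
    port: 8080

Given such a file, the engine runs the entrypoint with uv run from the model directory and proxies requests for the model to the declared port.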
std::unique_lock write_lock(mutex); for (auto& [model_name, py_proc] : model_process_map) { - if (py_proc.IsAlive()) py_proc.Kill(); + if (py_proc.IsAlive()) + py_proc.Kill(); } } @@ -109,8 +118,7 @@ static std::pair CreateResponse( if (has_error) { CTL_ERR(msg); res["error"] = msg; - } - else { + } else { res["status"] = msg; } @@ -123,7 +131,7 @@ void PythonEngine::LoadModel( if (!json_body->isMember("model") || !json_body->isMember("model_dir")) { auto [status, error] = CreateResponse( - "Missing required fields: model or model_dir", k400BadRequest); + "Missing required fields: model or model_dir", k400BadRequest); callback(std::move(status), std::move(error)); return; } @@ -137,8 +145,8 @@ void PythonEngine::LoadModel( { std::shared_lock read_lock(mutex); if (model_process_map.find(model) != model_process_map.end()) { - auto [status, error] = CreateResponse( - "Model already loaded!", k409Conflict); + auto [status, error] = + CreateResponse("Model already loaded!", k409Conflict); callback(std::move(status), std::move(error)); return; } @@ -154,9 +162,7 @@ void PythonEngine::LoadModel( } // https://docs.astral.sh/uv/reference/cli/#uv-run - std::vector command{GetUvPath(), - "run", - "--directory", + std::vector command{GetUvPath(), "run", "--directory", model_dir.string()}; for (const auto& item : py_cfg.entrypoint) command.push_back(item); @@ -165,8 +171,10 @@ void PythonEngine::LoadModel( const std::string stderr_path = (model_dir / "stderr.txt").string(); // create empty stdout.txt and stderr.txt for redirection - if (!std::filesystem::exists(stdout_path)) std::ofstream(stdout_path).flush(); - if (!std::filesystem::exists(stderr_path)) std::ofstream(stderr_path).flush(); + if (!std::filesystem::exists(stdout_path)) + std::ofstream(stdout_path).flush(); + if (!std::filesystem::exists(stderr_path)) + std::ofstream(stderr_path).flush(); // NOTE: process may start, but exits/crashes later // TODO: wait for a few seconds, then check if process is alive @@ -174,8 +182,9 @@ void PythonEngine::LoadModel( if (pid == -1) { throw std::runtime_error("Fail to spawn process with pid -1"); } - const uint64_t start_time = std::chrono::system_clock::now().time_since_epoch() / - std::chrono::milliseconds(1); + const uint64_t start_time = + std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); std::unique_lock write_lock(mutex); model_process_map[model] = {pid, py_cfg.port, start_time}; @@ -187,17 +196,17 @@ void PythonEngine::LoadModel( } auto [status, res] = CreateResponse( - "Model loaded successfully with pid: " + std::to_string(pid), - k200OK); + "Model loaded successfully with pid: " + std::to_string(pid), k200OK); callback(std::move(status), std::move(res)); } void PythonEngine::UnloadModel( - std::shared_ptr json_body, - std::function&& callback) { + std::shared_ptr json_body, + std::function&& callback) { if (!json_body->isMember("model")) { - auto [status, error] = CreateResponse("Missing required field: model", k400BadRequest); + auto [status, error] = + CreateResponse("Missing required field: model", k400BadRequest); callback(std::move(status), std::move(error)); return; } @@ -248,11 +257,12 @@ void PythonEngine::UnloadModel( } void PythonEngine::GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) { + std::shared_ptr json_body, + std::function&& callback) { if (!json_body->isMember("model")) { - auto [status, error] = CreateResponse("Missing required field: model", k400BadRequest); + auto [status, error] = + CreateResponse("Missing 
required field: model", k400BadRequest); callback(std::move(status), std::move(error)); return; } @@ -296,8 +306,8 @@ void PythonEngine::GetModelStatus( } void PythonEngine::GetModels( - std::shared_ptr jsonBody, - std::function&& callback) { + std::shared_ptr jsonBody, + std::function&& callback) { Json::Value res, model_list(Json::arrayValue), status; { diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 6189da05f..904c9aa63 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -12,7 +12,8 @@ namespace python_engine { // UV-related functions -cpp::result DownloadUv(std::shared_ptr& download_service); +cpp::result DownloadUv( + std::shared_ptr& download_service); std::string GetUvPath(); bool IsUvInstalled(); @@ -35,17 +36,17 @@ class PythonEngine : public PythonEngineI { ~PythonEngine(); void LoadModel( - std::shared_ptr json_body, - std::function&& callback) override; + std::shared_ptr json_body, + std::function&& callback) override; void UnloadModel( - std::shared_ptr json_body, - std::function&& callback) override; + std::shared_ptr json_body, + std::function&& callback) override; void GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) override; + std::shared_ptr json_body, + std::function&& callback) override; void GetModels( - std::shared_ptr jsonBody, - std::function&& callback) override; + std::shared_ptr jsonBody, + std::function&& callback) override; cpp::result GetPort(const std::string& model) override; }; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 9666c93ad..bdb5ffebb 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -237,8 +237,7 @@ cpp::result EngineService::DownloadEngine( } cpp::result EngineService::DownloadLlamaCpp( - const std::string& version, - const std::optional variant_name) { + const std::string& version, const std::optional variant_name) { const std::string engine = kLlamaRepo; auto normalized_version = version == "latest" @@ -930,7 +929,9 @@ cpp::result EngineService::IsEngineReady( // Check for python engine if (engine == kPythonEngine) { if (!python_engine::IsUvInstalled()) { - return cpp::fail("Python engine is not ready. Please run `cortex engines install python`"); + return cpp::fail( + "Python engine is not ready. 
Please run `cortex engines install " + "python`"); } return true; } diff --git a/engine/services/engine_service.h b/engine/services/engine_service.h index 18631c279..b2cc1d7c4 100644 --- a/engine/services/engine_service.h +++ b/engine/services/engine_service.h @@ -132,7 +132,6 @@ class EngineService : public EngineServiceI { cpp::result UpdateEngine( const std::string& engine); - cpp::result, std::string> GetEngines(); cpp::result GetEngineById(int id); diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index a899eb2cf..82a162a9f 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -144,8 +144,8 @@ cpp::result InferenceService::HandleEmbedding( std::get(engine_result.value()) ->HandleEmbedding(json_body, std::move(cb)); } else if (std::holds_alternative(engine_result.value())) { - return cpp::fail(GetUnsupportedResponse( - "Python engine does not support Embedding")); + return cpp::fail( + GetUnsupportedResponse("Python engine does not support Embedding")); } else { std::get(engine_result.value()) ->HandleEmbedding(json_body, std::move(cb)); @@ -211,7 +211,8 @@ cpp::result InferenceService::HandleRouteRequest( return {}; } -cpp::result InferenceService::GetPythonPort(const std::string& model) { +cpp::result InferenceService::GetPythonPort( + const std::string& model) { auto engine_result = engine_service_->GetLoadedEngine(kPythonEngine); if (engine_result.has_error()) { return cpp::fail("Python engine is not loaded yet"); @@ -249,14 +250,11 @@ InferResult InferenceService::LoadModel( r = res; }; if (std::holds_alternative(engine)) { - std::get(engine) - ->LoadModel(json_body, std::move(cb)); + std::get(engine)->LoadModel(json_body, std::move(cb)); } else if (std::holds_alternative(engine)) { - std::get(engine) - ->LoadModel(json_body, std::move(cb)); + std::get(engine)->LoadModel(json_body, std::move(cb)); } else { - std::get(engine) - ->LoadModel(json_body, std::move(cb)); + std::get(engine)->LoadModel(json_body, std::move(cb)); } // Save model config to reload if needed auto model_id = json_body->get("model", "").asString(); @@ -289,14 +287,14 @@ InferResult InferenceService::UnloadModel(const std::string& engine_name, }; auto engine = engine_result.value(); if (std::holds_alternative(engine)) { - std::get(engine) - ->UnloadModel(std::make_shared(json_body), std::move(cb)); + std::get(engine)->UnloadModel( + std::make_shared(json_body), std::move(cb)); } else if (std::holds_alternative(engine)) { - std::get(engine) - ->UnloadModel(std::make_shared(json_body), std::move(cb)); + std::get(engine)->UnloadModel( + std::make_shared(json_body), std::move(cb)); } else { - std::get(engine) - ->UnloadModel(std::make_shared(json_body), std::move(cb)); + std::get(engine)->UnloadModel( + std::make_shared(json_body), std::move(cb)); } return std::make_pair(stt, r); @@ -331,14 +329,11 @@ InferResult InferenceService::GetModelStatus( }; auto engine = engine_result.value(); if (std::holds_alternative(engine)) { - std::get(engine) - ->GetModelStatus(json_body, std::move(cb)); + std::get(engine)->GetModelStatus(json_body, std::move(cb)); } else if (std::holds_alternative(engine)) { - std::get(engine) - ->GetModelStatus(json_body, std::move(cb)); + std::get(engine)->GetModelStatus(json_body, std::move(cb)); } else { - std::get(engine) - ->GetModelStatus(json_body, std::move(cb)); + std::get(engine)->GetModelStatus(json_body, std::move(cb)); } return std::make_pair(stt, r); diff --git a/engine/services/model_service.cc 
b/engine/services/model_service.cc index 2f4317079..93a48e72c 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -761,11 +761,13 @@ cpp::result ModelService::StartModel( // Check if Python model first if (mc.engine == kPythonEngine) { - const std::string model_yaml_path = model_entry.value().path_to_model_yaml; + const std::string model_yaml_path = + model_entry.value().path_to_model_yaml; json_data["model"] = model_handle; json_data["model_dir"] = fmu::ToAbsoluteCortexDataPath( - fs::path(model_yaml_path).parent_path()).string(); + fs::path(model_yaml_path).parent_path()) + .string(); json_data["engine"] = mc.engine; assert(!!inference_svc_); diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index fa48cdd7d..c0bd29458 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -1,11 +1,11 @@ #include "utils/process/utils.h" -#include "utils/logging_utils.h" #include +#include "utils/logging_utils.h" #if defined(_WIN32) #include #elif defined(__APPLE__) || defined(__linux__) -extern char **environ; // environment variables +extern char** environ; // environment variables #include #include #endif @@ -88,7 +88,7 @@ pid_t SpawnProcess(const std::vector& command, // redirect stdout and stderr // caller should make sure the redirect files exist. - posix_spawn_file_actions_t *action_ptr = NULL; + posix_spawn_file_actions_t* action_ptr = NULL; if (!stdout_file.empty() || !stderr_file.empty()) { posix_spawn_file_actions_t action; diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index d59e50103..f4220e4de 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -12,8 +12,8 @@ using pid_t = DWORD; #include #endif -#include #include +#include namespace cortex::process { std::string ConstructWindowsCommandLine(const std::vector& args); @@ -26,4 +26,4 @@ pid_t SpawnProcess(const std::vector& command, bool IsProcessAlive(pid_t pid); bool KillProcess(pid_t pid); -} +} // namespace cortex::process From 17688264e25f9ef5ef714018a482bf02ff9898d8 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 26 Feb 2025 09:53:09 +0800 Subject: [PATCH 32/73] run uv sync after model download --- .../extensions/python-engine/python_engine.cc | 63 ++++++++++++++++++- .../extensions/python-engine/python_engine.h | 6 ++ engine/services/model_service.cc | 10 +++ engine/utils/process/utils.cc | 11 +++- engine/utils/process/utils.h | 3 +- 5 files changed, 89 insertions(+), 4 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 635e35a78..a4cf78ba8 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -59,6 +59,22 @@ cpp::result DownloadUv( archive_utils::ExtractArchive(download_path, py_bin_path.string(), true); set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); std::filesystem::remove(download_path); + + // install Python3.10 from Astral. this will be preferred over system + // Python when possible. + // NOTE: currently this will install to a user-wide directory. we can + // install to a specific location using `--install-dir`, but later + // invocation of `uv run` needs to have `UV_PYTHON_INSTALL_DIR` set to use + // this Python installation. + // we can add this once we allow passing custom env var to SpawnProcess(). 
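The BuildUvCommand helper introduced in this commit keeps uv's cache inside the cortexcpp data directory, so the Python install spawned below is effectively the following command line (paths shortened; <data> is the cortexcpp data folder):

    // <data>/python_engine/bin/uv --cache-dir <data>/python_engine/cache/uv python install 3.10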
+ // https://docs.astral.sh/uv/reference/cli/#uv-python-install + std::vector command = BuildUvCommand("python"); + command.push_back("install"); + command.push_back("3.10"); + + const pid_t pid = cortex::process::SpawnProcess(command, "", "", true); + if (pid == -1) + return cpp::fail("Fail to spawn process"); }; auto downloadTask = DownloadTask{.id = "uv", @@ -82,10 +98,53 @@ std::string GetUvPath() { file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; return path.string(); } + +// use our own cache dir so that when users delete cortexcpp/, everything is deleted. +std::string GetUvCacheDir() { + const auto path = file_manager_utils::GetCortexDataPath() / "python_engine" / + "cache" / "uv"; + return path.string(); +} + +std::vector BuildUvCommand(const std::string& action, + const std::string& directory) { + std::vector command = {GetUvPath(), "--cache-dir", + GetUvCacheDir()}; + if (!directory.empty()) { + command.push_back("--directory"); + command.push_back(directory); + } + command.push_back(action); + return command; +} + bool IsUvInstalled() { return std::filesystem::exists(GetUvPath()); } +cpp::result UvDownloadDeps( + const std::filesystem::path& model_dir) { + if (!IsUvInstalled()) + return cpp::fail( + "uv is not installed. Please run `cortex engines install python`."); + + std::vector command = BuildUvCommand("sync", model_dir.string()); + + // script mode. 1st argument is path to .py script + if (!std::filesystem::exists(model_dir / "pyproject.toml")) { + config::PythonModelConfig py_cfg; + py_cfg.ReadFromYaml((model_dir / "model.yml").string()); + command.push_back("--script"); + command.push_back(py_cfg.entrypoint[0]); + } + + const pid_t pid = cortex::process::SpawnProcess(command, "", "", true); + if (pid == -1) + return cpp::fail("Fail to install dependencies"); + + return {}; +} + bool PythonEngine::PythonSubprocess::IsAlive() { return cortex::process::IsProcessAlive(pid); } @@ -162,8 +221,8 @@ void PythonEngine::LoadModel( } // https://docs.astral.sh/uv/reference/cli/#uv-run - std::vector command{GetUvPath(), "run", "--directory", - model_dir.string()}; + std::vector command = + BuildUvCommand("run", model_dir.string()); for (const auto& item : py_cfg.entrypoint) command.push_back(item); diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 904c9aa63..02743ac22 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -15,7 +16,12 @@ namespace python_engine { cpp::result DownloadUv( std::shared_ptr& download_service); std::string GetUvPath(); +std::string GetUvCacheDir(); +std::vector BuildUvCommand(const std::string& action, + const std::string& directory = ""); bool IsUvInstalled(); +cpp::result UvDownloadDeps( + const std::filesystem::path& yaml_path); class PythonEngine : public PythonEngineI { private: diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 93a48e72c..a6c6037d8 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -14,6 +14,7 @@ #include "services/inference_service.h" +#include "extensions/python-engine/python_engine.h" #include "utils/cli_selection_utils.h" #include "utils/engine_constants.h" #include "utils/file_manager_utils.h" @@ -507,6 +508,7 @@ ModelService::DownloadModelFromCortexsoAsync( yaml_handler.ModelConfigFromFile(model_yml_item->localPath.string()); auto 
mc = yaml_handler.GetModelConfig(); + // post-download hooks for different engines if (mc.engine == kLlamaEngine) { mc.model = unique_model_id; @@ -517,6 +519,14 @@ ModelService::DownloadModelFromCortexsoAsync( mc.size = model_size; yaml_handler.UpdateModelConfig(mc); yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); + + } else if (mc.engine == kPythonEngine) { + const auto model_dir = model_yml_item->localPath.parent_path(); + auto result = python_engine::UvDownloadDeps(model_dir); + if (result.has_error()) { + CTL_ERR(result.error()); + return; + } } auto rel = diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index c0bd29458..efc018ad5 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -41,7 +41,8 @@ std::vector ConvertToArgv(const std::vector& args) { pid_t SpawnProcess(const std::vector& command, const std::string stdout_file, - const std::string stderr_file) { + const std::string stderr_file, + bool wait) { try { #if defined(_WIN32) // Windows process creation @@ -73,6 +74,10 @@ pid_t SpawnProcess(const std::vector& command, // Store the process ID pid_t pid = pi.dwProcessId; + // wait for process to terminate + if (wait) + WaitForSingleObject(pi.hProcess, INFINITE); + // Close handles to avoid resource leaks CloseHandle(pi.hProcess); CloseHandle(pi.hThread); @@ -128,6 +133,10 @@ pid_t SpawnProcess(const std::vector& command, throw std::runtime_error("Failed to spawn process"); } + // wait for process to terminate + if (wait) + waitpid(pid, NULL, 0); + return pid; #else diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index f4220e4de..f8719aa68 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -22,7 +22,8 @@ std::vector ConvertToArgv(const std::vector& args); pid_t SpawnProcess(const std::vector& command, const std::string stdout_file = "", - const std::string stderr_file = ""); + const std::string stderr_file = "", + bool wait = false); bool IsProcessAlive(pid_t pid); bool KillProcess(pid_t pid); From 7627eac95612572cce15fd7ffe54d1a0de144c0f Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 26 Feb 2025 11:15:19 +0800 Subject: [PATCH 33/73] download CUDA for python engine --- engine/cli/commands/engine_install_cmd.cc | 17 +++++++++++------ .../extensions/python-engine/python_engine.cc | 4 ++-- engine/services/engine_service.cc | 4 ++-- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/engine/cli/commands/engine_install_cmd.cc b/engine/cli/commands/engine_install_cmd.cc index 85a5def5d..d3fdf8b9b 100644 --- a/engine/cli/commands/engine_install_cmd.cc +++ b/engine/cli/commands/engine_install_cmd.cc @@ -7,6 +7,13 @@ #include "utils/string_utils.h" namespace commands { + +// NOTE: should have a single source of truth between CLI and server +static bool NeedCudaDownload(const std::string& engine) { + return !system_info_utils::GetDriverAndCudaVersion().second.empty() && + engine != kPythonEngine; +} + bool EngineInstallCmd::Exec(const std::string& engine, const std::string& version, const std::string& src) { @@ -35,10 +42,9 @@ bool EngineInstallCmd::Exec(const std::string& engine, if (show_menu_) { DownloadProgress dp; dp.Connect(host_, port_); + bool need_cuda_download = NeedCudaDownload(engine); // engine can be small, so need to start ws first - auto dp_res = std::async(std::launch::deferred, [&dp] { - bool need_cuda_download = - !system_info_utils::GetDriverAndCudaVersion().second.empty(); + auto dp_res = std::async(std::launch::deferred, [&dp, 
need_cuda_download] { if (need_cuda_download) { return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit}); } else { @@ -148,10 +154,9 @@ bool EngineInstallCmd::Exec(const std::string& engine, // default DownloadProgress dp; dp.Connect(host_, port_); + bool need_cuda_download = NeedCudaDownload(engine); // engine can be small, so need to start ws first - auto dp_res = std::async(std::launch::deferred, [&dp] { - bool need_cuda_download = - !system_info_utils::GetDriverAndCudaVersion().second.empty(); + auto dp_res = std::async(std::launch::deferred, [&dp, need_cuda_download] { if (need_cuda_download) { return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit}); } else { diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index a4cf78ba8..201211abf 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -77,10 +77,10 @@ cpp::result DownloadUv( return cpp::fail("Fail to spawn process"); }; - auto downloadTask = DownloadTask{.id = "uv", + auto downloadTask = DownloadTask{.id = "python-uv", .type = DownloadType::Engine, .items = {DownloadItem{ - .id = "uv", + .id = "python-uv", .downloadUrl = url, .localPath = py_bin_path / fname, }}}; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index bdb5ffebb..ac0c9eae9 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -377,8 +377,8 @@ cpp::result EngineService::DownloadLlamaCpp( cpp::result EngineService::DownloadCuda( const std::string& engine, bool async) { - if (hw_inf_.sys_inf->os == "mac") { - // mac does not require cuda toolkit + if (hw_inf_.sys_inf->os == "mac" || engine == kPythonEngine) { + // mac and Python engine do not require cuda toolkit return true; } From 06503c00b1ba9162021e334486e1ecb89173c0bc Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 26 Feb 2025 13:39:54 +0800 Subject: [PATCH 34/73] add .exe for windows --- engine/extensions/python-engine/python_engine.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 201211abf..1a7af51a3 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -93,9 +93,10 @@ cpp::result DownloadUv( } std::string GetUvPath() { - // NOTE: do I need to add .exe for windows? + auto system_info = system_info_utils::GetSystemInfo(); + const auto bin_name = system_info->os == kWindowsOs ? "uv.exe" : "uv"; const auto path = - file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; + file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / bin_name; return path.string(); } From 176f8784238b5debde69dd3fa1e69520a14cb0f8 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 26 Feb 2025 16:49:01 +0800 Subject: [PATCH 35/73] destroy file action in posix --- engine/utils/process/utils.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index efc018ad5..fe336a476 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -126,8 +126,9 @@ pid_t SpawnProcess(const std::vector& command, environ // environment (inherit) ); - // NOTE: only destroy this when process ends? 
- // posix_spawn_file_actions_destroy(action_pr); + // NOTE: it seems like it's ok to destroy this immediately before + // subprocess terminates. + posix_spawn_file_actions_destroy(action_ptr); if (spawn_result != 0) { throw std::runtime_error("Failed to spawn process"); From f7bddc22da231c9c6b3de51791f154978167c302 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 27 Feb 2025 13:27:10 +0800 Subject: [PATCH 36/73] revert name change to avoid conflict --- engine/utils/engine_constants.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/utils/engine_constants.h b/engine/utils/engine_constants.h index 10d19b160..8eeaa1946 100644 --- a/engine/utils/engine_constants.h +++ b/engine/utils/engine_constants.h @@ -1,7 +1,7 @@ #pragma once constexpr const auto kLlamaEngine = "llama-cpp"; -constexpr const auto kPythonEngine = "python"; +constexpr const auto kPythonEngine = "python-engine"; constexpr const auto kOpenAiEngine = "openai"; constexpr const auto kAnthropicEngine = "anthropic"; From 728e7eb02062920da4a49b07255ba2a34468a7e8 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 27 Feb 2025 15:12:40 +0800 Subject: [PATCH 37/73] check for NULL before destroy file action --- engine/utils/process/utils.cc | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index fe336a476..0ca5f8bfb 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -40,8 +40,7 @@ std::vector ConvertToArgv(const std::vector& args) { } pid_t SpawnProcess(const std::vector& command, - const std::string stdout_file, - const std::string stderr_file, + const std::string stdout_file, const std::string stderr_file, bool wait) { try { #if defined(_WIN32) @@ -102,17 +101,25 @@ pid_t SpawnProcess(const std::vector& command, if (!stdout_file.empty()) { if (std::filesystem::exists(stdout_file)) { - posix_spawn_file_actions_addopen(&action, STDOUT_FILENO, - stdout_file.data(), - O_WRONLY | O_APPEND, 0); + int rc = posix_spawn_file_actions_addopen(&action, STDOUT_FILENO, + stdout_file.data(), + O_WRONLY | O_APPEND, 0); + if (rc != 0) { + posix_spawn_file_actions_destroy(action_ptr); + throw std::runtime_error("Unable to add stdout to file action"); + } } } if (!stderr_file.empty()) { if (std::filesystem::exists(stderr_file)) { - posix_spawn_file_actions_addopen(&action, STDERR_FILENO, - stderr_file.data(), - O_WRONLY | O_APPEND, 0); + int rc = posix_spawn_file_actions_addopen(&action, STDERR_FILENO, + stderr_file.data(), + O_WRONLY | O_APPEND, 0); + if (rc != 0) { + posix_spawn_file_actions_destroy(action_ptr); + throw std::runtime_error("Unable to add stderr to file action"); + } } } } @@ -128,7 +135,9 @@ pid_t SpawnProcess(const std::vector& command, // NOTE: it seems like it's ok to destroy this immediately before // subprocess terminates. 
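 // (posix_spawn() applies the file actions while it creates the child, so the
 // parent can safely destroy the actions struct as soon as the call returns.)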
- posix_spawn_file_actions_destroy(action_ptr); + if (action_ptr != NULL) { + posix_spawn_file_actions_destroy(action_ptr); + } if (spawn_result != 0) { throw std::runtime_error("Failed to spawn process"); From 560b9fe025e6513c4a3134946a272e103c7fbd20 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 27 Feb 2025 16:51:04 +0800 Subject: [PATCH 38/73] fix windows --- engine/cli/commands/server_start_cmd.cc | 6 +- .../extensions/python-engine/python_engine.cc | 53 +++-- .../extensions/python-engine/python_engine.h | 2 +- engine/services/hardware_service.cc | 6 +- engine/utils/process/utils.cc | 190 ++++++++++++++---- engine/utils/process/utils.h | 26 ++- 6 files changed, 213 insertions(+), 70 deletions(-) diff --git a/engine/cli/commands/server_start_cmd.cc b/engine/cli/commands/server_start_cmd.cc index c2ef779f1..a4bcb1eb5 100644 --- a/engine/cli/commands/server_start_cmd.cc +++ b/engine/cli/commands/server_start_cmd.cc @@ -119,10 +119,10 @@ bool ServerStartCmd::Exec(const std::string& host, int port, commands.push_back(get_data_folder_path()); commands.push_back("--loglevel"); commands.push_back(log_level_); - auto pid = cortex::process::SpawnProcess(commands); - if (pid < 0) { + auto result = cortex::process::SpawnProcess(commands); + if (result.has_error()) { // Fork failed - std::cerr << "Could not start server: " << std::endl; + std::cerr << "Could not start server: " << result.error() << std::endl; return false; } else { // Parent process diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 1a7af51a3..7a40545a7 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -72,9 +72,17 @@ cpp::result DownloadUv( command.push_back("install"); command.push_back("3.10"); - const pid_t pid = cortex::process::SpawnProcess(command, "", "", true); - if (pid == -1) - return cpp::fail("Fail to spawn process"); + // NOTE: errors in download callback won't be propagated to caller + auto result = cortex::process::SpawnProcess(command); + if (result.has_error()) { + CTL_ERR(result.error()); + return; + } + + if (!cortex::process::WaitProcess(result.value())) { + CTL_ERR("Process spawned but fail to wait"); + return; + } }; auto downloadTask = DownloadTask{.id = "python-uv", @@ -95,8 +103,8 @@ cpp::result DownloadUv( std::string GetUvPath() { auto system_info = system_info_utils::GetSystemInfo(); const auto bin_name = system_info->os == kWindowsOs ? "uv.exe" : "uv"; - const auto path = - file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / bin_name; + const auto path = file_manager_utils::GetCortexDataPath() / "python_engine" / + "bin" / bin_name; return path.string(); } @@ -139,18 +147,22 @@ cpp::result UvDownloadDeps( command.push_back(py_cfg.entrypoint[0]); } - const pid_t pid = cortex::process::SpawnProcess(command, "", "", true); - if (pid == -1) - return cpp::fail("Fail to install dependencies"); + auto result = cortex::process::SpawnProcess(command); + if (result.has_error()) + return cpp::fail("Fail to install Python dependencies. 
" + result.error()); + + if (!cortex::process::WaitProcess(result.value())) { + return cpp::fail("Fail to install Python dependencies."); + } return {}; } bool PythonEngine::PythonSubprocess::IsAlive() { - return cortex::process::IsProcessAlive(pid); + return cortex::process::IsProcessAlive(proc_info); } bool PythonEngine::PythonSubprocess::Kill() { - return cortex::process::KillProcess(pid); + return cortex::process::KillProcess(proc_info); } PythonEngine::PythonEngine() {} @@ -238,15 +250,22 @@ void PythonEngine::LoadModel( // NOTE: process may start, but exits/crashes later // TODO: wait for a few seconds, then check if process is alive - pid = cortex::process::SpawnProcess(command, stdout_path, stderr_path); - if (pid == -1) { - throw std::runtime_error("Fail to spawn process with pid -1"); + auto result = + cortex::process::SpawnProcess(command, stdout_path, stderr_path); + if (result.has_error()) { + throw std::runtime_error(result.error()); } - const uint64_t start_time = - std::chrono::system_clock::now().time_since_epoch() / - std::chrono::milliseconds(1); + + PythonSubprocess py_proc; + py_proc.proc_info = result.value(); + py_proc.port = py_cfg.port; + py_proc.start_time = std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + + pid = py_proc.proc_info.pid; + std::unique_lock write_lock(mutex); - model_process_map[model] = {pid, py_cfg.port, start_time}; + model_process_map[model] = py_proc; } catch (const std::exception& e) { auto e_msg = e.what(); diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 02743ac22..ec7e38d72 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -26,7 +26,7 @@ cpp::result UvDownloadDeps( class PythonEngine : public PythonEngineI { private: struct PythonSubprocess { - pid_t pid; + cortex::process::ProcessInfo proc_info; int port; uint64_t start_time; diff --git a/engine/services/hardware_service.cc b/engine/services/hardware_service.cc index 972647b51..e6bcc89ef 100644 --- a/engine/services/hardware_service.cc +++ b/engine/services/hardware_service.cc @@ -197,10 +197,10 @@ bool HardwareService::Restart(const std::string& host, int port) { commands.push_back(get_data_folder_path()); commands.push_back("--loglevel"); commands.push_back(luh::LogLevelStr(luh::global_log_level)); - auto pid = cortex::process::SpawnProcess(commands); - if (pid < 0) { + auto result = cortex::process::SpawnProcess(commands); + if (result.has_error()) { // Fork failed - std::cerr << "Could not start server: " << std::endl; + std::cerr << "Could not start server: " << result.error() << std::endl; return false; } else { // Parent process diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 0ca5f8bfb..1cc97e2c2 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -39,9 +39,9 @@ std::vector ConvertToArgv(const std::vector& args) { return argv; } -pid_t SpawnProcess(const std::vector& command, - const std::string stdout_file, const std::string stderr_file, - bool wait) { +cpp::result SpawnProcess( + const std::vector& command, const std::string& stdout_file, + const std::string& stderr_file) { try { #if defined(_WIN32) // Windows process creation @@ -49,6 +49,48 @@ pid_t SpawnProcess(const std::vector& command, PROCESS_INFORMATION pi = {0}; si.cb = sizeof(si); + HANDLE hJob = NULL, hStdOut = NULL, hStdErr = NULL; + + // redirect stdout and stderr + if 
(!stdout_file.empty() || !stderr_file.empty()) { + si.dwFlags |= STARTF_USESTDHANDLES; + + // when STARTF_USESTDHANDLES is set, we have to explicitly inherit + // parent's handles, otherwise subprocess may successfuly spawn but + // exit immediately. + si.hStdOutput = GetStdHandle(STD_OUTPUT_HANDLE); + si.hStdError = GetStdHandle(STD_ERROR_HANDLE); + si.hStdInput = GetStdHandle(STD_INPUT_HANDLE); + + SECURITY_ATTRIBUTES sa; + sa.nLength = sizeof(sa); + sa.lpSecurityDescriptor = NULL; + sa.bInheritHandle = TRUE; + + if (!stdout_file.empty()) { + hStdOut = CreateFileA(stdout_file.c_str(), FILE_APPEND_DATA, + FILE_SHARE_READ | FILE_SHARE_WRITE, &sa, + OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); + if (hStdOut == INVALID_HANDLE_VALUE) + throw std::runtime_error("Unable to create " + stdout_file + + " to redirect stdout"); + + si.hStdOutput = hStdOut; + } + if (!stderr_file.empty()) { + hStdErr = CreateFileA(stderr_file.c_str(), FILE_APPEND_DATA, + FILE_SHARE_WRITE | FILE_SHARE_READ, &sa, + OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); + if (hStdErr == INVALID_HANDLE_VALUE) { + if (hStdOut != NULL) + CloseHandle(hStdout) throw std::runtime_error( + "Unable to create " + stderr_file + " to redirect stderr"); + } + + si.hStdError = hStdErr; + } + } + // Construct command line std::string cmd_line = ConstructWindowsCommandLine(command); @@ -56,32 +98,60 @@ pid_t SpawnProcess(const std::vector& command, char command_buffer[4096]; strncpy_s(command_buffer, cmd_line.c_str(), sizeof(command_buffer)); - if (!CreateProcessA(NULL, // lpApplicationName - command_buffer, // lpCommandLine - NULL, // lpProcessAttributes - NULL, // lpThreadAttributes - FALSE, // bInheritHandles - 0, // dwCreationFlags - NULL, // lpEnvironment - NULL, // lpCurrentDirectory - &si, // lpStartupInfo - &pi // lpProcessInformation + // create a suspended process. we will resume it later after adding it to + // a job (see below) + if (!CreateProcessA(NULL, // lpApplicationName + command_buffer, // lpCommandLine + NULL, // lpProcessAttributes + NULL, // lpThreadAttributes + FALSE, // bInheritHandles + CREATE_SUSPENDED, // dwCreationFlags + NULL, // lpEnvironment + NULL, // lpCurrentDirectory + &si, // lpStartupInfo + &pi // lpProcessInformation )) { + if (hStdOut != NULL) + CloseHandle(hStdOut); + if (hStdErr != NULL) + CloseHandle(hStdErr); throw std::runtime_error("Failed to create process on Windows"); } - // Store the process ID - pid_t pid = pi.dwProcessId; + // https://devblogs.microsoft.com/oldnewthing/20131209-00/?p=2433 + // resume thread after job object assignment to make sure child processes + // will be spawned in the same job object. 
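+ // keep the job handle in ProcessInfo.hJob so that KillProcess() can later
+ // terminate the whole process tree via TerminateJobObject().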
+ HANDLE hJob = CreateJobObjectA(NULL, NULL); + std::string err_msg; + bool success = false; + if (!AssignProcessToJobObject(hJob, pi.hProcess)) { + err_msg = "Unable to assign process to job object"; + } else if (ResumeThread(pi.hThread) == (DWORD)(-1)) { + err_msg = "Unable to resume thread"; + } else { + success = true; + } - // wait for process to terminate - if (wait) - WaitForSingleObject(pi.hProcess, INFINITE); + // clean up if not successful + if (!success) { + TerminateProcess(pi.hProcess, 0); + CloseHandle(pi.hProcess); + CloseHandle(pi.hThread); + CloseHandle(hJob); + throw std::runtime_error(err_msg); + } // Close handles to avoid resource leaks CloseHandle(pi.hProcess); CloseHandle(pi.hThread); - return pid; + ProcessInfo proc_info; + proc_info.pid = pi.dwProcessId; + proc_info.hJob = hJob; + proc_info.hStdOut = hStdOut; + proc_info.hStdErr = hStdErr; + + return proc_info; #elif defined(__APPLE__) || defined(__linux__) // POSIX process creation @@ -143,22 +213,25 @@ pid_t SpawnProcess(const std::vector& command, throw std::runtime_error("Failed to spawn process"); } - // wait for process to terminate - if (wait) - waitpid(pid, NULL, 0); + ProcessInfo proc_info; + proc_info.pid = pid; - return pid; + return proc_info; #else #error Unsupported platform #endif } catch (const std::exception& e) { LOG_ERROR << "Process spawning error: " << e.what(); - return -1; + return cpp::fail(e.what()); } } -bool IsProcessAlive(pid_t pid) { +bool IsProcessAlive(const ProcessInfo& proc_info) { + if (proc_info.pid == PID_TERMINATED) { + return false; + } + #ifdef _WIN32 // Windows implementation HANDLE snapshot = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0); @@ -171,7 +244,7 @@ bool IsProcessAlive(pid_t pid) { if (Process32First(snapshot, &processEntry)) { do { - if (processEntry.th32ProcessID == pid) { + if (processEntry.th32ProcessID == proc_info.pid) { CloseHandle(snapshot); return true; } @@ -183,13 +256,10 @@ bool IsProcessAlive(pid_t pid) { #elif defined(__APPLE__) || defined(__linux__) // Unix-like systems (Linux and macOS) implementation - if (pid <= 0) { - return false; - } // Try to send signal 0 to the process // This doesn't actually send a signal but checks if we can send signals to the process - int result = kill(pid, 0); + int result = kill(proc_info.pid, 0); if (result == 0) { return true; // Process exists and we have permission to send it signals @@ -201,20 +271,60 @@ bool IsProcessAlive(pid_t pid) { #endif } -bool KillProcess(pid_t pid) { +bool WaitProcess(ProcessInfo& proc_info) { + if (proc_info.pid == PID_TERMINATED) + return true; + #if defined(_WIN32) - HANDLE hProcess = OpenProcess(PROCESS_TERMINATE, FALSE, pid); - if (hProcess == NULL) { - LOG_ERROR << "Failed to open process"; - return false; + HANDLE hProcess = OpenProcess(PROCESS_TERMINATE, FALSE, proc_info.pid); + bool success = WaitForSingleObject(hProcess, INFINITE) == WAIT_OBJECT_0; + CloseHandle(hProcess); + if (success) { + proc_info.pid = PID_TERMINATED; + CloseHandle(proc_info.hJob); + proc_info.hJob = NULL; + } + return success; +#elif defined(__APPLE__) || defined(__linux__) + bool success = waitpid(proc_info.pid, NULL, 0) == proc_info.pid; + if (success) { + proc_info.pid = PID_TERMINATED; } + return success; +#else +#error "Unsupported platform" +#endif +} - bool is_success = TerminateProcess(hProcess, 0) == TRUE; - CloseHandle(hProcess); - return is_success; +bool KillProcess(ProcessInfo& proc_info) { + if (proc_info.pid == PID_TERMINATED) + return true; + +#if defined(_WIN32) + bool success = 
TerminateJobObject(proc_info.hJob, 0) == 0; + // clean up resources + if (success) { + proc_info.pid = PID_TERMINATED; + CloseHandle(proc_info.hJob); + proc_info.hJob = NULL; + if (proc_info.hStdOut != NULL) { + CloseHandle(proc_info.hStdOut); + proc_info.hStdOut = NULL; + } + if (proc_info.hStdErr != NULL) { + CloseHandle(proc_info.hStdErr); + proc_info.hStdErr = NULL; + } + } + return success; #elif defined(__APPLE__) || defined(__linux__) - // NOTE: should we use SIGKILL here to be consistent with Windows? - return kill(pid, SIGTERM) == 0; + // we send SIGTERM to subprocess. we trust that this subprocess will + // propagate SIGTERM correctly to its children processes. + bool success = kill(proc_info.pid, SIGTERM) == 0; + if (success) { + proc_info.pid = PID_TERMINATED; + } + return success; #else #error "Unsupported platform" #endif diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index f8719aa68..fcc93be90 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -14,17 +14,31 @@ using pid_t = DWORD; #include #include +#include "utils/result.hpp" namespace cortex::process { + +// set pid to this value to signal that this pid should not be used. +constexpr pid_t PID_TERMINATED = 0; + +struct ProcessInfo { + pid_t pid; +#ifdef _WIN32 + // hJob is used to terminate process and its children. + // hStdOut and hStdErr must be manually closed upon process termination. + HANDLE hJob, hStdOut, hStdErr; +#endif +}; + std::string ConstructWindowsCommandLine(const std::vector& args); std::vector ConvertToArgv(const std::vector& args); -pid_t SpawnProcess(const std::vector& command, - const std::string stdout_file = "", - const std::string stderr_file = "", - bool wait = false); -bool IsProcessAlive(pid_t pid); -bool KillProcess(pid_t pid); +cpp::result SpawnProcess( + const std::vector& command, + const std::string& stdout_file = "", const std::string& stderr_file = ""); +bool IsProcessAlive(const ProcessInfo& proc_info); +bool WaitProcess(ProcessInfo& proc_info); +bool KillProcess(ProcessInfo& proc_info); } // namespace cortex::process From f481c2fc79cc7fbe931aa5ef8e6e5864fdfbad2e Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 27 Feb 2025 18:00:53 +0800 Subject: [PATCH 39/73] fix windows --- engine/utils/process/utils.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 1cc97e2c2..3b3538b50 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -83,8 +83,10 @@ cpp::result SpawnProcess( OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); if (hStdErr == INVALID_HANDLE_VALUE) { if (hStdOut != NULL) - CloseHandle(hStdout) throw std::runtime_error( - "Unable to create " + stderr_file + " to redirect stderr"); + CloseHandle(hStdOut); + + throw std::runtime_error("Unable to create " + stderr_file + + " to redirect stderr"); } si.hStdError = hStdErr; @@ -121,7 +123,7 @@ cpp::result SpawnProcess( // https://devblogs.microsoft.com/oldnewthing/20131209-00/?p=2433 // resume thread after job object assignment to make sure child processes // will be spawned in the same job object. 
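 // hJob was already declared (as NULL) near the top of SpawnProcess(), so assign
 // to that variable here rather than shadowing it with a new local handle.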
- HANDLE hJob = CreateJobObjectA(NULL, NULL); + hJob = CreateJobObjectA(NULL, NULL); std::string err_msg; bool success = false; if (!AssignProcessToJobObject(hJob, pi.hProcess)) { @@ -138,6 +140,10 @@ cpp::result SpawnProcess( CloseHandle(pi.hProcess); CloseHandle(pi.hThread); CloseHandle(hJob); + if (hStdOut != NULL) + CloseHandle(hStdOut); + if (hStdErr != NULL) + CloseHandle(hStdErr); throw std::runtime_error(err_msg); } From 48e2015b5867c9cc5bc9d4c0d1a229eb693f8998 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 27 Feb 2025 18:57:01 +0800 Subject: [PATCH 40/73] fix windows subprocess --- engine/utils/process/utils.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 3b3538b50..1bb27d4c6 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -106,7 +106,7 @@ cpp::result SpawnProcess( command_buffer, // lpCommandLine NULL, // lpProcessAttributes NULL, // lpThreadAttributes - FALSE, // bInheritHandles + TRUE, // bInheritHandles CREATE_SUSPENDED, // dwCreationFlags NULL, // lpEnvironment NULL, // lpCurrentDirectory @@ -282,7 +282,7 @@ bool WaitProcess(ProcessInfo& proc_info) { return true; #if defined(_WIN32) - HANDLE hProcess = OpenProcess(PROCESS_TERMINATE, FALSE, proc_info.pid); + HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, proc_info.pid); bool success = WaitForSingleObject(hProcess, INFINITE) == WAIT_OBJECT_0; CloseHandle(hProcess); if (success) { From f02fc93e17a1edd5b6248e3e6280d5d537fd1674 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 27 Feb 2025 22:12:59 +0800 Subject: [PATCH 41/73] update test --- engine/e2e-test/api/engines/test_api_engine.py | 4 ++-- .../e2e-test/api/engines/test_api_engine_install_nightly.py | 2 +- engine/e2e-test/cli/engines/test_cli_engine_install.py | 4 ++-- .../e2e-test/cli/engines/test_cli_engine_install_nightly.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/engine/e2e-test/api/engines/test_api_engine.py b/engine/e2e-test/api/engines/test_api_engine.py index 61c0f72e5..f563be84d 100644 --- a/engine/e2e-test/api/engines/test_api_engine.py +++ b/engine/e2e-test/api/engines/test_api_engine.py @@ -54,12 +54,12 @@ async def test_engines_install_uninstall_llamacpp_should_be_successful(self): @pytest.mark.asyncio async def test_engines_install_uninstall_python_should_be_successful(self): - response = requests.post("http://localhost:3928/v1/engines/python/install") + response = requests.post("http://localhost:3928/v1/engines/python-engine/install") assert response.status_code == 200 await wait_for_websocket_download_success_event(timeout=None) time.sleep(30) - response = requests.delete("http://localhost:3928/v1/engines/python/install") + response = requests.delete("http://localhost:3928/v1/engines/python-engine/install") assert response.status_code == 200 @pytest.mark.asyncio diff --git a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py index 22aa669ee..ca7aa0870 100644 --- a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py +++ b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py @@ -23,7 +23,7 @@ def test_engines_install_llamacpp_should_be_successful(self): assert response.status_code == 200 def test_engines_install_python_should_be_successful(self): - response = requests.post("http://localhost:3928/v1/engines/python/install") + response = 
requests.post("http://localhost:3928/v1/engines/python-engine/install") assert response.status_code == 200 def test_engines_install_llamacpp_specific_version_and_variant(self): diff --git a/engine/e2e-test/cli/engines/test_cli_engine_install.py b/engine/e2e-test/cli/engines/test_cli_engine_install.py index ed2359248..ca298c828 100644 --- a/engine/e2e-test/cli/engines/test_cli_engine_install.py +++ b/engine/e2e-test/cli/engines/test_cli_engine_install.py @@ -34,11 +34,11 @@ def test_engines_install_llamacpp_should_be_successfully(self): def test_engines_install_python_should_be_successfully(self): exit_code, output, error = run( "Install Engine", - ["engines", "install", "python"], + ["engines", "install", "python-engine"], timeout=None, capture=False, ) - response = requests.get("http://127.0.0.1:3928/v1/engines/python") + response = requests.get("http://127.0.0.1:3928/v1/engines/python-engine") assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" diff --git a/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py b/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py index cd09c1542..b3fa6ee26 100644 --- a/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py +++ b/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py @@ -34,11 +34,11 @@ def test_engines_install_llamacpp_should_be_successfully(self): def test_engines_install_python_should_be_successfully(self): exit_code, output, error = run( "Install Engine", - ["engines", "install", "python"], + ["engines", "install", "python-engine"], timeout=None, capture=False, ) - response = requests.get("http://127.0.0.1:3928/v1/engines/python") + response = requests.get("http://127.0.0.1:3928/v1/engines/python-engine") assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" From 9918672fb4abc11fa4b36340275212f68627c995 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 28 Feb 2025 09:26:53 +0800 Subject: [PATCH 42/73] more robust checks and cleanup --- .../extensions/python-engine/python_engine.cc | 47 ++++---- engine/utils/process/utils.cc | 100 +++++++++++------- engine/utils/process/utils.h | 2 +- 3 files changed, 86 insertions(+), 63 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 7a40545a7..5565da1a1 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -213,14 +213,20 @@ void PythonEngine::LoadModel( const std::string model = (*json_body)["model"].asString(); const fs::path model_dir = (*json_body)["model_dir"].asString(); - // TODO: check if model is still alive { - std::shared_lock read_lock(mutex); + std::unique_lock write_lock(mutex); if (model_process_map.find(model) != model_process_map.end()) { - auto [status, error] = - CreateResponse("Model already loaded!", k409Conflict); - callback(std::move(status), std::move(error)); - return; + // check if model is still alive + if (model_process_map[model].IsAlive()) { + auto [status, error] = + CreateResponse("Model already loaded!", k409Conflict); + callback(std::move(status), std::move(error)); + return; + } else { + // if model has exited, try to load model again + CTL_WRN("Model " << model << " has exited unexpectedly"); + model_process_map.erase(model); + } } } @@ -248,8 +254,6 @@ void PythonEngine::LoadModel( if (!std::filesystem::exists(stderr_path)) std::ofstream(stderr_path).flush(); - // NOTE: process may 
start, but exits/crashes later - // TODO: wait for a few seconds, then check if process is alive auto result = cortex::process::SpawnProcess(command, stdout_path, stderr_path); if (result.has_error()) { @@ -308,13 +312,12 @@ void PythonEngine::UnloadModel( std::unique_lock write_lock(mutex); // check if subprocess is still alive + // NOTE: is this step necessary? the subprocess could have terminated + // after .IsAlive() and before .Kill() later. if (!model_process_map[model].IsAlive()) { + model_process_map.erase(model); const std::string msg = "Model " + model + " stopped running."; auto [status, error] = CreateResponse(msg, k400BadRequest); - - // NOTE: do we need to do any other cleanup for subprocesses? - model_process_map.erase(model); - callback(std::move(status), std::move(error)); return; } @@ -327,7 +330,6 @@ void PythonEngine::UnloadModel( return; } - // NOTE: do we need to do any other cleanup for subprocesses? model_process_map.erase(model); } @@ -366,12 +368,10 @@ void PythonEngine::GetModelStatus( // check if subprocess is still alive if (!model_process_map[model].IsAlive()) { + CTL_WRN("Model " << model << " has exited unexpectedly."); + model_process_map.erase(model); const std::string msg = "Model " + model + " stopped running."; auto [status, error] = CreateResponse(msg, k400BadRequest); - - // NOTE: do we need to do any other cleanup for subprocesses? - model_process_map.erase(model); - callback(std::move(status), std::move(error)); return; } @@ -390,9 +390,14 @@ void PythonEngine::GetModels( Json::Value res, model_list(Json::arrayValue), status; { - std::shared_lock read_lock(mutex); - for (const auto& [model_name, py_proc] : model_process_map) { - // TODO: check if py_proc is still alive + std::unique_lock write_lock(mutex); + for (auto& [model_name, py_proc] : model_process_map) { + if (!py_proc.IsAlive()) { + CTL_WRN("Model " << model_name << " has exited unexpectedly."); + model_process_map.erase(model_name); + continue; + } + Json::Value val; val["id"] = model_name; val["engine"] = kPythonEngine; @@ -433,7 +438,7 @@ cpp::result PythonEngine::GetPort(const std::string& model) { { std::unique_lock write_lock(mutex); if (!model_process_map[model].IsAlive()) { - // NOTE: do we need to do any other cleanup for subprocesses? 
+ CTL_WRN("Model " << model << " has exited unexpectedly."); model_process_map.erase(model); return cpp::fail("Model " + model + " stopped running."); } diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 1bb27d4c6..19b942d82 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -233,10 +233,30 @@ cpp::result SpawnProcess( } } -bool IsProcessAlive(const ProcessInfo& proc_info) { - if (proc_info.pid == PID_TERMINATED) { - return false; +static void SetProcessTerminated(ProcessInfo& proc_info) { + if (proc_info.pid == PID_TERMINATED) + return; + + proc_info.pid = PID_TERMINATED; + + // close handles on Windows +#if defined(_WIN32) + CloseHandle(proc_info.hJob); + proc_info.hJob = NULL; + if (proc_info.hStdOut != NULL) { + CloseHandle(proc_info.hStdOut); + proc_info.hStdOut = NULL; + } + if (proc_info.hStdErr != NULL) { + CloseHandle(proc_info.hStdErr); + proc_info.hStdErr = NULL; } +#endif +} + +bool IsProcessAlive(ProcessInfo& proc_info) { + if (proc_info.pid == PID_TERMINATED) + return false; #ifdef _WIN32 // Windows implementation @@ -257,21 +277,33 @@ bool IsProcessAlive(const ProcessInfo& proc_info) { } while (Process32Next(snapshot, &processEntry)); } + // pid not found in snapshot -> process has terminated. CloseHandle(snapshot); + SetProcessTerminated(proc_info); return false; #elif defined(__APPLE__) || defined(__linux__) // Unix-like systems (Linux and macOS) implementation + // NOTE: this approach only works if the process has been reaped. + // if the process has terminated but not reaped (exit status is still + // stored in the process table), kill(pid, 0) still returns 0. + // Try to send signal 0 to the process // This doesn't actually send a signal but checks if we can send signals to the process - int result = kill(proc_info.pid, 0); + // Process exists and we have permission to send it signals + // if (kill(proc_info.pid, 0) == 0) { + // return true; + // } - if (result == 0) { - return true; // Process exists and we have permission to send it signals - } + // // process exists but we don't have permission to send signal + // if (errno == EPERM) + // return true; - return errno != ESRCH; // ESRCH means "no such process" + if (waitpid(proc_info.pid, NULL, WNOHANG) == 0) + return true; + SetProcessTerminated(proc_info); + return false; #else #error "Unsupported platform" #endif @@ -281,59 +313,45 @@ bool WaitProcess(ProcessInfo& proc_info) { if (proc_info.pid == PID_TERMINATED) return true; + bool success; + #if defined(_WIN32) + // NOTE: OpenProcess() may fail if the process has terminated. HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, proc_info.pid); - bool success = WaitForSingleObject(hProcess, INFINITE) == WAIT_OBJECT_0; + success = WaitForSingleObject(hProcess, INFINITE) == WAIT_OBJECT_0; CloseHandle(hProcess); - if (success) { - proc_info.pid = PID_TERMINATED; - CloseHandle(proc_info.hJob); - proc_info.hJob = NULL; - } - return success; #elif defined(__APPLE__) || defined(__linux__) - bool success = waitpid(proc_info.pid, NULL, 0) == proc_info.pid; - if (success) { - proc_info.pid = PID_TERMINATED; - } - return success; + // NOTE: waitpid() may fail if the process has terminated and the OS + // has reaped it (i.e. clear its exit status). 
+ success = waitpid(proc_info.pid, NULL, 0) == proc_info.pid; #else #error "Unsupported platform" #endif + + if (success) + SetProcessTerminated(proc_info); + return success; } bool KillProcess(ProcessInfo& proc_info) { if (proc_info.pid == PID_TERMINATED) return true; + bool success; + #if defined(_WIN32) - bool success = TerminateJobObject(proc_info.hJob, 0) == 0; - // clean up resources - if (success) { - proc_info.pid = PID_TERMINATED; - CloseHandle(proc_info.hJob); - proc_info.hJob = NULL; - if (proc_info.hStdOut != NULL) { - CloseHandle(proc_info.hStdOut); - proc_info.hStdOut = NULL; - } - if (proc_info.hStdErr != NULL) { - CloseHandle(proc_info.hStdErr); - proc_info.hStdErr = NULL; - } - } - return success; + success = TerminateJobObject(proc_info.hJob, 0) == 0; #elif defined(__APPLE__) || defined(__linux__) // we send SIGTERM to subprocess. we trust that this subprocess will // propagate SIGTERM correctly to its children processes. - bool success = kill(proc_info.pid, SIGTERM) == 0; - if (success) { - proc_info.pid = PID_TERMINATED; - } - return success; + success = kill(proc_info.pid, SIGTERM) == 0; #else #error "Unsupported platform" #endif + + if (success) + SetProcessTerminated(proc_info); + return success; } } // namespace cortex::process diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index fcc93be90..19b821cef 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -37,7 +37,7 @@ std::vector ConvertToArgv(const std::vector& args); cpp::result SpawnProcess( const std::vector& command, const std::string& stdout_file = "", const std::string& stderr_file = ""); -bool IsProcessAlive(const ProcessInfo& proc_info); +bool IsProcessAlive(ProcessInfo& proc_info); bool WaitProcess(ProcessInfo& proc_info); bool KillProcess(ProcessInfo& proc_info); From 99a0035f0d8d171938d22a3faa474d3d870c1c08 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 3 Mar 2025 14:41:12 +0800 Subject: [PATCH 43/73] support engines uninstall --- .../extensions/python-engine/python_engine.cc | 23 +++++++------- .../extensions/python-engine/python_engine.h | 5 ++-- engine/services/engine_service.cc | 30 +++++++++++-------- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 5565da1a1..4f395c821 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -15,6 +15,10 @@ constexpr const int k409Conflict = 409; constexpr const int k500InternalServerError = 500; } // namespace +std::filesystem::path GetPythonEnginePath() { + return file_manager_utils::GetCortexDataPath() / "python_engine"; +} + cpp::result DownloadUv( std::shared_ptr& download_service) { const auto py_bin_path = @@ -100,25 +104,18 @@ cpp::result DownloadUv( return {}; } -std::string GetUvPath() { +std::filesystem::path GetUvPath() { auto system_info = system_info_utils::GetSystemInfo(); const auto bin_name = system_info->os == kWindowsOs ? "uv.exe" : "uv"; - const auto path = file_manager_utils::GetCortexDataPath() / "python_engine" / - "bin" / bin_name; - return path.string(); -} - -// use our own cache dir so that when users delete cortexcpp/, everything is deleted. 
-std::string GetUvCacheDir() { - const auto path = file_manager_utils::GetCortexDataPath() / "python_engine" / - "cache" / "uv"; - return path.string(); + return GetPythonEnginePath() / "bin" / bin_name; } std::vector BuildUvCommand(const std::string& action, const std::string& directory) { - std::vector command = {GetUvPath(), "--cache-dir", - GetUvCacheDir()}; + // use our own cache dir so that when users delete cortexcpp/, everything is deleted. + const auto cache_dir = GetPythonEnginePath() / "cache" / "uv"; + std::vector command = {GetUvPath().string(), "--cache-dir", + cache_dir.string()}; if (!directory.empty()) { command.push_back("--directory"); command.push_back(directory); diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index ec7e38d72..b7d207921 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -12,11 +12,12 @@ namespace python_engine { +std::filesystem::path GetPythonEnginePath(); + // UV-related functions cpp::result DownloadUv( std::shared_ptr& download_service); -std::string GetUvPath(); -std::string GetUvCacheDir(); +std::filesystem::path GetUvPath(); std::vector BuildUvCommand(const std::string& action, const std::string& directory = ""); bool IsUvInstalled(); diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index ac0c9eae9..70b031ccb 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -191,19 +191,25 @@ cpp::result EngineService::UninstallEngineVariant( } std::optional path_to_remove = std::nullopt; - if (version == std::nullopt && variant == std::nullopt) { - // if no version and variant provided, remove all engines variant of that engine - path_to_remove = file_manager_utils::GetEnginesContainerPath() / ne; - } else if (version != std::nullopt && variant != std::nullopt) { - // if both version and variant are provided, we only remove that variant - path_to_remove = file_manager_utils::GetEnginesContainerPath() / ne / - variant.value() / version.value(); - } else if (version == std::nullopt) { - // if only have variant, we remove all of that variant - path_to_remove = - file_manager_utils::GetEnginesContainerPath() / ne / variant.value(); + + // Python engine is stored in a separate folder + if (ne == kPythonEngine) { + path_to_remove = python_engine::GetPythonEnginePath(); } else { - return cpp::fail("No variant provided"); + if (version == std::nullopt && variant == std::nullopt) { + // if no version and variant provided, remove all engines variant of that engine + path_to_remove = file_manager_utils::GetEnginesContainerPath() / ne; + } else if (version != std::nullopt && variant != std::nullopt) { + // if both version and variant are provided, we only remove that variant + path_to_remove = file_manager_utils::GetEnginesContainerPath() / ne / + variant.value() / version.value(); + } else if (version == std::nullopt) { + // if only have variant, we remove all of that variant + path_to_remove = + file_manager_utils::GetEnginesContainerPath() / ne / variant.value(); + } else { + return cpp::fail("No variant provided"); + } } if (path_to_remove == std::nullopt) { From b96fd6957f26a86c9895f8678b403552130ec8d2 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 3 Mar 2025 15:06:25 +0800 Subject: [PATCH 44/73] follow reverse proxy example --- engine/controllers/server.cc | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git 
a/engine/controllers/server.cc b/engine/controllers/server.cc index ebc8639de..ea88f5882 100644 --- a/engine/controllers/server.cc +++ b/engine/controllers/server.cc @@ -240,24 +240,27 @@ void server::Python(const HttpRequestPtr& req, // route request. localhost might not work? const int port = port_result.value(); const std::string host = "http://127.0.0.1:" + std::to_string(port); - auto client = HttpClient::newHttpClient(host); + CTL_INF("Route request to " << host << path); - auto new_req = HttpRequest::newHttpRequest(); - new_req->setMethod(req->method()); - new_req->setPath(path); - new_req->setBody(std::string{req->body()}); - new_req->setContentTypeCode(req->getContentType()); + // https://github.com/drogonframework/drogon/blob/v1.9.10/examples/simple_reverse_proxy/plugins/SimpleReverseProxy.cc + auto client = HttpClient::newHttpClient( + host, trantor::EventLoop::getEventLoopOfCurrentThread()); - // including headers may make FastAPI reqject the request... - // for (const auto& [field, value] : req->headers()) { - // new_req->addHeader(field, value); - // } + // NOTE: modify request object inplace + req->setPassThrough(true); + req->setPath(path); - CTL_INF("Route request to " << host << path); - auto cb = [callback](ReqResult result, const HttpResponsePtr& response) { - callback(response); - }; - client->sendRequest(new_req, cb); + client->sendRequest(req, [callback = std::move(callback)]( + ReqResult result, const HttpResponsePtr& resp) { + if (result == ReqResult::Ok) { + resp->setPassThrough(true); + callback(resp); + } else { + auto errResp = HttpResponse::newHttpResponse(); + errResp->setStatusCode(k500InternalServerError); + callback(errResp); + } + }); } void server::LoadModel(const HttpRequestPtr& req, @@ -277,7 +280,7 @@ void server::ProcessStreamRes(std::function cb, auto err_or_done = std::make_shared(false); auto chunked_content_provider = [this, q, err_or_done, engine_type, model_id]( char* buf, - std::size_t buf_size) -> std::size_t { + std::size_t buf_size) -> std::size_t { if (buf == nullptr) { LOG_TRACE << "Buf is null"; if (!(*err_or_done)) { From e2e2cccf658041ceded90b8c5d0908863c98db5e Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 3 Mar 2025 21:25:54 +0800 Subject: [PATCH 45/73] update uv to 0.6.3 --- engine/extensions/python-engine/python_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 4f395c821..d9b3ae485 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -26,7 +26,7 @@ cpp::result DownloadUv( std::filesystem::create_directories(py_bin_path); // NOTE: do we need a mechanism to update uv, or just pin uv version with cortex release? 
- const std::string uv_version = "0.6.2"; + const std::string uv_version = "0.6.3"; // build download url based on system info std::stringstream fname_stream; From 57c30d381a06dc1f17c36892e8d63af8c9bd458d Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 3 Mar 2025 21:26:10 +0800 Subject: [PATCH 46/73] support engines list --- engine/services/engine_service.cc | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 70b031ccb..076fd02c1 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -629,6 +629,23 @@ EngineService::GetInstalledEngineVariants(const std::string& engine) const { auto ne = NormalizeEngine(engine); auto os = hw_inf_.sys_inf->os; + if (ne == kPythonEngine) { + if (!python_engine::IsUvInstalled()) { + return {}; + } else { + // Python engine only means uv is installed. + // variant name and version don't quite make sense in this context. + // hence, they are left blank. + std::vector variants; + variants.push_back(EngineVariantResponse{ + .name = "", + .version = "", + .engine = kPythonEngine, + }); + return variants; + } + } + auto engines_variants_dir = file_manager_utils::GetEnginesContainerPath() / ne; From 49df6af1afc88df8398ab684795c0521507c32ce Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 17 Mar 2025 12:00:41 +0800 Subject: [PATCH 47/73] remove checks against supportedEngines --- engine/cli/command_line_parser.cc | 94 +++++++++---------------------- engine/cli/command_line_parser.h | 6 +- 2 files changed, 29 insertions(+), 71 deletions(-) diff --git a/engine/cli/command_line_parser.cc b/engine/cli/command_line_parser.cc index b423a6896..c2348caee 100644 --- a/engine/cli/command_line_parser.cc +++ b/engine/cli/command_line_parser.cc @@ -482,36 +482,49 @@ void CommandLineParser::SetupEngineCommands() { install_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines install [engine_name] [options]"); install_cmd->group(kSubcommands); + install_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. llama-cpp") + ->required(); + install_cmd->add_option("-v, --version", cml_data_.engine_version, + "Engine version to download"); + install_cmd->add_option("-s, --source", cml_data_.engine_src, + "Install engine by local path"); + install_cmd->add_flag("-m, --menu", cml_data_.show_menu, + "Display menu for engine variant selection"); + install_cmd->callback([this, install_cmd] { if (std::exchange(executed_, true)) return; - if (install_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(install_cmd->help()); + try { + commands::EngineInstallCmd( + engine_service_, cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.show_menu) + .Exec(cml_data_.engine_name, cml_data_.engine_version, + cml_data_.engine_src); + } catch (const std::exception& e) { + CTL_ERR(e.what()); } }); - for (const auto& engine : supported_engines_) { - EngineInstall(install_cmd, engine, cml_data_.engine_version, - cml_data_.engine_src); - } - auto uninstall_cmd = engines_cmd->add_subcommand("uninstall", "Uninstall engine"); uninstall_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines uninstall [engine_name] [options]"); + uninstall_cmd->group(kSubcommands); + uninstall_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. 
llama-cpp") + ->required(); uninstall_cmd->callback([this, uninstall_cmd] { if (std::exchange(executed_, true)) return; - if (uninstall_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(uninstall_cmd->help()); + try { + commands::EngineUninstallCmd().Exec( + cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.engine_name); + } catch (const std::exception& e) { + CTL_ERR(e.what()); } }); - uninstall_cmd->group(kSubcommands); - for (const auto& engine : supported_engines_) { - EngineUninstall(uninstall_cmd, engine); - } auto engine_upd_cmd = engines_cmd->add_subcommand("update", "Update engine"); engine_upd_cmd->usage("Usage:\n" + commands::GetCortexBinary() + @@ -726,57 +739,6 @@ void CommandLineParser::SetupSystemCommands() { }); } -void CommandLineParser::EngineInstall(CLI::App* parent, - const std::string& engine_name, - std::string& version, std::string& src) { - auto install_engine_cmd = parent->add_subcommand(engine_name, ""); - install_engine_cmd->usage("Usage:\n" + commands::GetCortexBinary() + - " engines install " + engine_name + " [options]"); - install_engine_cmd->group(kEngineGroup); - - install_engine_cmd->add_option("-v, --version", version, - "Engine version to download"); - - install_engine_cmd->add_option("-s, --source", src, - "Install engine by local path"); - - install_engine_cmd->add_flag("-m, --menu", cml_data_.show_menu, - "Display menu for engine variant selection"); - - install_engine_cmd->callback([this, engine_name, &version, &src] { - if (std::exchange(executed_, true)) - return; - try { - commands::EngineInstallCmd( - engine_service_, cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), cml_data_.show_menu) - .Exec(engine_name, version, src); - } catch (const std::exception& e) { - CTL_ERR(e.what()); - } - }); -} - -void CommandLineParser::EngineUninstall(CLI::App* parent, - const std::string& engine_name) { - auto uninstall_engine_cmd = parent->add_subcommand(engine_name, ""); - uninstall_engine_cmd->usage("Usage:\n" + commands::GetCortexBinary() + - " engines install " + engine_name + " [options]"); - uninstall_engine_cmd->group(kEngineGroup); - - uninstall_engine_cmd->callback([this, engine_name] { - if (std::exchange(executed_, true)) - return; - try { - commands::EngineUninstallCmd().Exec( - cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), engine_name); - } catch (const std::exception& e) { - CTL_ERR(e.what()); - } - }); -} - void CommandLineParser::EngineUpdate(CLI::App* parent, const std::string& engine_name) { auto engine_update_cmd = parent->add_subcommand(engine_name, ""); diff --git a/engine/cli/command_line_parser.h b/engine/cli/command_line_parser.h index 5b64f7f4d..0fce8cc9b 100644 --- a/engine/cli/command_line_parser.h +++ b/engine/cli/command_line_parser.h @@ -25,11 +25,6 @@ class CommandLineParser { void SetupConfigsCommands(); - void EngineInstall(CLI::App* parent, const std::string& engine_name, - std::string& version, std::string& src); - - void EngineUninstall(CLI::App* parent, const std::string& engine_name); - void EngineUpdate(CLI::App* parent, const std::string& engine_name); void EngineGet(CLI::App* parent); @@ -54,6 +49,7 @@ class CommandLineParser { std::string msg; std::string model_alias; std::string model_path; + std::string engine_name; std::string engine_version = "latest"; std::string engine_src; std::string cortex_version; From f1dcdde8c7764b7d2cb9e15640df7a0729c2adc6 Mon Sep 17 00:00:00 2001 From: Thien 
Tran Date: Mon, 17 Mar 2025 12:12:10 +0800 Subject: [PATCH 48/73] remove supportedEngines check for more commands --- engine/cli/command_line_parser.cc | 196 ++++++++---------------------- engine/cli/command_line_parser.h | 11 -- 2 files changed, 54 insertions(+), 153 deletions(-) diff --git a/engine/cli/command_line_parser.cc b/engine/cli/command_line_parser.cc index c2348caee..4afb48360 100644 --- a/engine/cli/command_line_parser.cc +++ b/engine/cli/command_line_parser.cc @@ -51,9 +51,7 @@ CommandLineParser::CommandLineParser() dylib_path_manager_{std::make_shared()}, db_service_{std::make_shared()}, engine_service_{std::make_shared( - download_service_, dylib_path_manager_, db_service_)} { - supported_engines_ = engine_service_->GetSupportedEngineNames().value(); -} + download_service_, dylib_path_manager_, db_service_)} {} bool CommandLineParser::SetupCommand(int argc, char** argv) { app_.usage("Usage:\n" + commands::GetCortexBinary() + @@ -529,70 +527,94 @@ void CommandLineParser::SetupEngineCommands() { auto engine_upd_cmd = engines_cmd->add_subcommand("update", "Update engine"); engine_upd_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines update [engine_name]"); + engine_upd_cmd->group(kSubcommands); + engine_upd_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. llama-cpp") + ->required(); engine_upd_cmd->callback([this, engine_upd_cmd] { if (std::exchange(executed_, true)) return; - if (engine_upd_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(engine_upd_cmd->help()); + try { + commands::EngineUpdateCmd().Exec( + cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.engine_name); + } catch (const std::exception& e) { + CTL_ERR(e.what()); } }); - engine_upd_cmd->group(kSubcommands); - for (const auto& engine : supported_engines_) { - EngineUpdate(engine_upd_cmd, engine); - } auto engine_use_cmd = engines_cmd->add_subcommand("use", "Set engine as default"); engine_use_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines use [engine_name]"); + engine_use_cmd->group(kSubcommands); + engine_use_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. llama-cpp") + ->required(); engine_use_cmd->callback([this, engine_use_cmd] { if (std::exchange(executed_, true)) return; - if (engine_use_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(engine_use_cmd->help()); + auto result = commands::EngineUseCmd().Exec( + cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.engine_name); + if (result.has_error()) { + CTL_ERR(result.error()); + } else { + CTL_INF("Engine " << cml_data_.engine_name << " is set as default"); } }); - engine_use_cmd->group(kSubcommands); - for (const auto& engine : supported_engines_) { - EngineUse(engine_use_cmd, engine); - } auto engine_load_cmd = engines_cmd->add_subcommand("load", "Load engine"); engine_load_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines load [engine_name]"); + engine_load_cmd->group(kSubcommands); + engine_load_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. 
llama-cpp") + ->required(); engine_load_cmd->callback([this, engine_load_cmd] { if (std::exchange(executed_, true)) return; - if (engine_load_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(engine_load_cmd->help()); + auto result = commands::EngineLoadCmd().Exec( + cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.engine_name); + if (result.has_error()) { + CTL_ERR(result.error()); } }); - engine_load_cmd->group(kSubcommands); - for (const auto& engine : supported_engines_) { - EngineLoad(engine_load_cmd, engine); - } auto engine_unload_cmd = engines_cmd->add_subcommand("unload", "Unload engine"); engine_unload_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines unload [engine_name]"); + engine_unload_cmd->group(kSubcommands); + engine_unload_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. llama-cpp") + ->required(); engine_unload_cmd->callback([this, engine_unload_cmd] { if (std::exchange(executed_, true)) return; - if (engine_unload_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(engine_unload_cmd->help()); + auto result = commands::EngineUnloadCmd().Exec( + cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.engine_name); + if (result.has_error()) { + CTL_ERR(result.error()); } }); - engine_unload_cmd->group(kSubcommands); - for (const auto& engine : supported_engines_) { - EngineUnload(engine_unload_cmd, engine); - } - EngineGet(engines_cmd); + auto engine_get_cmd = engines_cmd->add_subcommand("get", "Get engine info"); + engine_get_cmd->usage("Usage:\n" + commands::GetCortexBinary() + + " engines get [engine_name] [options]"); + engine_get_cmd->group(kSubcommands); + engine_get_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. 
llama-cpp") + ->required(); + engine_get_cmd->callback([this, engine_get_cmd] { + if (std::exchange(executed_, true)) + return; + commands::EngineGetCmd().Exec(cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), + cml_data_.engine_name); + }); } void CommandLineParser::SetupHardwareCommands() { @@ -739,116 +761,6 @@ void CommandLineParser::SetupSystemCommands() { }); } -void CommandLineParser::EngineUpdate(CLI::App* parent, - const std::string& engine_name) { - auto engine_update_cmd = parent->add_subcommand(engine_name, ""); - engine_update_cmd->usage("Usage:\n" + commands::GetCortexBinary() + - " engines update " + engine_name); - engine_update_cmd->group(kEngineGroup); - - engine_update_cmd->callback([this, engine_name] { - if (std::exchange(executed_, true)) - return; - try { - commands::EngineUpdateCmd().Exec( - cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), engine_name); - } catch (const std::exception& e) { - CTL_ERR(e.what()); - } - }); -} - -void CommandLineParser::EngineUnload(CLI::App* parent, - const std::string& engine_name) { - auto sub_cmd = parent->add_subcommand(engine_name, ""); - sub_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines unload " + - engine_name); - sub_cmd->group(kEngineGroup); - - sub_cmd->callback([this, engine_name] { - if (std::exchange(executed_, true)) - return; - auto result = commands::EngineUnloadCmd().Exec( - cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), engine_name); - if (result.has_error()) { - CTL_ERR(result.error()); - } - }); -} - -void CommandLineParser::EngineLoad(CLI::App* parent, - const std::string& engine_name) { - auto sub_cmd = parent->add_subcommand(engine_name, ""); - sub_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines load " + - engine_name); - sub_cmd->group(kEngineGroup); - - sub_cmd->callback([this, engine_name] { - if (std::exchange(executed_, true)) - return; - auto result = commands::EngineLoadCmd().Exec( - cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), engine_name); - if (result.has_error()) { - CTL_ERR(result.error()); - } - }); -} - -void CommandLineParser::EngineUse(CLI::App* parent, - const std::string& engine_name) { - auto engine_use_cmd = parent->add_subcommand(engine_name, ""); - engine_use_cmd->usage("Usage:\n" + commands::GetCortexBinary() + - " engines use " + engine_name); - engine_use_cmd->group(kEngineGroup); - - engine_use_cmd->callback([this, engine_name] { - if (std::exchange(executed_, true)) - return; - auto result = commands::EngineUseCmd().Exec( - cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), engine_name); - if (result.has_error()) { - CTL_ERR(result.error()); - } else { - CTL_INF("Engine " << engine_name << " is set as default"); - } - }); -} - -void CommandLineParser::EngineGet(CLI::App* parent) { - auto get_cmd = parent->add_subcommand("get", "Get engine info"); - get_cmd->usage("Usage:\n" + commands::GetCortexBinary() + - " engines get [engine_name] [options]"); - get_cmd->group(kSubcommands); - get_cmd->callback([this, get_cmd] { - if (std::exchange(executed_, true)) - return; - if (get_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(get_cmd->help()); - } - }); - - for (const auto& engine : supported_engines_) { - std::string desc = "Get " + engine + " status"; - - auto engine_get_cmd = get_cmd->add_subcommand(engine, desc); - engine_get_cmd->usage("Usage:\n" + commands::GetCortexBinary() 
+ - " engines get " + engine + " [options]"); - engine_get_cmd->group(kEngineGroup); - engine_get_cmd->callback([this, engine] { - if (std::exchange(executed_, true)) - return; - commands::EngineGetCmd().Exec(cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), - engine); - }); - } -} - void CommandLineParser::ModelUpdate(CLI::App* parent) { auto model_update_cmd = parent->add_subcommand("update", "Update model configurations"); diff --git a/engine/cli/command_line_parser.h b/engine/cli/command_line_parser.h index 0fce8cc9b..7a10db757 100644 --- a/engine/cli/command_line_parser.h +++ b/engine/cli/command_line_parser.h @@ -25,16 +25,6 @@ class CommandLineParser { void SetupConfigsCommands(); - void EngineUpdate(CLI::App* parent, const std::string& engine_name); - - void EngineGet(CLI::App* parent); - - void EngineUse(CLI::App* parent, const std::string& engine_name); - - void EngineLoad(CLI::App* parent, const std::string& engine_name); - - void EngineUnload(CLI::App* parent, const std::string& engine_name); - void ModelUpdate(CLI::App* parent); CLI::App app_; @@ -42,7 +32,6 @@ class CommandLineParser { std::shared_ptr dylib_path_manager_; std::shared_ptr db_service_; std::shared_ptr engine_service_; - std::vector supported_engines_; struct CmlData { std::string model_id; From 13652ca8ca77403074ff4c55488b548e6c4b55b5 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 17 Mar 2025 15:02:43 +0800 Subject: [PATCH 49/73] init vllm engine --- engine/CMakeLists.txt | 3 +- engine/cli/CMakeLists.txt | 5 +- engine/cli/commands/engine_install_cmd.cc | 2 +- .../extensions/python-engine/python_engine.cc | 447 ------------------ .../extensions/python-engine/python_engine.h | 60 --- .../extensions/python-engines/python_utils.cc | 151 ++++++ .../extensions/python-engines/python_utils.h | 33 ++ .../extensions/python-engines/vllm_engine.cc | 132 ++++++ .../extensions/python-engines/vllm_engine.h | 61 +++ engine/services/engine_service.cc | 73 ++- engine/services/model_service.cc | 8 - engine/utils/engine_constants.h | 1 + 12 files changed, 420 insertions(+), 556 deletions(-) delete mode 100644 engine/extensions/python-engine/python_engine.cc delete mode 100644 engine/extensions/python-engine/python_engine.h create mode 100644 engine/extensions/python-engines/python_utils.cc create mode 100644 engine/extensions/python-engines/python_utils.h create mode 100644 engine/extensions/python-engines/vllm_engine.cc create mode 100644 engine/extensions/python-engines/vllm_engine.h diff --git a/engine/CMakeLists.txt b/engine/CMakeLists.txt index 3f08f83e0..9694db8f3 100644 --- a/engine/CMakeLists.txt +++ b/engine/CMakeLists.txt @@ -172,7 +172,8 @@ add_executable(${TARGET_NAME} main.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/file_logger.cc ${CMAKE_CURRENT_SOURCE_DIR}/extensions/template_renderer.cc - ${CMAKE_CURRENT_SOURCE_DIR}/extensions/python-engine/python_engine.cc + ${CMAKE_CURRENT_SOURCE_DIR}/extensions/python-engines/python_utils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/extensions/python-engines/vllm_engine.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/dylib_path_manager.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/process/utils.cc diff --git a/engine/cli/CMakeLists.txt b/engine/cli/CMakeLists.txt index 0162c1f56..9dc2b4980 100644 --- a/engine/cli/CMakeLists.txt +++ b/engine/cli/CMakeLists.txt @@ -86,8 +86,9 @@ add_executable(${TARGET_NAME} main.cc ${CMAKE_CURRENT_SOURCE_DIR}/../services/hardware_service.cc ${CMAKE_CURRENT_SOURCE_DIR}/../services/database_service.cc 
${CMAKE_CURRENT_SOURCE_DIR}/../extensions/remote-engine/remote_engine.cc - - ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/python-engine/python_engine.cc + + ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/python-engines/python_utils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/python-engines/vllm_engine.cc ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/template_renderer.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/easywsclient.cc diff --git a/engine/cli/commands/engine_install_cmd.cc b/engine/cli/commands/engine_install_cmd.cc index d3fdf8b9b..3f72a1980 100644 --- a/engine/cli/commands/engine_install_cmd.cc +++ b/engine/cli/commands/engine_install_cmd.cc @@ -11,7 +11,7 @@ namespace commands { // NOTE: should have a single source of truth between CLI and server static bool NeedCudaDownload(const std::string& engine) { return !system_info_utils::GetDriverAndCudaVersion().second.empty() && - engine != kPythonEngine; + engine == kLlamaRepo; } bool EngineInstallCmd::Exec(const std::string& engine, diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc deleted file mode 100644 index d9b3ae485..000000000 --- a/engine/extensions/python-engine/python_engine.cc +++ /dev/null @@ -1,447 +0,0 @@ -#include "python_engine.h" -#include - -#include "config/model_config.h" -#include "utils/archive_utils.h" -#include "utils/file_manager_utils.h" -#include "utils/set_permission_utils.h" -#include "utils/system_info_utils.h" - -namespace python_engine { -namespace { -constexpr const int k200OK = 200; -constexpr const int k400BadRequest = 400; -constexpr const int k409Conflict = 409; -constexpr const int k500InternalServerError = 500; -} // namespace - -std::filesystem::path GetPythonEnginePath() { - return file_manager_utils::GetCortexDataPath() / "python_engine"; -} - -cpp::result DownloadUv( - std::shared_ptr& download_service) { - const auto py_bin_path = - file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; - std::filesystem::create_directories(py_bin_path); - - // NOTE: do we need a mechanism to update uv, or just pin uv version with cortex release? - const std::string uv_version = "0.6.3"; - - // build download url based on system info - std::stringstream fname_stream; - fname_stream << "uv-"; - - auto system_info = system_info_utils::GetSystemInfo(); - if (system_info->arch == "amd64") - fname_stream << "x86_64"; - else if (system_info->arch == "arm64") - fname_stream << "aarch64"; - - // NOTE: there is also a musl linux version - if (system_info->os == kMacOs) - fname_stream << "-apple-darwin.tar.gz"; - else if (system_info->os == kWindowsOs) - fname_stream << "-pc-windows-msvc.zip"; - else if (system_info->os == kLinuxOs) - fname_stream << "-unknown-linux-gnu.tar.gz"; - - const std::string fname = fname_stream.str(); - const std::string base_url = - "https://github.com/astral-sh/uv/releases/download/"; - - std::stringstream url_stream; - url_stream << base_url << uv_version << "/" << fname; - const std::string url = url_stream.str(); - CTL_INF("Download uv from " << url); - - auto on_finished = [py_bin_path, - uv_version](const DownloadTask& finishedTask) { - // try to unzip the downloaded file - const std::string download_path = finishedTask.items[0].localPath.string(); - - archive_utils::ExtractArchive(download_path, py_bin_path.string(), true); - set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); - std::filesystem::remove(download_path); - - // install Python3.10 from Astral. 
this will be preferred over system - // Python when possible. - // NOTE: currently this will install to a user-wide directory. we can - // install to a specific location using `--install-dir`, but later - // invocation of `uv run` needs to have `UV_PYTHON_INSTALL_DIR` set to use - // this Python installation. - // we can add this once we allow passing custom env var to SpawnProcess(). - // https://docs.astral.sh/uv/reference/cli/#uv-python-install - std::vector command = BuildUvCommand("python"); - command.push_back("install"); - command.push_back("3.10"); - - // NOTE: errors in download callback won't be propagated to caller - auto result = cortex::process::SpawnProcess(command); - if (result.has_error()) { - CTL_ERR(result.error()); - return; - } - - if (!cortex::process::WaitProcess(result.value())) { - CTL_ERR("Process spawned but fail to wait"); - return; - } - }; - - auto downloadTask = DownloadTask{.id = "python-uv", - .type = DownloadType::Engine, - .items = {DownloadItem{ - .id = "python-uv", - .downloadUrl = url, - .localPath = py_bin_path / fname, - }}}; - - auto add_task_result = download_service->AddTask(downloadTask, on_finished); - if (add_task_result.has_error()) { - return cpp::fail(add_task_result.error()); - } - return {}; -} - -std::filesystem::path GetUvPath() { - auto system_info = system_info_utils::GetSystemInfo(); - const auto bin_name = system_info->os == kWindowsOs ? "uv.exe" : "uv"; - return GetPythonEnginePath() / "bin" / bin_name; -} - -std::vector BuildUvCommand(const std::string& action, - const std::string& directory) { - // use our own cache dir so that when users delete cortexcpp/, everything is deleted. - const auto cache_dir = GetPythonEnginePath() / "cache" / "uv"; - std::vector command = {GetUvPath().string(), "--cache-dir", - cache_dir.string()}; - if (!directory.empty()) { - command.push_back("--directory"); - command.push_back(directory); - } - command.push_back(action); - return command; -} - -bool IsUvInstalled() { - return std::filesystem::exists(GetUvPath()); -} - -cpp::result UvDownloadDeps( - const std::filesystem::path& model_dir) { - if (!IsUvInstalled()) - return cpp::fail( - "uv is not installed. Please run `cortex engines install python`."); - - std::vector command = BuildUvCommand("sync", model_dir.string()); - - // script mode. 1st argument is path to .py script - if (!std::filesystem::exists(model_dir / "pyproject.toml")) { - config::PythonModelConfig py_cfg; - py_cfg.ReadFromYaml((model_dir / "model.yml").string()); - command.push_back("--script"); - command.push_back(py_cfg.entrypoint[0]); - } - - auto result = cortex::process::SpawnProcess(command); - if (result.has_error()) - return cpp::fail("Fail to install Python dependencies. " + result.error()); - - if (!cortex::process::WaitProcess(result.value())) { - return cpp::fail("Fail to install Python dependencies."); - } - - return {}; -} - -bool PythonEngine::PythonSubprocess::IsAlive() { - return cortex::process::IsProcessAlive(proc_info); -} -bool PythonEngine::PythonSubprocess::Kill() { - return cortex::process::KillProcess(proc_info); -} - -PythonEngine::PythonEngine() {} - -PythonEngine::~PythonEngine() { - // NOTE: what happens if we can't kill subprocess? 
- std::unique_lock write_lock(mutex); - for (auto& [model_name, py_proc] : model_process_map) { - if (py_proc.IsAlive()) - py_proc.Kill(); - } -} - -static std::pair CreateResponse( - const std::string& msg, int code) { - - Json::Value status, res; - const bool has_error = code != k200OK; - - status["is_done"] = true; - status["has_error"] = has_error; - status["is_stream"] = false; - status["status_code"] = code; - - if (has_error) { - CTL_ERR(msg); - res["error"] = msg; - } else { - res["status"] = msg; - } - - return {status, res}; -} - -void PythonEngine::LoadModel( - std::shared_ptr json_body, - std::function&& callback) { - - if (!json_body->isMember("model") || !json_body->isMember("model_dir")) { - auto [status, error] = CreateResponse( - "Missing required fields: model or model_dir", k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - - namespace fs = std::filesystem; - - const std::string model = (*json_body)["model"].asString(); - const fs::path model_dir = (*json_body)["model_dir"].asString(); - - { - std::unique_lock write_lock(mutex); - if (model_process_map.find(model) != model_process_map.end()) { - // check if model is still alive - if (model_process_map[model].IsAlive()) { - auto [status, error] = - CreateResponse("Model already loaded!", k409Conflict); - callback(std::move(status), std::move(error)); - return; - } else { - // if model has exited, try to load model again - CTL_WRN("Model " << model << " has exited unexpectedly"); - model_process_map.erase(model); - } - } - } - - pid_t pid; - try { - config::PythonModelConfig py_cfg; - py_cfg.ReadFromYaml((model_dir / "model.yml").string()); - - if (py_cfg.entrypoint.empty()) { - throw std::runtime_error("Missing entrypoint in model.yml"); - } - - // https://docs.astral.sh/uv/reference/cli/#uv-run - std::vector command = - BuildUvCommand("run", model_dir.string()); - for (const auto& item : py_cfg.entrypoint) - command.push_back(item); - - const std::string stdout_path = (model_dir / "stdout.txt").string(); - const std::string stderr_path = (model_dir / "stderr.txt").string(); - - // create empty stdout.txt and stderr.txt for redirection - if (!std::filesystem::exists(stdout_path)) - std::ofstream(stdout_path).flush(); - if (!std::filesystem::exists(stderr_path)) - std::ofstream(stderr_path).flush(); - - auto result = - cortex::process::SpawnProcess(command, stdout_path, stderr_path); - if (result.has_error()) { - throw std::runtime_error(result.error()); - } - - PythonSubprocess py_proc; - py_proc.proc_info = result.value(); - py_proc.port = py_cfg.port; - py_proc.start_time = std::chrono::system_clock::now().time_since_epoch() / - std::chrono::milliseconds(1); - - pid = py_proc.proc_info.pid; - - std::unique_lock write_lock(mutex); - model_process_map[model] = py_proc; - - } catch (const std::exception& e) { - auto e_msg = e.what(); - auto [status, error] = CreateResponse(e_msg, k500InternalServerError); - callback(std::move(status), std::move(error)); - return; - } - - auto [status, res] = CreateResponse( - "Model loaded successfully with pid: " + std::to_string(pid), k200OK); - callback(std::move(status), std::move(res)); -} - -void PythonEngine::UnloadModel( - std::shared_ptr json_body, - std::function&& callback) { - - if (!json_body->isMember("model")) { - auto [status, error] = - CreateResponse("Missing required field: model", k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - - const std::string model = (*json_body)["model"].asString(); - - // check if 
model has started - { - std::shared_lock read_lock(mutex); - if (model_process_map.find(model) == model_process_map.end()) { - const std::string msg = "Model " + model + " has not been loaded yet."; - auto [status, error] = CreateResponse(msg, k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - } - - // we know that model has started - { - std::unique_lock write_lock(mutex); - - // check if subprocess is still alive - // NOTE: is this step necessary? the subprocess could have terminated - // after .IsAlive() and before .Kill() later. - if (!model_process_map[model].IsAlive()) { - model_process_map.erase(model); - const std::string msg = "Model " + model + " stopped running."; - auto [status, error] = CreateResponse(msg, k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - - // subprocess is alive. we kill it here. - if (!model_process_map[model].Kill()) { - const std::string msg = "Unable to kill process of model " + model; - auto [status, error] = CreateResponse(msg, k500InternalServerError); - callback(std::move(status), std::move(error)); - return; - } - - model_process_map.erase(model); - } - - auto [status, res] = CreateResponse("Unload model successfully", k200OK); - callback(std::move(status), std::move(res)); -} - -void PythonEngine::GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) { - - if (!json_body->isMember("model")) { - auto [status, error] = - CreateResponse("Missing required field: model", k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - - const std::string model = (*json_body)["model"].asString(); - Json::Value res, status; - - // check if model has started - { - std::shared_lock read_lock(mutex); - if (model_process_map.find(model) == model_process_map.end()) { - const std::string msg = "Model " + model + " has not been loaded yet."; - auto [status, error] = CreateResponse(msg, k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - } - - // we know that model has started - { - std::unique_lock write_lock(mutex); - - // check if subprocess is still alive - if (!model_process_map[model].IsAlive()) { - CTL_WRN("Model " << model << " has exited unexpectedly."); - model_process_map.erase(model); - const std::string msg = "Model " + model + " stopped running."; - auto [status, error] = CreateResponse(msg, k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - } - - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - callback(std::move(status), std::move(res)); -} - -void PythonEngine::GetModels( - std::shared_ptr jsonBody, - std::function&& callback) { - - Json::Value res, model_list(Json::arrayValue), status; - { - std::unique_lock write_lock(mutex); - for (auto& [model_name, py_proc] : model_process_map) { - if (!py_proc.IsAlive()) { - CTL_WRN("Model " << model_name << " has exited unexpectedly."); - model_process_map.erase(model_name); - continue; - } - - Json::Value val; - val["id"] = model_name; - val["engine"] = kPythonEngine; - val["start_time"] = py_proc.start_time; - val["port"] = py_proc.port; - val["object"] = "model"; - // TODO - // val["ram"]; - // val["vram"]; - model_list.append(val); - } - } - - res["object"] = "list"; - res["data"] = model_list; - - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(res)); -} - 
-cpp::result PythonEngine::GetPort(const std::string& model) { - int port; - - // check if model has started - { - std::shared_lock read_lock(mutex); - if (model_process_map.find(model) == model_process_map.end()) { - return cpp::fail("Model " + model + " has not been loaded yet."); - } - port = model_process_map[model].port; - } - - // check if subprocess is still alive - { - std::unique_lock write_lock(mutex); - if (!model_process_map[model].IsAlive()) { - CTL_WRN("Model " << model << " has exited unexpectedly."); - model_process_map.erase(model); - return cpp::fail("Model " + model + " stopped running."); - } - } - - return port; -} - -} // namespace python_engine diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h deleted file mode 100644 index b7d207921..000000000 --- a/engine/extensions/python-engine/python_engine.h +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -#include "cortex-common/python_enginei.h" -#include "services/download_service.h" -#include "utils/process/utils.h" - -namespace python_engine { - -std::filesystem::path GetPythonEnginePath(); - -// UV-related functions -cpp::result DownloadUv( - std::shared_ptr& download_service); -std::filesystem::path GetUvPath(); -std::vector BuildUvCommand(const std::string& action, - const std::string& directory = ""); -bool IsUvInstalled(); -cpp::result UvDownloadDeps( - const std::filesystem::path& yaml_path); - -class PythonEngine : public PythonEngineI { - private: - struct PythonSubprocess { - cortex::process::ProcessInfo proc_info; - int port; - uint64_t start_time; - - bool IsAlive(); - bool Kill(); - }; - - mutable std::shared_mutex mutex; - std::unordered_map model_process_map; - - public: - PythonEngine(); - ~PythonEngine(); - - void LoadModel( - std::shared_ptr json_body, - std::function&& callback) override; - void UnloadModel( - std::shared_ptr json_body, - std::function&& callback) override; - void GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) override; - void GetModels( - std::shared_ptr jsonBody, - std::function&& callback) override; - - cpp::result GetPort(const std::string& model) override; -}; -} // namespace python_engine diff --git a/engine/extensions/python-engines/python_utils.cc b/engine/extensions/python-engines/python_utils.cc new file mode 100644 index 000000000..5255fcd0e --- /dev/null +++ b/engine/extensions/python-engines/python_utils.cc @@ -0,0 +1,151 @@ +#include "python_utils.h" +#include + +#include "utils/archive_utils.h" +#include "utils/file_manager_utils.h" +#include "utils/set_permission_utils.h" +#include "utils/system_info_utils.h" + +namespace python_utils { + +std::filesystem::path GetPythonEnginesPath() { + return file_manager_utils::GetCortexDataPath() / "python_engines"; +} +std::filesystem::path GetEnvsPath() { + return GetPythonEnginesPath() / "envs"; +} +std::filesystem::path GetUvPath() { + auto system_info = system_info_utils::GetSystemInfo(); + const auto bin_name = system_info->os == kWindowsOs ? "uv.exe" : "uv"; + return GetPythonEnginesPath() / "bin" / bin_name; +} + +bool IsUvInstalled() { + return std::filesystem::exists(GetUvPath()); +} +cpp::result InstallUv( + std::shared_ptr& download_service) { + const auto py_bin_path = GetPythonEnginesPath() / "bin"; + std::filesystem::create_directories(py_bin_path); + + // NOTE: do we need a mechanism to update uv, or just pin uv version with cortex release? 
+ const std::string uv_version = "0.6.3"; + + // build download url based on system info + std::stringstream fname_stream; + fname_stream << "uv-"; + + auto system_info = system_info_utils::GetSystemInfo(); + if (system_info->arch == "amd64") + fname_stream << "x86_64"; + else if (system_info->arch == "arm64") + fname_stream << "aarch64"; + + // NOTE: there is also a musl linux version + if (system_info->os == kMacOs) + fname_stream << "-apple-darwin.tar.gz"; + else if (system_info->os == kWindowsOs) + fname_stream << "-pc-windows-msvc.zip"; + else if (system_info->os == kLinuxOs) + fname_stream << "-unknown-linux-gnu.tar.gz"; + + const std::string fname = fname_stream.str(); + const std::string base_url = + "https://github.com/astral-sh/uv/releases/download/"; + + std::stringstream url_stream; + url_stream << base_url << uv_version << "/" << fname; + const std::string url = url_stream.str(); + CTL_INF("Download uv from " << url); + + auto on_finished = [py_bin_path, + uv_version](const DownloadTask& finishedTask) { + // try to unzip the downloaded file + const std::string download_path = finishedTask.items[0].localPath.string(); + + archive_utils::ExtractArchive(download_path, py_bin_path.string(), true); + set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); + std::filesystem::remove(download_path); + + // install Python3.10 from Astral. this will be preferred over system + // Python when possible. + // NOTE: currently this will install to a user-wide directory. we can + // install to a specific location using `--install-dir`, but later + // invocation of `uv run` needs to have `UV_PYTHON_INSTALL_DIR` set to use + // this Python installation. + // we can add this once we allow passing custom env var to SpawnProcess(). + // https://docs.astral.sh/uv/reference/cli/#uv-python-install + std::vector command = BuildUvCommand("python"); + command.push_back("install"); + command.push_back("3.10"); + + // NOTE: errors in download callback won't be propagated to caller + auto result = cortex::process::SpawnProcess(command); + if (result.has_error()) { + CTL_ERR(result.error()); + return; + } + + if (!cortex::process::WaitProcess(result.value())) { + CTL_ERR("Process spawned but fail to wait"); + return; + } + }; + + auto downloadTask = DownloadTask{.id = "python-uv", + .type = DownloadType::Engine, + .items = {DownloadItem{ + .id = "python-uv", + .downloadUrl = url, + .localPath = py_bin_path / fname, + }}}; + + auto add_task_result = download_service->AddTask(downloadTask, on_finished); + if (add_task_result.has_error()) { + return cpp::fail(add_task_result.error()); + } + return {}; +} + +std::vector BuildUvCommand(const std::string& action, + const std::string& directory) { + // use our own cache dir so that when users delete cortexcpp/, everything is deleted. + const auto cache_dir = GetPythonEnginesPath() / "cache" / "uv"; + std::vector command = {GetUvPath().string(), "--cache-dir", + cache_dir.string()}; + if (!directory.empty()) { + command.push_back("--directory"); + command.push_back(directory); + } + command.push_back(action); + return command; +} + +// cpp::result UvDownloadDeps( +// const std::filesystem::path& model_dir) { +// if (!IsUvInstalled()) +// return cpp::fail( +// "uv is not installed. Please run `cortex engines install python`."); + +// std::vector command = BuildUvCommand("sync", model_dir.string()); + +// // script mode. 
1st argument is path to .py script +// if (!std::filesystem::exists(model_dir / "pyproject.toml")) { +// config::PythonModelConfig py_cfg; +// py_cfg.ReadFromYaml((model_dir / "model.yml").string()); +// command.push_back("--script"); +// command.push_back(py_cfg.entrypoint[0]); +// } + +// auto result = cortex::process::SpawnProcess(command); +// if (result.has_error()) +// return cpp::fail("Fail to install Python dependencies. " + result.error()); + +// if (!cortex::process::WaitProcess(result.value())) { +// return cpp::fail("Fail to install Python dependencies."); +// } + +// return {}; +// } + +} // namespace python_utils diff --git a/engine/extensions/python-engines/python_utils.h b/engine/extensions/python-engines/python_utils.h new file mode 100644 index 000000000..31b0ca0ad --- /dev/null +++ b/engine/extensions/python-engines/python_utils.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include + +#include "services/download_service.h" +#include "utils/process/utils.h" + +namespace python_utils { + +// paths +std::filesystem::path GetPythonEnginesPath(); +std::filesystem::path GetEnvsPath(); +std::filesystem::path GetUvPath(); + +// UV-related functions +bool IsUvInstalled(); +cpp::result InstallUv( + std::shared_ptr& download_service); +std::vector BuildUvCommand(const std::string& action, + const std::string& directory = ""); +// cpp::result UvDownloadDeps( +// const std::filesystem::path& yaml_path); + +struct PythonSubprocess { + cortex::process::ProcessInfo proc_info; + int port; + uint64_t start_time; + + bool IsAlive() { return cortex::process::IsProcessAlive(proc_info); } + bool Kill() { return cortex::process::KillProcess(proc_info); } +}; +} // namespace python_utils diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc new file mode 100644 index 000000000..f2c1d6c26 --- /dev/null +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -0,0 +1,132 @@ +#include "vllm_engine.h" +#include "utils/curl_utils.h" +#include "utils/logging_utils.h" + +namespace { +cpp::result GetLatestVllmVersion() { + auto result = curl_utils::SimpleGetJson("https://pypi.org/pypi/vllm/json"); + if (result.has_error()) + return result.error(); + + auto version_value = result.value()["info"]["version"]; + if (version_value.isNull()) + return cpp::fail("Can't find version in the response"); + + return version_value.asString(); +} +} // namespace + +VllmEngine::~VllmEngine() { + // NOTE: what happens if we can't kill subprocess? 
+ std::unique_lock write_lock(mutex); + for (auto& [model_name, py_proc] : model_process_map) { + if (py_proc.IsAlive()) + py_proc.Kill(); + } +} + +cpp::result VllmEngine::Download( + std::shared_ptr& download_service, + const std::string& version, const std::optional variant_name) { + if (variant_name.has_value()) { + return cpp::fail("variant_name must be empty"); + } + + if (!python_utils::IsUvInstalled()) { + auto result = python_utils::InstallUv(download_service); + if (result.has_error()) + return result; + } + + std::string concrete_version = version; + if (version == "latest") { + auto result = GetLatestVllmVersion(); + if (result.has_error()) + return cpp::fail(result.error()); + + concrete_version = result.value(); + } + CTL_INF("Download vLLM " << concrete_version); + + const auto vllm_path = + python_utils::GetEnvsPath() / "vllm" / concrete_version; + std::filesystem::create_directories(vllm_path); + const auto vllm_path_str = vllm_path.string(); + + { + // initialize venv + std::vector cmd = + python_utils::BuildUvCommand("venv", vllm_path_str); + auto result = cortex::process::SpawnProcess(cmd); + if (result.has_error()) + return cpp::fail(result.error()); + + // TODO: check return code + // NOTE: these are not async + cortex::process::WaitProcess(result.value()); + } + { + // install vLLM + std::vector cmd = + python_utils::BuildUvCommand("pip", vllm_path_str); + cmd.push_back("install"); + cmd.push_back("vllm==" + concrete_version); + auto result = cortex::process::SpawnProcess(cmd); + if (result.has_error()) + return cpp::fail(result.error()); + + // TODO: check return code + // NOTE: these are not async + cortex::process::WaitProcess(result.value()); + } + + return {}; +} + +void VllmEngine::Load(EngineLoadOption opts) {}; +void VllmEngine::Unload(EngineUnloadOption opts) {}; + +// cortex.llamacpp interface +void VllmEngine::HandleChatCompletion( + std::shared_ptr json_body, + std::function&& callback) {}; +void VllmEngine::HandleEmbedding( + std::shared_ptr json_body, + std::function&& callback) {}; +void VllmEngine::LoadModel( + std::shared_ptr json_body, + std::function&& callback) {}; +void VllmEngine::UnloadModel( + std::shared_ptr json_body, + std::function&& callback) {}; +void VllmEngine::GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) {}; + +// For backward compatible checking +bool VllmEngine::IsSupported(const std::string& f) { + return true; +}; + +// Get list of running models +void VllmEngine::GetModels( + std::shared_ptr jsonBody, + std::function&& callback) {}; + +bool VllmEngine::SetFileLogger(int max_log_lines, const std::string& log_path) { + return true; +}; +void VllmEngine::SetLogLevel(trantor::Logger::LogLevel logLevel) {}; + +// Stop inflight chat completion in stream mode +void VllmEngine::StopInferencing(const std::string& model_id) {}; + +Json::Value VllmEngine::GetRemoteModels() { + return Json::Value{}; +}; +void VllmEngine::HandleRouteRequest( + std::shared_ptr json_body, + std::function&& callback) {}; +void VllmEngine::HandleInference( + std::shared_ptr json_body, + std::function&& callback) {}; diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h new file mode 100644 index 000000000..261da1025 --- /dev/null +++ b/engine/extensions/python-engines/vllm_engine.h @@ -0,0 +1,61 @@ +#include "cortex-common/EngineI.h" +#include "python_utils.h" + +class VllmEngine : public EngineI { + private: + mutable std::shared_mutex mutex; + std::unordered_map + 
model_process_map; + + public: + VllmEngine() {}; + ~VllmEngine(); + + static cpp::result Download( + std::shared_ptr& download_service, + const std::string& version, + const std::optional variant_name); + + virtual void Load(EngineLoadOption opts) override; + virtual void Unload(EngineUnloadOption opts) override; + + // cortex.llamacpp interface + virtual void HandleChatCompletion( + std::shared_ptr json_body, + std::function&& callback) override; + virtual void HandleEmbedding( + std::shared_ptr json_body, + std::function&& callback) override; + virtual void LoadModel( + std::shared_ptr json_body, + std::function&& callback) override; + virtual void UnloadModel( + std::shared_ptr json_body, + std::function&& callback) override; + virtual void GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) override; + + // For backward compatible checking + virtual bool IsSupported(const std::string& f) override; + + // Get list of running models + virtual void GetModels( + std::shared_ptr jsonBody, + std::function&& callback) override; + + virtual bool SetFileLogger(int max_log_lines, + const std::string& log_path) override; + virtual void SetLogLevel(trantor::Logger::LogLevel logLevel) override; + + // Stop inflight chat completion in stream mode + virtual void StopInferencing(const std::string& model_id) override; + + virtual Json::Value GetRemoteModels() override; + virtual void HandleRouteRequest( + std::shared_ptr json_body, + std::function&& callback) override; + virtual void HandleInference( + std::shared_ptr json_body, + std::function&& callback) override; +}; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index db12ea623..9056b6d5a 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -9,7 +9,7 @@ #include "config/model_config.h" #include "database/engines.h" #include "database/models.h" -#include "extensions/python-engine/python_engine.h" +#include "extensions/python-engines/vllm_engine.h" #include "extensions/remote-engine/remote_engine.h" #include "utils/archive_utils.h" @@ -18,11 +18,11 @@ #include "utils/file_manager_utils.h" #include "utils/github_release_utils.h" #include "utils/logging_utils.h" +#include "utils/normalize_engine.h" #include "utils/result.hpp" #include "utils/semantic_version_utils.h" #include "utils/system_info_utils.h" #include "utils/url_parser.h" -#include "utils/normalize_engine.h" namespace { std::string GetSuitableCudaVersion(const std::string& engine, @@ -187,7 +187,7 @@ cpp::result EngineService::UninstallEngineVariant( // Python engine is stored in a separate folder if (ne == kPythonEngine) { - path_to_remove = python_engine::GetPythonEnginePath(); + return cpp::fail("Not implemented"); } else { if (version == std::nullopt && variant == std::nullopt) { // if no version and variant provided, remove all engines variant of that engine @@ -228,9 +228,8 @@ cpp::result EngineService::DownloadEngine( if (engine == kLlamaRepo) { return DownloadLlamaCpp(version, variant_name); - } else if (engine == kPythonEngine) { - // ignore version and variant_name - return python_engine::DownloadUv(download_service_); + } else if (engine == kVllmEngine) { + return VllmEngine::Download(download_service_, version, variant_name); } return cpp::fail("Unknown engine " + engine); } @@ -376,8 +375,8 @@ cpp::result EngineService::DownloadLlamaCpp( cpp::result EngineService::DownloadCuda( const std::string& engine, bool async) { - if (hw_inf_.sys_inf->os == "mac" || engine == kPythonEngine) { - 
// mac and Python engine do not require cuda toolkit + if (hw_inf_.sys_inf->os == "mac" || engine != kLlamaRepo) { + // mac and non-llama.cpp engine do not require cuda toolkit return true; } @@ -622,22 +621,22 @@ EngineService::GetInstalledEngineVariants(const std::string& engine) const { auto ne = cortex::engine::NormalizeEngine(engine); auto os = hw_inf_.sys_inf->os; - if (ne == kPythonEngine) { - if (!python_engine::IsUvInstalled()) { - return {}; - } else { - // Python engine only means uv is installed. - // variant name and version don't quite make sense in this context. - // hence, they are left blank. - std::vector variants; - variants.push_back(EngineVariantResponse{ - .name = "", - .version = "", - .engine = kPythonEngine, - }); - return variants; - } - } + // if (ne == kPythonEngine) { + // if (!python_engine::IsUvInstalled()) { + // return {}; + // } else { + // // Python engine only means uv is installed. + // // variant name and version don't quite make sense in this context. + // // hence, they are left blank. + // std::vector variants; + // variants.push_back(EngineVariantResponse{ + // .name = "", + // .version = "", + // .engine = kPythonEngine, + // }); + // return variants; + // } + // } auto engines_variants_dir = file_manager_utils::GetEnginesContainerPath() / ne; @@ -705,11 +704,11 @@ cpp::result EngineService::LoadEngine( // Check for python engine - if (engine_name == kPythonEngine) { - engines_[engine_name].engine = new python_engine::PythonEngine(); - CTL_INF("Loaded engine: " << engine_name); - return {}; - } + // if (engine_name == kPythonEngine) { + // engines_[engine_name].engine = new python_engine::PythonEngine(); + // CTL_INF("Loaded engine: " << engine_name); + // return {}; + // } // Check for remote engine if (IsRemoteEngine(engine_name)) { @@ -943,14 +942,14 @@ cpp::result EngineService::IsEngineReady( } // Check for python engine - if (engine == kPythonEngine) { - if (!python_engine::IsUvInstalled()) { - return cpp::fail( - "Python engine is not ready. Please run `cortex engines install " - "python`"); - } - return true; - } + // if (engine == kPythonEngine) { + // if (!python_engine::IsUvInstalled()) { + // return cpp::fail( + // "Python engine is not ready. 
Please run `cortex engines install " + // "python`"); + // } + // return true; + // } auto os = hw_inf_.sys_inf->os; diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 42a08f3b8..0b84f2b0c 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -14,7 +14,6 @@ #include "services/inference_service.h" -#include "extensions/python-engine/python_engine.h" #include "utils/cli_selection_utils.h" #include "utils/engine_constants.h" #include "utils/file_manager_utils.h" @@ -544,13 +543,6 @@ ModelService::DownloadModelFromCortexsoAsync( yaml_handler.UpdateModelConfig(mc); yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); - } else if (mc.engine == kPythonEngine) { - const auto model_dir = model_yml_item->localPath.parent_path(); - auto result = python_engine::UvDownloadDeps(model_dir); - if (result.has_error()) { - CTL_ERR(result.error()); - return; - } } auto rel = diff --git a/engine/utils/engine_constants.h b/engine/utils/engine_constants.h index 8eeaa1946..3e1686e2f 100644 --- a/engine/utils/engine_constants.h +++ b/engine/utils/engine_constants.h @@ -2,6 +2,7 @@ constexpr const auto kLlamaEngine = "llama-cpp"; constexpr const auto kPythonEngine = "python-engine"; +constexpr const auto kVllmEngine = "vllm"; constexpr const auto kOpenAiEngine = "openai"; constexpr const auto kAnthropicEngine = "anthropic"; From 4d13014eb0921cedc9e4f8c53abc01c757157b49 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 17 Mar 2025 15:26:45 +0800 Subject: [PATCH 50/73] fix issues with progress streaming --- engine/cli/commands/engine_install_cmd.cc | 28 ++++++++++++------- .../extensions/python-engines/vllm_engine.cc | 13 +++++++-- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/engine/cli/commands/engine_install_cmd.cc b/engine/cli/commands/engine_install_cmd.cc index 3f72a1980..bb1c7cec7 100644 --- a/engine/cli/commands/engine_install_cmd.cc +++ b/engine/cli/commands/engine_install_cmd.cc @@ -44,12 +44,16 @@ bool EngineInstallCmd::Exec(const std::string& engine, dp.Connect(host_, port_); bool need_cuda_download = NeedCudaDownload(engine); // engine can be small, so need to start ws first - auto dp_res = std::async(std::launch::deferred, [&dp, need_cuda_download] { - if (need_cuda_download) { + auto dp_res = std::async(std::launch::deferred, [&dp, need_cuda_download, engine] { + // if (need_cuda_download) { + // return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit}); + // } else { + // return dp.Handle({DownloadType::Engine}); + // } + if (engine == kLlamaRepo) return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit}); - } else { - return dp.Handle({DownloadType::Engine}); - } + else + return dp.Handle({}); }); auto releases_url = url_parser::Url{ @@ -156,12 +160,16 @@ bool EngineInstallCmd::Exec(const std::string& engine, dp.Connect(host_, port_); bool need_cuda_download = NeedCudaDownload(engine); // engine can be small, so need to start ws first - auto dp_res = std::async(std::launch::deferred, [&dp, need_cuda_download] { - if (need_cuda_download) { + auto dp_res = std::async(std::launch::deferred, [&dp, need_cuda_download, engine] { + // if (need_cuda_download) { + // return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit}); + // } else { + // return dp.Handle({DownloadType::Engine}); + // } + if (engine == kLlamaRepo) return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit}); - } else { - return dp.Handle({DownloadType::Engine}); - } + else + return dp.Handle({}); }); 
auto install_url = url_parser::Url{ diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index f2c1d6c26..b549f6c78 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -53,8 +53,8 @@ cpp::result VllmEngine::Download( std::filesystem::create_directories(vllm_path); const auto vllm_path_str = vllm_path.string(); - { - // initialize venv + // initialize venv + if (!std::filesystem::exists(vllm_path / ".venv")) { std::vector cmd = python_utils::BuildUvCommand("venv", vllm_path_str); auto result = cortex::process::SpawnProcess(cmd); @@ -65,8 +65,9 @@ cpp::result VllmEngine::Download( // NOTE: these are not async cortex::process::WaitProcess(result.value()); } + + // install vLLM { - // install vLLM std::vector cmd = python_utils::BuildUvCommand("pip", vllm_path_str); cmd.push_back("install"); @@ -90,15 +91,19 @@ void VllmEngine::Unload(EngineUnloadOption opts) {}; void VllmEngine::HandleChatCompletion( std::shared_ptr json_body, std::function&& callback) {}; + void VllmEngine::HandleEmbedding( std::shared_ptr json_body, std::function&& callback) {}; + void VllmEngine::LoadModel( std::shared_ptr json_body, std::function&& callback) {}; + void VllmEngine::UnloadModel( std::shared_ptr json_body, std::function&& callback) {}; + void VllmEngine::GetModelStatus( std::shared_ptr json_body, std::function&& callback) {}; @@ -124,9 +129,11 @@ void VllmEngine::StopInferencing(const std::string& model_id) {}; Json::Value VllmEngine::GetRemoteModels() { return Json::Value{}; }; + void VllmEngine::HandleRouteRequest( std::shared_ptr json_body, std::function&& callback) {}; + void VllmEngine::HandleInference( std::shared_ptr json_body, std::function&& callback) {}; From 591d4611a0884c2e5ba1226da941f475d2d82995 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 18 Mar 2025 13:06:41 +0800 Subject: [PATCH 51/73] support download HF model --- engine/cli/commands/model_pull_cmd.cc | 9 +- engine/controllers/models.cc | 39 +----- engine/services/model_service.cc | 178 ++++++++++++++++++++------ engine/services/model_service.h | 25 ++-- engine/utils/huggingface_utils.h | 6 + 5 files changed, 173 insertions(+), 84 deletions(-) diff --git a/engine/cli/commands/model_pull_cmd.cc b/engine/cli/commands/model_pull_cmd.cc index 75c0ce1a0..edd11b399 100644 --- a/engine/cli/commands/model_pull_cmd.cc +++ b/engine/cli/commands/model_pull_cmd.cc @@ -65,9 +65,14 @@ std::optional ModelPullCmd::Exec(const std::string& host, int port, } auto download_url = res.value()["downloadUrl"].asString(); + // TODO: when will these 2 be empty? 
if (downloaded.empty() && avails.empty()) { - model_id = id; - model = download_url; + if (res.value()["modelSource"].asString() == "huggingface") { + model = "hf:" + id; + } else { + model_id = id; + model = download_url; + } } else { if (is_cortexso) { auto selection = cli_selection_utils::PrintModelSelection( diff --git a/engine/controllers/models.cc b/engine/controllers/models.cc index a4a218143..0c1041abc 100644 --- a/engine/controllers/models.cc +++ b/engine/controllers/models.cc @@ -27,7 +27,7 @@ void Models::PullModel(const HttpRequestPtr& req, return; } - auto model_handle = (*(req->getJsonObject())).get("model", "").asString(); + auto model_handle = req->getJsonObject()->get("model", "").asString(); if (model_handle.empty()) { Json::Value ret; ret["result"] = "Bad Request"; @@ -38,48 +38,19 @@ void Models::PullModel(const HttpRequestPtr& req, } std::optional desired_model_id = std::nullopt; - auto id = (*(req->getJsonObject())).get("id", "").asString(); + auto id = req->getJsonObject()->get("id", "").asString(); if (!id.empty()) { desired_model_id = id; } std::optional desired_model_name = std::nullopt; - auto name_value = (*(req->getJsonObject())).get("name", "").asString(); - + auto name_value = req->getJsonObject()->get("name", "").asString(); if (!name_value.empty()) { desired_model_name = name_value; } - auto handle_model_input = - [&, model_handle]() -> cpp::result { - CTL_INF("Handle model input, model handle: " + model_handle); - if (string_utils::StartsWith(model_handle, "https")) { - return model_service_->HandleDownloadUrlAsync( - model_handle, desired_model_id, desired_model_name); - } else if (model_handle.find(":") != std::string::npos) { - auto model_and_branch = string_utils::SplitBy(model_handle, ":"); - if (model_and_branch.size() == 3) { - auto mh = url_parser::Url{ - .protocol = "https", - .host = kHuggingFaceHost, - .pathParams = { - model_and_branch[0], - model_and_branch[1], - "resolve", - "main", - model_and_branch[2], - }}.ToFullPath(); - return model_service_->HandleDownloadUrlAsync(mh, desired_model_id, - desired_model_name); - } - return model_service_->DownloadModelFromCortexsoAsync( - model_and_branch[0], model_and_branch[1], desired_model_id); - } - - return cpp::fail("Invalid model handle or not supported!"); - }; - - auto result = handle_model_input(); + auto result = model_service_->PullModel(model_handle, desired_model_id, + desired_model_name); if (result.has_error()) { Json::Value ret; ret["message"] = result.error(); diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index e4fd44352..66f59ed7b 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -97,12 +97,14 @@ void ParseGguf(DatabaseService& db_service, } } -cpp::result GetDownloadTask( - const std::string& modelId, const std::string& branch = "main") { +cpp::result GetCloneRepoDownloadTask( + const std::string& author_id, const std::string& modelId, + const std::string& branch, const std::string& save_dir, + const std::string& task_id) { url_parser::Url url = { .protocol = "https", .host = kHuggingFaceHost, - .pathParams = {"api", "models", "cortexso", modelId, "tree", branch}, + .pathParams = {"api", "models", author_id, modelId, "tree", branch}, }; auto result = curl_utils::SimpleGetJsonRecursive(url.ToFullPath()); @@ -112,7 +114,7 @@ cpp::result GetDownloadTask( std::vector download_items{}; auto model_container_path = file_manager_utils::GetModelsContainerPath() / - "cortex.so" / modelId / branch; + save_dir / modelId / 
branch; file_manager_utils::CreateDirectoryRecursively(model_container_path.string()); for (const auto& value : result.value()) { @@ -125,7 +127,7 @@ cpp::result GetDownloadTask( url_parser::Url download_url = { .protocol = "https", .host = kHuggingFaceHost, - .pathParams = {"cortexso", modelId, "resolve", branch, path}}; + .pathParams = {author_id, modelId, "resolve", branch, path}}; auto local_path = model_container_path / path; if (!std::filesystem::exists(local_path.parent_path())) { @@ -137,9 +139,8 @@ cpp::result GetDownloadTask( .localPath = local_path}); } - return DownloadTask{.id = branch == "main" ? modelId : modelId + "-" + branch, - .type = DownloadType::Model, - .items = download_items}; + return DownloadTask{ + .id = task_id, .type = DownloadType::Model, .items = download_items}; } } // namespace @@ -298,6 +299,55 @@ cpp::result ModelService::HandleDownloadUrlAsync( return download_service_->AddTask(downloadTask, on_finished); } +cpp::result ModelService::DownloadHfModelAsync( + const std::string& author_id, const std::string& model_id, + std::optional temp_model_id) { + + const std::string unique_model_id = + temp_model_id.value_or(author_id + ":" + model_id); + auto model_entry = db_service_->GetModelInfo(unique_model_id); + if (model_entry.has_value() && + model_entry->status == cortex::db::ModelStatus::Downloaded) + return cpp::fail("Please delete the model before downloading again"); + + const std::string branch = "main"; + auto download_task = GetCloneRepoDownloadTask(author_id, model_id, branch, + author_id, unique_model_id); + if (download_task.has_error()) + return download_task; + + auto on_finished = [&, this](const DownloadTask& finishedTask) { + if (!db_service_->HasModel(unique_model_id)) { + cortex::db::ModelEntry model_entry{ + .model = unique_model_id, + .author_repo_id = author_id, + .branch_name = branch, + .path_to_model_yaml = "", + .model_alias = unique_model_id, + .status = cortex::db::ModelStatus::Downloaded, + .engine = kVllmEngine}; + + auto result = db_service_->AddModelEntry(model_entry); + if (result.has_error()) { + CTL_ERR("Error adding model to modellist: " + result.error()); + } + } else { + if (auto m = db_service_->GetModelInfo(unique_model_id); m.has_value()) { + auto upd_m = m.value(); + upd_m.status = cortex::db::ModelStatus::Downloaded; + if (auto r = db_service_->UpdateModelEntry(unique_model_id, upd_m); + r.has_error()) { + CTL_ERR(r.error()); + } + } else { + CTL_WRN("Could not get model entry with model id: " << unique_model_id); + } + } + }; + + return download_service_->AddTask(download_task.value(), on_finished); +} + std::optional ModelService::GetEstimation( const std::string& model_handle) { std::lock_guard l(es_mtx_); @@ -363,24 +413,19 @@ ModelService::DownloadModelFromCortexsoAsync( const std::string& name, const std::string& branch, std::optional temp_model_id) { - auto download_task = GetDownloadTask(name, branch); - if (download_task.has_error()) { - return cpp::fail(download_task.error()); - } - - std::string unique_model_id = ""; - if (temp_model_id.has_value()) { - unique_model_id = temp_model_id.value(); - } else { - unique_model_id = name + ":" + branch; - } - + std::string unique_model_id = temp_model_id.value_or(name + ":" + branch); auto model_entry = db_service_->GetModelInfo(unique_model_id); if (model_entry.has_value() && model_entry->status == cortex::db::ModelStatus::Downloaded) { return cpp::fail("Please delete the model before downloading again"); } + auto download_task = 
GetCloneRepoDownloadTask("cortexso", name, branch, + "cortex.so", unique_model_id); + if (download_task.has_error()) { + return cpp::fail(download_task.error()); + } + auto on_finished = [this, unique_model_id, branch](const DownloadTask& finishedTask) { const DownloadItem* model_yml_item = nullptr; @@ -415,7 +460,6 @@ ModelService::DownloadModelFromCortexsoAsync( mc.size = model_size; yaml_handler.UpdateModelConfig(mc); yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); - } auto rel = @@ -451,9 +495,7 @@ ModelService::DownloadModelFromCortexsoAsync( } }; - auto task = download_task.value(); - task.id = unique_model_id; - return download_service_->AddTask(task, on_finished); + return download_service_->AddTask(download_task.value(), on_finished); } cpp::result ModelService::DeleteModel( @@ -862,28 +904,38 @@ cpp::result ModelService::GetModelPullInfo( huggingface_utils::GetHuggingFaceModelRepoInfo(author, model_name); if (!repo_info.has_value()) { - return cpp::fail("Model not found"); + return cpp::fail("Model not found on " + std::string{kHuggingFaceHost}); } - if (!repo_info->gguf.has_value()) { - return cpp::fail( - "Not a GGUF model. Currently, only GGUF single file is " - "supported."); + // repo containing GGUF files + if (repo_info->gguf.has_value()) { + std::vector options{}; + for (const auto& sibling : repo_info->siblings) { + if (string_utils::EndsWith(sibling.rfilename, ".gguf")) { + options.push_back(sibling.rfilename); + } + } + + return ModelPullInfo{ + .id = author + ":" + model_name, + .downloaded_models = {}, + .available_models = options, + .download_url = + huggingface_utils::GetDownloadableUrl(author, model_name, "")}; } - std::vector options{}; - for (const auto& sibling : repo_info->siblings) { - if (string_utils::EndsWith(sibling.rfilename, ".gguf")) { - options.push_back(sibling.rfilename); - } + // repo that is supported by HF transformers + // we will download the whole repo + if (repo_info->library_name.value_or("") == "transformers") { + return ModelPullInfo{ + .id = author + ":" + model_name, + .model_source = "huggingface", + }; } - return ModelPullInfo{ - .id = author + ":" + model_name, - .downloaded_models = {}, - .available_models = options, - .download_url = - huggingface_utils::GetDownloadableUrl(author, model_name, "")}; + return cpp::fail( + "Unsupported model. 
Currently, only GGUF models and HF models are " + "supported."); } } auto branches = @@ -929,6 +981,52 @@ cpp::result ModelService::GetModelPullInfo( .model_source = "cortexso"}; } +cpp::result ModelService::PullModel( + const std::string& model_handle, + const std::optional& desired_model_id, + const std::optional& desired_model_name) { + CTL_INF("Handle model input, model handle: " + model_handle); + + if (string_utils::StartsWith(model_handle, "https")) + return HandleDownloadUrlAsync(model_handle, desired_model_id, + desired_model_name); + + if (model_handle.find(":") == std::string::npos) + return cpp::fail("Invalid model handle or not supported!"); + + auto model_and_branch = string_utils::SplitBy(model_handle, ":"); + + // cortexso format - model:branch + if (model_and_branch.size() == 2) + return DownloadModelFromCortexsoAsync( + model_and_branch[0], model_and_branch[1], desired_model_id); + + if (model_and_branch.size() == 3) { + // HF model + // hf:author_id:model_name + // NOTE: this may confuse with the format below, where author_id = "hf" + // https://huggingface.co/hf + if (model_and_branch[0] == "hf") + return DownloadHfModelAsync(model_and_branch[1], model_and_branch[2]); + + // single GGUF file + // author_id:model_name:filename + auto mh = url_parser::Url{ + .protocol = "https", + .host = kHuggingFaceHost, + .pathParams = { + model_and_branch[0], + model_and_branch[1], + "resolve", + "main", + model_and_branch[2], + }}.ToFullPath(); + return HandleDownloadUrlAsync(mh, desired_model_id, desired_model_name); + } + + return cpp::fail("Invalid model handle or not supported!"); +} + cpp::result ModelService::AbortDownloadModel( const std::string& task_id) { return download_service_->StopTask(task_id); diff --git a/engine/services/model_service.h b/engine/services/model_service.h index beba91f8c..3a8e32963 100644 --- a/engine/services/model_service.h +++ b/engine/services/model_service.h @@ -39,13 +39,14 @@ class ModelService { std::shared_ptr engine_svc, cortex::TaskQueue& task_queue); + cpp::result PullModel( + const std::string& model_handle, + const std::optional& desired_model_id, + const std::optional& desired_model_name); + cpp::result AbortDownloadModel( const std::string& task_id); - cpp::result DownloadModelFromCortexsoAsync( - const std::string& name, const std::string& branch = "main", - std::optional temp_model_id = std::nullopt); - std::optional GetDownloadedModel( const std::string& modelId) const; @@ -67,10 +68,6 @@ class ModelService { cpp::result GetModelPullInfo( const std::string& model_handle); - cpp::result HandleDownloadUrlAsync( - const std::string& url, std::optional temp_model_id, - std::optional temp_name); - bool HasModel(const std::string& id) const; std::optional GetEstimation( @@ -89,6 +86,18 @@ class ModelService { std::string GetEngineByModelId(const std::string& model_id) const; private: + cpp::result HandleDownloadUrlAsync( + const std::string& url, std::optional temp_model_id, + std::optional temp_name); + + cpp::result DownloadModelFromCortexsoAsync( + const std::string& name, const std::string& branch = "main", + std::optional temp_model_id = std::nullopt); + + cpp::result DownloadHfModelAsync( + const std::string& author_id, const std::string& model_id, + std::optional temp_model_id = std::nullopt); + cpp::result, std::string> MayFallbackToCpu( const std::string& model_path, int ngl, int ctx_len, int n_batch = 2048, int n_ubatch = 2048, const std::string& kv_cache_type = "f16"); diff --git a/engine/utils/huggingface_utils.h 
b/engine/utils/huggingface_utils.h index 14c19084a..f98891b71 100644 --- a/engine/utils/huggingface_utils.h +++ b/engine/utils/huggingface_utils.h @@ -165,6 +165,7 @@ struct HuggingFaceModelRepoInfo { int downloads; int likes; + std::optional library_name; std::optional gguf; std::vector siblings; std::vector spaces; @@ -173,6 +174,10 @@ struct HuggingFaceModelRepoInfo { static cpp::result FromJson( const Json::Value& body) { + std::optional library_name = std::nullopt; + if (body["library_name"]) + library_name = body["library_name"].asString(); + std::optional gguf = std::nullopt; auto gguf_result = HuggingFaceGgufInfo::FromJson(body["gguf"]); if (gguf_result.has_value()) { @@ -202,6 +207,7 @@ struct HuggingFaceModelRepoInfo { .downloads = body["downloads"].asInt(), .likes = body["likes"].asInt(), + .library_name = library_name, .gguf = gguf, .siblings = siblings, .spaces = From c3d41bf893946b17ccb24319bb21e4e818f5e8f5 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 18 Mar 2025 18:11:04 +0800 Subject: [PATCH 52/73] use / for HF model --- engine/cli/commands/model_pull_cmd.cc | 3 +- engine/services/model_service.cc | 46 +++++++++++++++------------ engine/services/model_service.h | 3 +- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/engine/cli/commands/model_pull_cmd.cc b/engine/cli/commands/model_pull_cmd.cc index edd11b399..5e7ce045b 100644 --- a/engine/cli/commands/model_pull_cmd.cc +++ b/engine/cli/commands/model_pull_cmd.cc @@ -65,10 +65,9 @@ std::optional ModelPullCmd::Exec(const std::string& host, int port, } auto download_url = res.value()["downloadUrl"].asString(); - // TODO: when will these 2 be empty? if (downloaded.empty() && avails.empty()) { if (res.value()["modelSource"].asString() == "huggingface") { - model = "hf:" + id; + model = id; } else { model_id = id; model = download_url; diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 66f59ed7b..1751e52b2 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -99,7 +99,7 @@ void ParseGguf(DatabaseService& db_service, cpp::result GetCloneRepoDownloadTask( const std::string& author_id, const std::string& modelId, - const std::string& branch, const std::string& save_dir, + const std::string& branch, const std::vector& save_dir, const std::string& task_id) { url_parser::Url url = { .protocol = "https", @@ -113,8 +113,9 @@ cpp::result GetCloneRepoDownloadTask( } std::vector download_items{}; - auto model_container_path = file_manager_utils::GetModelsContainerPath() / - save_dir / modelId / branch; + auto model_container_path = file_manager_utils::GetModelsContainerPath(); + for (auto subdir : save_dir) + model_container_path /= subdir; file_manager_utils::CreateDirectoryRecursively(model_container_path.string()); for (const auto& value : result.value()) { @@ -300,19 +301,18 @@ cpp::result ModelService::HandleDownloadUrlAsync( } cpp::result ModelService::DownloadHfModelAsync( - const std::string& author_id, const std::string& model_id, - std::optional temp_model_id) { + const std::string& author_id, const std::string& model_id) { - const std::string unique_model_id = - temp_model_id.value_or(author_id + ":" + model_id); + const std::string unique_model_id = author_id + "/" + model_id; auto model_entry = db_service_->GetModelInfo(unique_model_id); if (model_entry.has_value() && model_entry->status == cortex::db::ModelStatus::Downloaded) return cpp::fail("Please delete the model before downloading again"); const std::string branch = "main"; - 
auto download_task = GetCloneRepoDownloadTask(author_id, model_id, branch, - author_id, unique_model_id); + auto download_task = GetCloneRepoDownloadTask( + author_id, model_id, branch, {"huggingface.co", author_id, model_id}, + unique_model_id); if (download_task.has_error()) return download_task; @@ -410,18 +410,20 @@ bool ModelService::HasModel(const std::string& id) const { cpp::result ModelService::DownloadModelFromCortexsoAsync( - const std::string& name, const std::string& branch, + const std::string& model_name, const std::string& branch, std::optional temp_model_id) { - std::string unique_model_id = temp_model_id.value_or(name + ":" + branch); + std::string unique_model_id = + temp_model_id.value_or(model_name + ":" + branch); auto model_entry = db_service_->GetModelInfo(unique_model_id); if (model_entry.has_value() && model_entry->status == cortex::db::ModelStatus::Downloaded) { return cpp::fail("Please delete the model before downloading again"); } - auto download_task = GetCloneRepoDownloadTask("cortexso", name, branch, - "cortex.so", unique_model_id); + auto download_task = GetCloneRepoDownloadTask( + "cortexso", model_name, branch, {"cortex.so", model_name, branch}, + unique_model_id); if (download_task.has_error()) { return cpp::fail(download_task.error()); } @@ -928,7 +930,7 @@ cpp::result ModelService::GetModelPullInfo( // we will download the whole repo if (repo_info->library_name.value_or("") == "transformers") { return ModelPullInfo{ - .id = author + ":" + model_name, + .id = author + "/" + model_name, .model_source = "huggingface", }; } @@ -991,6 +993,15 @@ cpp::result ModelService::PullModel( return HandleDownloadUrlAsync(model_handle, desired_model_id, desired_model_name); + // HF model handle + if (model_handle.find("/") != std::string::npos) { + const auto author_model = string_utils::SplitBy(model_handle, "/"); + if (author_model.size() != 2) + return cpp::fail("Invalid model handle"); + + return DownloadHfModelAsync(author_model[0], author_model[1]); + } + if (model_handle.find(":") == std::string::npos) return cpp::fail("Invalid model handle or not supported!"); @@ -1002,13 +1013,6 @@ cpp::result ModelService::PullModel( model_and_branch[0], model_and_branch[1], desired_model_id); if (model_and_branch.size() == 3) { - // HF model - // hf:author_id:model_name - // NOTE: this may confuse with the format below, where author_id = "hf" - // https://huggingface.co/hf - if (model_and_branch[0] == "hf") - return DownloadHfModelAsync(model_and_branch[1], model_and_branch[2]); - // single GGUF file // author_id:model_name:filename auto mh = url_parser::Url{ diff --git a/engine/services/model_service.h b/engine/services/model_service.h index 3a8e32963..e61d17171 100644 --- a/engine/services/model_service.h +++ b/engine/services/model_service.h @@ -95,8 +95,7 @@ class ModelService { std::optional temp_model_id = std::nullopt); cpp::result DownloadHfModelAsync( - const std::string& author_id, const std::string& model_id, - std::optional temp_model_id = std::nullopt); + const std::string& author_id, const std::string& model_id); cpp::result, std::string> MayFallbackToCpu( const std::string& model_path, int ngl, int ctx_len, int n_batch = 2048, From dc42dddd5ad402a0c0a67e6ac8aba0c7ddb5ccba Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 18 Mar 2025 21:31:54 +0800 Subject: [PATCH 53/73] fix thread-unsafe --- engine/services/model_service.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/engine/services/model_service.cc 
b/engine/services/model_service.cc index 1751e52b2..fb188f4ae 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -309,19 +309,21 @@ cpp::result ModelService::DownloadHfModelAsync( model_entry->status == cortex::db::ModelStatus::Downloaded) return cpp::fail("Please delete the model before downloading again"); - const std::string branch = "main"; auto download_task = GetCloneRepoDownloadTask( - author_id, model_id, branch, {"huggingface.co", author_id, model_id}, + author_id, model_id, "main", {"huggingface.co", author_id, model_id}, unique_model_id); if (download_task.has_error()) return download_task; - auto on_finished = [&, this](const DownloadTask& finishedTask) { + // TODO: validate that this is a vllm-compatible model + auto on_finished = [this, author_id, + unique_model_id](const DownloadTask& finishedTask) { if (!db_service_->HasModel(unique_model_id)) { + CTL_INF("Before creating model entry"); cortex::db::ModelEntry model_entry{ .model = unique_model_id, .author_repo_id = author_id, - .branch_name = branch, + .branch_name = "main", .path_to_model_yaml = "", .model_alias = unique_model_id, .status = cortex::db::ModelStatus::Downloaded, From 73fe3e5ed6eb635933a60c2e758fb01ea6d64624 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 10:32:34 +0800 Subject: [PATCH 54/73] remove methods --- engine/extensions/python-engines/vllm_engine.cc | 12 ------------ engine/extensions/python-engines/vllm_engine.h | 8 -------- 2 files changed, 20 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index b549f6c78..db291a4eb 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -125,15 +125,3 @@ void VllmEngine::SetLogLevel(trantor::Logger::LogLevel logLevel) {}; // Stop inflight chat completion in stream mode void VllmEngine::StopInferencing(const std::string& model_id) {}; - -Json::Value VllmEngine::GetRemoteModels() { - return Json::Value{}; -}; - -void VllmEngine::HandleRouteRequest( - std::shared_ptr json_body, - std::function&& callback) {}; - -void VllmEngine::HandleInference( - std::shared_ptr json_body, - std::function&& callback) {}; diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index 261da1025..2fcfa0d74 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -50,12 +50,4 @@ class VllmEngine : public EngineI { // Stop inflight chat completion in stream mode virtual void StopInferencing(const std::string& model_id) override; - - virtual Json::Value GetRemoteModels() override; - virtual void HandleRouteRequest( - std::shared_ptr json_body, - std::function&& callback) override; - virtual void HandleInference( - std::shared_ptr json_body, - std::function&& callback) override; }; From 7bf287df5c3949429d8400aa3725ffdcc8d743dc Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 10:36:54 +0800 Subject: [PATCH 55/73] remove old remnants --- engine/controllers/server.h | 2 -- engine/cortex-common/python_enginei.h | 27 --------------------------- engine/services/engine_service.cc | 9 +++------ engine/services/inference_service.cc | 16 ---------------- 4 files changed, 3 insertions(+), 51 deletions(-) delete mode 100644 engine/cortex-common/python_enginei.h diff --git a/engine/controllers/server.h b/engine/controllers/server.h index 0dd2d0913..7c8d759b4 100644 --- 
a/engine/controllers/server.h +++ b/engine/controllers/server.h @@ -43,8 +43,6 @@ class server : public drogon::HttpController, ADD_METHOD_TO(server::ChatCompletion, "/v1/chat/completions", Options, Post); ADD_METHOD_TO(server::Embedding, "/v1/embeddings", Options, Post); - ADD_METHOD_TO(server::Python, "/v1/python/{1}/.*", Options, Get, Post); - METHOD_LIST_END void ChatCompletion( diff --git a/engine/cortex-common/python_enginei.h b/engine/cortex-common/python_enginei.h deleted file mode 100644 index 35470f008..000000000 --- a/engine/cortex-common/python_enginei.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include - -#include "json/value.h" -#include "utils/result.hpp" - -class PythonEngineI { - public: - virtual ~PythonEngineI() {} - - // model management - virtual void LoadModel( - std::shared_ptr json_body, - std::function&& callback) = 0; - virtual void UnloadModel( - std::shared_ptr json_body, - std::function&& callback) = 0; - virtual void GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) = 0; - virtual void GetModels( - std::shared_ptr jsonBody, - std::function&& callback) = 0; - - virtual cpp::result GetPort(const std::string& model) = 0; -}; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 25fe1c7a3..e9b256cc3 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -185,10 +185,7 @@ cpp::result EngineService::UninstallEngineVariant( std::optional path_to_remove = std::nullopt; - // Python engine is stored in a separate folder - if (ne == kPythonEngine) { - return cpp::fail("Not implemented"); - } else { + if (ne == kLlamaRepo) { if (version == std::nullopt && variant == std::nullopt) { // if no version and variant provided, remove all engines variant of that engine path_to_remove = file_manager_utils::GetEnginesContainerPath() / ne; @@ -203,6 +200,8 @@ cpp::result EngineService::UninstallEngineVariant( } else { return cpp::fail("No variant provided"); } + } else { + return cpp::fail("Not implemented for engine " + ne); } if (path_to_remove == std::nullopt) { @@ -890,8 +889,6 @@ cpp::result EngineService::UnloadEngine( auto unload_opts = EngineI::EngineUnloadOption{}; e->Unload(unload_opts); delete e; - } else if (std::holds_alternative(engines_[ne].engine)) { - delete std::get(engines_[ne].engine); } else { delete std::get(engines_[ne].engine); } diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index aaff6e65f..4404ac5ea 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -107,9 +107,6 @@ cpp::result InferenceService::HandleChatCompletion( if (std::holds_alternative(engine_result.value())) { std::get(engine_result.value()) ->HandleChatCompletion(json_body, std::move(cb)); - } else if (std::holds_alternative(engine_result.value())) { - return cpp::fail(GetUnsupportedResponse( - "Python engine does not support Chat completion")); } else { std::get(engine_result.value()) ->HandleChatCompletion(json_body, std::move(cb)); @@ -143,9 +140,6 @@ cpp::result InferenceService::HandleEmbedding( if (std::holds_alternative(engine_result.value())) { std::get(engine_result.value()) ->HandleEmbedding(json_body, std::move(cb)); - } else if (std::holds_alternative(engine_result.value())) { - return cpp::fail( - GetUnsupportedResponse("Python engine does not support Embedding")); } else { std::get(engine_result.value()) ->HandleEmbedding(json_body, std::move(cb)); @@ -183,8 +177,6 @@ InferResult 
InferenceService::LoadModel( }; if (std::holds_alternative(engine)) { std::get(engine)->LoadModel(json_body, std::move(cb)); - } else if (std::holds_alternative(engine)) { - std::get(engine)->LoadModel(json_body, std::move(cb)); } else { std::get(engine)->LoadModel(json_body, std::move(cb)); } @@ -221,9 +213,6 @@ InferResult InferenceService::UnloadModel(const std::string& engine_name, if (std::holds_alternative(engine)) { std::get(engine)->UnloadModel( std::make_shared(json_body), std::move(cb)); - } else if (std::holds_alternative(engine)) { - std::get(engine)->UnloadModel( - std::make_shared(json_body), std::move(cb)); } else { std::get(engine)->UnloadModel( std::make_shared(json_body), std::move(cb)); @@ -262,8 +251,6 @@ InferResult InferenceService::GetModelStatus( auto engine = engine_result.value(); if (std::holds_alternative(engine)) { std::get(engine)->GetModelStatus(json_body, std::move(cb)); - } else if (std::holds_alternative(engine)) { - std::get(engine)->GetModelStatus(json_body, std::move(cb)); } else { std::get(engine)->GetModelStatus(json_body, std::move(cb)); } @@ -296,9 +283,6 @@ InferResult InferenceService::GetModels( if (e->IsSupported("GetModels")) { e->GetModels(json_body, std::move(cb)); } - } else if (std::holds_alternative(loaded_engine)) { - std::get(loaded_engine) - ->GetModels(json_body, std::move(cb)); } else { std::get(loaded_engine) ->GetModels(json_body, std::move(cb)); From 2a2b607cee4f3c21868353ae652b3bd17c1c0e70 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 12:09:51 +0800 Subject: [PATCH 56/73] support models list. add --relocatable for venv --- engine/controllers/models.cc | 11 +++++ .../extensions/python-engines/vllm_engine.cc | 1 + engine/services/model_service.cc | 46 +++++++++++-------- 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/engine/controllers/models.cc b/engine/controllers/models.cc index 1c937764a..392e8b5dd 100644 --- a/engine/controllers/models.cc +++ b/engine/controllers/models.cc @@ -178,6 +178,17 @@ void Models::ListModel( data.append(std::move(obj)); continue; } + + if (model_entry.engine == kVllmEngine) { + Json::Value obj; + obj["id"] = model_entry.model; + obj["model"] = model_entry.model; + obj["engine"] = model_entry.engine; + obj["status"] = "downloaded"; + data.append(std::move(obj)); + continue; + } + yaml_handler.ModelConfigFromFile( fmu::ToAbsoluteCortexDataPath( fs::path(model_entry.path_to_model_yaml)) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index db291a4eb..725cb3bfd 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -57,6 +57,7 @@ cpp::result VllmEngine::Download( if (!std::filesystem::exists(vllm_path / ".venv")) { std::vector cmd = python_utils::BuildUvCommand("venv", vllm_path_str); + cmd.push_back("--relocatable"); auto result = cortex::process::SpawnProcess(cmd); if (result.has_error()) return cpp::fail(result.error()); diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 592b3928d..36b2e013c 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -179,7 +179,12 @@ void ModelService::ForceIndexingModelList() { if (model_entry.status != cortex::db::ModelStatus::Downloaded) { continue; } + if (model_entry.engine == kVllmEngine) { + // TODO: check if folder still exists? 
+ continue; + } try { + // check if path_to_model_yaml still exists CTL_DBG(fmu::ToAbsoluteCortexDataPath( fs::path(model_entry.path_to_model_yaml)) .string()); @@ -590,14 +595,20 @@ cpp::result ModelService::StartModel( Json::Value json_data; // Currently we don't support download vision models, so we need to bypass check if (!bypass_model_check) { - auto model_entry = db_service_->GetModelInfo(model_handle); - if (model_entry.has_error()) { - CTL_WRN("Error: " + model_entry.error()); - return cpp::fail(model_entry.error()); + auto result = db_service_->GetModelInfo(model_handle); + if (result.has_error()) { + CTL_WRN("Error: " + result.error()); + return cpp::fail(result.error()); } + auto model_entry = result.value(); + + if (model_entry.engine == kVllmEngine) { + return cpp::fail("vLLM engine models are not supported yet."); + } + yaml_handler.ModelConfigFromFile( fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) + fs::path(model_entry.path_to_model_yaml)) .string()); auto mc = yaml_handler.GetModelConfig(); @@ -605,17 +616,15 @@ cpp::result ModelService::StartModel( if (engine_svc_->IsRemoteEngine(mc.engine)) { (void)engine_svc_->LoadEngine(mc.engine); config::RemoteModelConfig remote_mc; - remote_mc.LoadFromYamlFile( - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string()); - auto remote_engine_entry = - engine_svc_->GetEngineByNameAndVariant(mc.engine); - if (remote_engine_entry.has_error()) { - CTL_WRN("Remote engine error: " + model_entry.error()); - return cpp::fail(remote_engine_entry.error()); + remote_mc.LoadFromYamlFile(fmu::ToAbsoluteCortexDataPath( + fs::path(model_entry.path_to_model_yaml)) + .string()); + auto result = engine_svc_->GetEngineByNameAndVariant(mc.engine); + if (result.has_error()) { + CTL_WRN("Remote engine error: " + result.error()); + return cpp::fail(result.error()); } - auto remote_engine_json = remote_engine_entry.value().ToJson(); + auto remote_engine_json = result.value().ToJson(); json_data = remote_mc.ToJson(); json_data["api_key"] = std::move(remote_engine_json["api_key"]); @@ -623,10 +632,9 @@ cpp::result ModelService::StartModel( !v.empty() && v != "latest") { json_data["version"] = v; } - json_data["model_path"] = - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string(); + json_data["model_path"] = fmu::ToAbsoluteCortexDataPath( + fs::path(model_entry.path_to_model_yaml)) + .string(); json_data["metadata"] = std::move(remote_engine_json["metadata"]); auto ir = From fffc686b585834932befd1485289a70808e8030e Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 13:35:15 +0800 Subject: [PATCH 57/73] preparation works for start model --- .../extensions/python-engines/vllm_engine.cc | 24 ++++++++++++++++ .../extensions/python-engines/vllm_engine.h | 28 ++++++++++--------- engine/services/engine_service.cc | 25 +++++------------ engine/services/model_service.cc | 16 ++++++++--- 4 files changed, 58 insertions(+), 35 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 725cb3bfd..c7ba66793 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -1,6 +1,7 @@ #include "vllm_engine.h" #include "utils/curl_utils.h" #include "utils/logging_utils.h" +#include "utils/system_info_utils.h" namespace { cpp::result GetLatestVllmVersion() { @@ -28,6 +29,12 @@ VllmEngine::~VllmEngine() { cpp::result 
VllmEngine::Download( std::shared_ptr& download_service, const std::string& version, const std::optional variant_name) { + auto system_info = system_info_utils::GetSystemInfo(); + if (!(system_info->os == kLinuxOs && system_info->arch == "amd64" && + system_info_utils::IsNvidiaSmiAvailable())) + return cpp::fail( + "vLLM engine is only supported on Linux x86_64 with Nvidia GPU."); + if (variant_name.has_value()) { return cpp::fail("variant_name must be empty"); } @@ -85,6 +92,23 @@ cpp::result VllmEngine::Download( return {}; } +std::vector VllmEngine::GetVariants() { + const auto vllm_path = python_utils::GetEnvsPath() / "vllm"; + + namespace fs = std::filesystem; + if (!fs::exists(vllm_path)) + return {}; + + std::vector variants; + for (const auto& entry : fs::directory_iterator(vllm_path)) { + const auto name = "linux-amd64-cuda"; // arbitrary + const auto version_str = "v" + entry.path().filename().string(); + const EngineVariantResponse variant{name, version_str, kVllmEngine}; + variants.push_back(variant); + } + return variants; +} + void VllmEngine::Load(EngineLoadOption opts) {}; void VllmEngine::Unload(EngineUnloadOption opts) {}; diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index 2fcfa0d74..35a97a903 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -1,3 +1,4 @@ +#include "common/engine_servicei.h" #include "cortex-common/EngineI.h" #include "python_utils.h" @@ -16,38 +17,39 @@ class VllmEngine : public EngineI { const std::string& version, const std::optional variant_name); - virtual void Load(EngineLoadOption opts) override; - virtual void Unload(EngineUnloadOption opts) override; + static std::vector GetVariants(); + + void Load(EngineLoadOption opts) override; + void Unload(EngineUnloadOption opts) override; // cortex.llamacpp interface - virtual void HandleChatCompletion( + void HandleChatCompletion( std::shared_ptr json_body, std::function&& callback) override; - virtual void HandleEmbedding( + void HandleEmbedding( std::shared_ptr json_body, std::function&& callback) override; - virtual void LoadModel( + void LoadModel( std::shared_ptr json_body, std::function&& callback) override; - virtual void UnloadModel( + void UnloadModel( std::shared_ptr json_body, std::function&& callback) override; - virtual void GetModelStatus( + void GetModelStatus( std::shared_ptr json_body, std::function&& callback) override; // For backward compatible checking - virtual bool IsSupported(const std::string& f) override; + bool IsSupported(const std::string& f) override; // Get list of running models - virtual void GetModels( + void GetModels( std::shared_ptr jsonBody, std::function&& callback) override; - virtual bool SetFileLogger(int max_log_lines, - const std::string& log_path) override; - virtual void SetLogLevel(trantor::Logger::LogLevel logLevel) override; + bool SetFileLogger(int max_log_lines, const std::string& log_path) override; + void SetLogLevel(trantor::Logger::LogLevel logLevel) override; // Stop inflight chat completion in stream mode - virtual void StopInferencing(const std::string& model_id) override; + void StopInferencing(const std::string& model_id) override; }; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index e9b256cc3..b8a3b13d6 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -620,22 +620,8 @@ EngineService::GetInstalledEngineVariants(const std::string& engine) 
const { auto ne = cortex::engine::NormalizeEngine(engine); auto os = hw_inf_.sys_inf->os; - // if (ne == kPythonEngine) { - // if (!python_engine::IsUvInstalled()) { - // return {}; - // } else { - // // Python engine only means uv is installed. - // // variant name and version don't quite make sense in this context. - // // hence, they are left blank. - // std::vector variants; - // variants.push_back(EngineVariantResponse{ - // .name = "", - // .version = "", - // .engine = kPythonEngine, - // }); - // return variants; - // } - // } + if (ne == kVllmEngine) + return VllmEngine::GetVariants(); auto engines_variants_dir = file_manager_utils::GetEnginesContainerPath() / ne; @@ -931,8 +917,6 @@ cpp::result EngineService::IsEngineReady( return true; } - auto os = hw_inf_.sys_inf->os; - auto installed_variants = GetInstalledEngineVariants(engine); if (installed_variants.has_error()) { return cpp::fail(installed_variants.error()); @@ -1119,6 +1103,11 @@ cpp::result EngineService::GetRemoteModels( bool EngineService::IsRemoteEngine(const std::string& engine_name) const { auto ne = Repo2Engine(engine_name); + + if (ne == kLlamaEngine || ne == kVllmEngine) + return false; + return true; + auto local_engines = file_manager_utils::GetCortexConfig().supportedEngines; for (auto const& le : local_engines) { if (le == ne) diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 36b2e013c..119e12b75 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -208,10 +208,18 @@ std::optional ModelService::GetDownloadedModel( const std::string& modelId) const { config::YamlHandler yaml_handler; - auto model_entry = db_service_->GetModelInfo(modelId); - if (!model_entry.has_value()) { + auto result = db_service_->GetModelInfo(modelId); + if (result.has_error()) { return std::nullopt; } + auto model_entry = result.value(); + + // ignore all other params + if (model_entry.engine == kVllmEngine) { + config::ModelConfig cfg; + cfg.engine = kVllmEngine; + return cfg; + } try { config::YamlHandler yaml_handler; @@ -219,11 +227,11 @@ std::optional ModelService::GetDownloadedModel( namespace fmu = file_manager_utils; yaml_handler.ModelConfigFromFile( fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) + fs::path(model_entry.path_to_model_yaml)) .string()); return yaml_handler.GetModelConfig(); } catch (const std::exception& e) { - LOG_ERROR << "Error reading yaml file '" << model_entry->path_to_model_yaml + LOG_ERROR << "Error reading yaml file '" << model_entry.path_to_model_yaml << "': " << e.what(); return std::nullopt; } From cea8020291181227816af1a025a59278b2c38b5c Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 15:56:15 +0800 Subject: [PATCH 58/73] add sync download util. add vLLM version config. 
some boilerplate code to launch model (non-functional atm) --- .../extensions/python-engines/python_utils.cc | 79 +++---- .../extensions/python-engines/python_utils.h | 3 +- .../extensions/python-engines/vllm_engine.cc | 220 +++++++++++------- .../extensions/python-engines/vllm_engine.h | 6 +- engine/services/engine_service.cc | 106 ++++++++- engine/services/engine_service.h | 4 + engine/services/model_service.cc | 19 +- engine/utils/config_yaml_utils.cc | 7 +- engine/utils/config_yaml_utils.h | 1 + engine/utils/curl_utils.cc | 44 ++++ engine/utils/curl_utils.h | 7 +- 11 files changed, 342 insertions(+), 154 deletions(-) diff --git a/engine/extensions/python-engines/python_utils.cc b/engine/extensions/python-engines/python_utils.cc index 5255fcd0e..07297801e 100644 --- a/engine/extensions/python-engines/python_utils.cc +++ b/engine/extensions/python-engines/python_utils.cc @@ -2,6 +2,7 @@ #include #include "utils/archive_utils.h" +#include "utils/curl_utils.h" #include "utils/file_manager_utils.h" #include "utils/set_permission_utils.h" #include "utils/system_info_utils.h" @@ -23,8 +24,7 @@ std::filesystem::path GetUvPath() { bool IsUvInstalled() { return std::filesystem::exists(GetUvPath()); } -cpp::result InstallUv( - std::shared_ptr& download_service) { +cpp::result InstallUv() { const auto py_bin_path = GetPythonEnginesPath() / "bin"; std::filesystem::create_directories(py_bin_path); @@ -58,52 +58,37 @@ cpp::result InstallUv( const std::string url = url_stream.str(); CTL_INF("Download uv from " << url); - auto on_finished = [py_bin_path, - uv_version](const DownloadTask& finishedTask) { - // try to unzip the downloaded file - const std::string download_path = finishedTask.items[0].localPath.string(); - - archive_utils::ExtractArchive(download_path, py_bin_path.string(), true); - set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); - std::filesystem::remove(download_path); - - // install Python3.10 from Astral. this will be preferred over system - // Python when possible. - // NOTE: currently this will install to a user-wide directory. we can - // install to a specific location using `--install-dir`, but later - // invocation of `uv run` needs to have `UV_PYTHON_INSTALL_DIR` set to use - // this Python installation. - // we can add this once we allow passing custom env var to SpawnProcess(). 
- // https://docs.astral.sh/uv/reference/cli/#uv-python-install - std::vector command = BuildUvCommand("python"); - command.push_back("install"); - command.push_back("3.10"); - - // NOTE: errors in download callback won't be propagated to caller - auto result = cortex::process::SpawnProcess(command); - if (result.has_error()) { - CTL_ERR(result.error()); - return; - } - - if (!cortex::process::WaitProcess(result.value())) { - CTL_ERR("Process spawned but fail to wait"); - return; - } - }; - - auto downloadTask = DownloadTask{.id = "python-uv", - .type = DownloadType::Engine, - .items = {DownloadItem{ - .id = "python-uv", - .downloadUrl = url, - .localPath = py_bin_path / fname, - }}}; - - auto add_task_result = download_service->AddTask(downloadTask, on_finished); - if (add_task_result.has_error()) { - return cpp::fail(add_task_result.error()); + const auto save_path = py_bin_path / fname; + auto res = curl_utils::SimpleDownload(url, save_path.string()); + if (res.has_error()) + return res; + + archive_utils::ExtractArchive(save_path, py_bin_path.string(), true); + set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); + std::filesystem::remove(save_path); + + // install Python3.10 from Astral. this will be preferred over system + // Python when possible. + // NOTE: currently this will install to a user-wide directory. we can + // install to a specific location using `--install-dir`, but later + // invocation of `uv run` needs to have `UV_PYTHON_INSTALL_DIR` set to use + // this Python installation. + // we can add this once we allow passing custom env var to SpawnProcess(). + // https://docs.astral.sh/uv/reference/cli/#uv-python-install + std::vector command = BuildUvCommand("python"); + command.push_back("install"); + command.push_back("3.10"); + + auto result = cortex::process::SpawnProcess(command); + if (result.has_error()) + return cpp::fail(result.error()); + + if (!cortex::process::WaitProcess(result.value())) { + const auto msg = "Process spawned but fail to wait"; + CTL_ERR(msg); + return cpp::fail(msg); } + return {}; } diff --git a/engine/extensions/python-engines/python_utils.h b/engine/extensions/python-engines/python_utils.h index 31b0ca0ad..97b2d3f15 100644 --- a/engine/extensions/python-engines/python_utils.h +++ b/engine/extensions/python-engines/python_utils.h @@ -15,8 +15,7 @@ std::filesystem::path GetUvPath(); // UV-related functions bool IsUvInstalled(); -cpp::result InstallUv( - std::shared_ptr& download_service); +cpp::result InstallUv(); std::vector BuildUvCommand(const std::string& action, const std::string& directory = ""); // cpp::result UvDownloadDeps( diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index c7ba66793..4229c32df 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -1,21 +1,29 @@ #include "vllm_engine.h" +#include "services/engine_service.h" #include "utils/curl_utils.h" #include "utils/logging_utils.h" #include "utils/system_info_utils.h" -namespace { -cpp::result GetLatestVllmVersion() { - auto result = curl_utils::SimpleGetJson("https://pypi.org/pypi/vllm/json"); - if (result.has_error()) - return result.error(); +static std::pair CreateResponse( + const std::string& msg, int code) { - auto version_value = result.value()["info"]["version"]; - if (version_value.isNull()) - return cpp::fail("Can't find version in the response"); + Json::Value status, res; + const bool has_error = code != 200; - return 
version_value.asString(); + status["is_done"] = true; + status["has_error"] = has_error; + status["is_stream"] = false; + status["status_code"] = code; + + if (has_error) { + CTL_ERR(msg); + res["error"] = msg; + } else { + res["status"] = msg; + } + + return {status, res}; } -} // namespace VllmEngine::~VllmEngine() { // NOTE: what happens if we can't kill subprocess? @@ -26,72 +34,6 @@ VllmEngine::~VllmEngine() { } } -cpp::result VllmEngine::Download( - std::shared_ptr& download_service, - const std::string& version, const std::optional variant_name) { - auto system_info = system_info_utils::GetSystemInfo(); - if (!(system_info->os == kLinuxOs && system_info->arch == "amd64" && - system_info_utils::IsNvidiaSmiAvailable())) - return cpp::fail( - "vLLM engine is only supported on Linux x86_64 with Nvidia GPU."); - - if (variant_name.has_value()) { - return cpp::fail("variant_name must be empty"); - } - - if (!python_utils::IsUvInstalled()) { - auto result = python_utils::InstallUv(download_service); - if (result.has_error()) - return result; - } - - std::string concrete_version = version; - if (version == "latest") { - auto result = GetLatestVllmVersion(); - if (result.has_error()) - return cpp::fail(result.error()); - - concrete_version = result.value(); - } - CTL_INF("Download vLLM " << concrete_version); - - const auto vllm_path = - python_utils::GetEnvsPath() / "vllm" / concrete_version; - std::filesystem::create_directories(vllm_path); - const auto vllm_path_str = vllm_path.string(); - - // initialize venv - if (!std::filesystem::exists(vllm_path / ".venv")) { - std::vector cmd = - python_utils::BuildUvCommand("venv", vllm_path_str); - cmd.push_back("--relocatable"); - auto result = cortex::process::SpawnProcess(cmd); - if (result.has_error()) - return cpp::fail(result.error()); - - // TODO: check return code - // NOTE: these are not async - cortex::process::WaitProcess(result.value()); - } - - // install vLLM - { - std::vector cmd = - python_utils::BuildUvCommand("pip", vllm_path_str); - cmd.push_back("install"); - cmd.push_back("vllm==" + concrete_version); - auto result = cortex::process::SpawnProcess(cmd); - if (result.has_error()) - return cpp::fail(result.error()); - - // TODO: check return code - // NOTE: these are not async - cortex::process::WaitProcess(result.value()); - } - - return {}; -} - std::vector VllmEngine::GetVariants() { const auto vllm_path = python_utils::GetEnvsPath() / "vllm"; @@ -109,29 +51,123 @@ std::vector VllmEngine::GetVariants() { return variants; } -void VllmEngine::Load(EngineLoadOption opts) {}; -void VllmEngine::Unload(EngineUnloadOption opts) {}; +// NOTE: doesn't do anything +void VllmEngine::Load(EngineLoadOption opts) { + CTL_WRN("EngineLoadOption is ignored"); + return; +}; + +// NOTE: doesn't do anything +void VllmEngine::Unload(EngineUnloadOption opts) { + return; +}; // cortex.llamacpp interface void VllmEngine::HandleChatCompletion( std::shared_ptr json_body, - std::function&& callback) {}; + std::function&& callback) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; void VllmEngine::HandleEmbedding( std::shared_ptr json_body, - std::function&& callback) {}; + std::function&& callback) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; void VllmEngine::LoadModel( std::shared_ptr json_body, - std::function&& callback) {}; + std::function&& callback) { + + if (!json_body->isMember("model")) { + auto [status, error] = + CreateResponse("Missing required fields: model", 400); + 
callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + + { + std::unique_lock write_lock(mutex); + if (model_process_map.find(model) != model_process_map.end()) { + // check if model is still alive + if (model_process_map[model].IsAlive()) { + auto [status, error] = CreateResponse("Model already loaded!", 409); + callback(std::move(status), std::move(error)); + return; + } else { + // if model has exited, try to load model again + CTL_WRN("Model " << model << " has exited unexpectedly"); + model_process_map.erase(model); + } + } + } + + // pid_t pid; + // try { + // // https://docs.astral.sh/uv/reference/cli/#uv-run + // std::vector command = + // python_utils::BuildUvCommand("run", model_dir.string()); + // for (const auto& item : py_cfg.entrypoint) + // command.push_back(item); + + // const std::string stdout_path = (model_dir / "stdout.txt").string(); + // const std::string stderr_path = (model_dir / "stderr.txt").string(); + + // // create empty stdout.txt and stderr.txt for redirection + // if (!std::filesystem::exists(stdout_path)) + // std::ofstream(stdout_path).flush(); + // if (!std::filesystem::exists(stderr_path)) + // std::ofstream(stderr_path).flush(); + + // auto result = + // cortex::process::SpawnProcess(command, stdout_path, stderr_path); + // if (result.has_error()) { + // throw std::runtime_error(result.error()); + // } + + // PythonSubprocess py_proc; + // py_proc.proc_info = result.value(); + // py_proc.port = py_cfg.port; + // py_proc.start_time = std::chrono::system_clock::now().time_since_epoch() / + // std::chrono::milliseconds(1); + + // pid = py_proc.proc_info.pid; + + // std::unique_lock write_lock(mutex); + // model_process_map[model] = py_proc; + + // } catch (const std::exception& e) { + // auto e_msg = e.what(); + // auto [status, error] = CreateResponse(e_msg, k500InternalServerError); + // callback(std::move(status), std::move(error)); + // return; + // } + + // auto [status, res] = CreateResponse( + // "Model loaded successfully with pid: " + std::to_string(pid), k200OK); + // callback(std::move(status), std::move(res)); + + // CTL_WRN("Not implemented"); + // throw std::runtime_error("Not implemented"); +}; void VllmEngine::UnloadModel( std::shared_ptr json_body, - std::function&& callback) {}; + std::function&& callback) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; void VllmEngine::GetModelStatus( std::shared_ptr json_body, - std::function&& callback) {}; + std::function&& callback) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; // For backward compatible checking bool VllmEngine::IsSupported(const std::string& f) { @@ -141,12 +177,22 @@ bool VllmEngine::IsSupported(const std::string& f) { // Get list of running models void VllmEngine::GetModels( std::shared_ptr jsonBody, - std::function&& callback) {}; + std::function&& callback) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; bool VllmEngine::SetFileLogger(int max_log_lines, const std::string& log_path) { - return true; + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; +void VllmEngine::SetLogLevel(trantor::Logger::LogLevel logLevel) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); }; -void VllmEngine::SetLogLevel(trantor::Logger::LogLevel logLevel) {}; // Stop inflight chat completion in stream mode -void VllmEngine::StopInferencing(const std::string& model_id) 
{}; +void VllmEngine::StopInferencing(const std::string& model_id) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index 35a97a903..a6024185e 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -1,3 +1,4 @@ +#include #include "common/engine_servicei.h" #include "cortex-common/EngineI.h" #include "python_utils.h" @@ -12,11 +13,6 @@ class VllmEngine : public EngineI { VllmEngine() {}; ~VllmEngine(); - static cpp::result Download( - std::shared_ptr& download_service, - const std::string& version, - const std::optional variant_name); - static std::vector GetVariants(); void Load(EngineLoadOption opts) override; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index b8a3b13d6..80321e18d 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -225,11 +225,11 @@ cpp::result EngineService::DownloadEngine( const std::string& engine, const std::string& version, const std::optional variant_name) { - if (engine == kLlamaRepo) { + if (engine == kLlamaRepo) return DownloadLlamaCpp(version, variant_name); - } else if (engine == kVllmEngine) { - return VllmEngine::Download(download_service_, version, variant_name); - } + if (engine == kVllmEngine) + return DownloadVllm(version, variant_name); + return cpp::fail("Unknown engine " + engine); } @@ -372,6 +372,83 @@ cpp::result EngineService::DownloadLlamaCpp( return {}; } +cpp::result EngineService::DownloadVllm( + const std::string& version, const std::optional variant_name) { + + auto system_info = system_info_utils::GetSystemInfo(); + if (!(system_info->os == kLinuxOs && system_info->arch == "amd64" && + system_info_utils::IsNvidiaSmiAvailable())) + return cpp::fail( + "vLLM engine is only supported on Linux x86_64 with Nvidia GPU."); + + if (variant_name.has_value()) { + return cpp::fail("variant_name must be empty"); + } + + // NOTE: everything below is not async + // to make it async, we have to run everything in a thread (spawning and waiting + // for subprocesses) + if (!python_utils::IsUvInstalled()) { + auto result = python_utils::InstallUv(); + if (result.has_error()) + return result; + } + + std::string concrete_version = version; + if (version == "latest") { + auto result = curl_utils::SimpleGetJson("https://pypi.org/pypi/vllm/json"); + if (result.has_error()) + return cpp::fail(result.error()); + + auto version_value = result.value()["info"]["version"]; + if (version_value.isNull()) + return cpp::fail("Can't find version in the response"); + concrete_version = version_value.asString(); + } + CTL_INF("Download vLLM " << concrete_version); + + const auto vllm_path = + python_utils::GetEnvsPath() / "vllm" / concrete_version; + std::filesystem::create_directories(vllm_path); + const auto vllm_path_str = vllm_path.string(); + + // initialize venv + if (!std::filesystem::exists(vllm_path / ".venv")) { + std::vector cmd = + python_utils::BuildUvCommand("venv", vllm_path_str); + cmd.push_back("--relocatable"); + auto result = cortex::process::SpawnProcess(cmd); + if (result.has_error()) + return cpp::fail(result.error()); + + // TODO: check return code + // NOTE: these are not async + cortex::process::WaitProcess(result.value()); + } + + // install vLLM + { + std::vector cmd = + python_utils::BuildUvCommand("pip", vllm_path_str); + cmd.push_back("install"); + cmd.push_back("vllm==" 
+ concrete_version); + auto result = cortex::process::SpawnProcess(cmd); + if (result.has_error()) + return cpp::fail(result.error()); + + // TODO: check return code + // one reason this may fail is that the requested version does not exist + // NOTE: these are not async + cortex::process::WaitProcess(result.value()); + } + + auto result = SetDefaultEngineVariant(kVllmEngine, concrete_version, ""); + if (result.has_error()) + return cpp::fail(result.error()); + + return {}; +} + cpp::result EngineService::DownloadCuda( const std::string& engine, bool async) { if (hw_inf_.sys_inf->os == "mac" || engine != kLlamaRepo) { @@ -553,8 +630,14 @@ EngineService::SetDefaultEngineVariant(const std::string& engine, auto normalized_version = string_utils::RemoveSubstring(version, "v"); auto config = file_manager_utils::GetCortexConfig(); - config.llamacppVersion = "v" + normalized_version; - config.llamacppVariant = variant; + if (ne == kLlamaRepo) { + config.llamacppVersion = "v" + normalized_version; + config.llamacppVariant = variant; + } else if (ne == kVllmEngine) { + config.vllmVersion = "v" + normalized_version; + } else { + return cpp::fail("Unrecognized engine " + engine); + } auto result = file_manager_utils::UpdateCortexConfig(config); if (result.has_error()) { return cpp::fail(result.error()); @@ -686,6 +769,7 @@ cpp::result EngineService::LoadEngine( CTL_INF("Engine " << ne << " is already loaded"); return {}; } + CTL_INF("Loading engine: " << ne); // Check for remote engine if (IsRemoteEngine(engine_name)) { @@ -702,9 +786,17 @@ cpp::result EngineService::LoadEngine( return {}; } + // check for vLLM engine + if (engine_name == kVllmEngine) { + auto engine = new VllmEngine(); + EngineI::EngineLoadOption load_opts{}; + engine->Load(load_opts); + engines_[engine_name].engine = engine; + return {}; + } + // End hard code - CTL_INF("Loading engine: " << ne); #if defined(_WIN32) || defined(_WIN64) || defined(__linux__) CTL_INF("CPU Info: " << cortex::cpuid::CpuInfo().to_string()); #endif diff --git a/engine/services/engine_service.h b/engine/services/engine_service.h index a4328f4d2..8e745f55f 100644 --- a/engine/services/engine_service.h +++ b/engine/services/engine_service.h @@ -166,6 +166,10 @@ class EngineService : public EngineServiceI { const std::string& version = "latest", const std::optional variant_name = std::nullopt); + cpp::result DownloadVllm( + const std::string& version = "latest", + const std::optional variant_name = std::nullopt); + cpp::result DownloadCuda(const std::string& engine, bool async = false); diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 119e12b75..bccc5f842 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -226,8 +226,7 @@ std::optional ModelService::GetDownloadedModel( namespace fs = std::filesystem; namespace fmu = file_manager_utils; yaml_handler.ModelConfigFromFile( - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.path_to_model_yaml)) + fmu::ToAbsoluteCortexDataPath(fs::path(model_entry.path_to_model_yaml)) .string()); return yaml_handler.GetModelConfig(); } catch (const std::exception& e) { @@ -611,7 +610,21 @@ cpp::result ModelService::StartModel( auto model_entry = result.value(); if (model_entry.engine == kVllmEngine) { - return cpp::fail("vLLM engine models are not supported yet."); + Json::Value json_data; + json_data["model"] = model_handle; + json_data["engine"] = kVllmEngine; + auto [status, data] = + inference_svc_->LoadModel(std::make_shared(json_data)); + + auto 
status_code = status["status_code"].asInt(); + if (status_code == drogon::k200OK) { + return StartModelResult{true, "vLLM engine ignores all params override"}; + } else if (status_code == drogon::k409Conflict) { + CTL_INF("Model '" + model_handle + "' is already loaded"); + return StartModelResult{.success = true, .warning = ""}; + } else { + return cpp::fail("Model failed to start: " + data["message"].asString()); + } } yaml_handler.ModelConfigFromFile( diff --git a/engine/utils/config_yaml_utils.cc b/engine/utils/config_yaml_utils.cc index dc47590c4..584929da4 100644 --- a/engine/utils/config_yaml_utils.cc +++ b/engine/utils/config_yaml_utils.cc @@ -36,6 +36,7 @@ cpp::result CortexConfigMgr::DumpYamlConfig( node["gitHubToken"] = config.gitHubToken; node["llamacppVariant"] = config.llamacppVariant; node["llamacppVersion"] = config.llamacppVersion; + node["vllmVersion"] = config.vllmVersion; node["enableCors"] = config.enableCors; node["allowedOrigins"] = config.allowedOrigins; node["proxyUrl"] = config.proxyUrl; @@ -80,7 +81,8 @@ CortexConfig CortexConfigMgr::FromYaml(const std::string& path, !node["logOnnxPath"] || !node["huggingFaceToken"] || !node["gitHubUserAgent"] || !node["gitHubToken"] || !node["llamacppVariant"] || !node["llamacppVersion"] || - !node["enableCors"] || !node["allowedOrigins"] || !node["proxyUrl"] || + !node["vllmVersion"] || !node["enableCors"] || + !node["allowedOrigins"] || !node["proxyUrl"] || !node["proxyUsername"] || !node["proxyPassword"] || !node["verifyPeerSsl"] || !node["verifyHostSsl"] || !node["verifyProxySsl"] || !node["verifyProxyHostSsl"] || @@ -138,6 +140,9 @@ CortexConfig CortexConfigMgr::FromYaml(const std::string& path, .llamacppVersion = node["llamacppVersion"] ? node["llamacppVersion"].as() : default_cfg.llamacppVersion, + .vllmVersion = node["vllmVersion"] + ? node["vllmVersion"].as() + : default_cfg.vllmVersion, .enableCors = node["enableCors"] ? 
node["enableCors"].as() : default_cfg.enableCors, .allowedOrigins = diff --git a/engine/utils/config_yaml_utils.h b/engine/utils/config_yaml_utils.h index c871fd100..fab535a88 100644 --- a/engine/utils/config_yaml_utils.h +++ b/engine/utils/config_yaml_utils.h @@ -48,6 +48,7 @@ struct CortexConfig { std::string gitHubToken; std::string llamacppVariant; std::string llamacppVersion; + std::string vllmVersion; bool enableCors; std::vector allowedOrigins; diff --git a/engine/utils/curl_utils.cc b/engine/utils/curl_utils.cc index 859c629d1..cfe847e04 100644 --- a/engine/utils/curl_utils.cc +++ b/engine/utils/curl_utils.cc @@ -373,4 +373,48 @@ cpp::result SimplePatchJson(const std::string& url, return root; } + +cpp::result SimpleDownload(const std::string& url, + const std::string& save_path, + const int timeout) { + auto curl = curl_easy_init(); + if (!curl) { + return cpp::fail("Failed to init CURL"); + } + + auto headers = GetHeaders(url); + curl_slist* curl_headers = nullptr; + if (headers) { + for (const auto& [key, value] : headers->m) { + auto header = key + ": " + value; + curl_headers = curl_slist_append(curl_headers, header.c_str()); + } + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, curl_headers); + } + + auto file = fopen(save_path.c_str(), "wb"); + if (!file) + return cpp::fail("Failed to open " + save_path); + + SetUpProxy(curl, url); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, fwrite); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, file); + if (timeout > 0) { + curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); + } + + // Perform the request + auto res = curl_easy_perform(curl); + fclose(file); + curl_slist_free_all(curl_headers); + curl_easy_cleanup(curl); + if (res != CURLE_OK) { + return cpp::fail("CURL request failed: " + + std::string{curl_easy_strerror(res)}); + } + + return {}; +} } // namespace curl_utils \ No newline at end of file diff --git a/engine/utils/curl_utils.h b/engine/utils/curl_utils.h index 9035b6b3c..91a67077e 100644 --- a/engine/utils/curl_utils.h +++ b/engine/utils/curl_utils.h @@ -37,8 +37,8 @@ cpp::result ReadRemoteYaml(const std::string& url); */ cpp::result SimpleGetJson(const std::string& url, const int timeout = -1); -cpp::result SimpleGetJsonRecursive(const std::string& url, - const int timeout = -1); +cpp::result SimpleGetJsonRecursive( + const std::string& url, const int timeout = -1); cpp::result SimplePostJson( const std::string& url, const std::string& body = ""); @@ -49,4 +49,7 @@ cpp::result SimpleDeleteJson( cpp::result SimplePatchJson( const std::string& url, const std::string& body = ""); +cpp::result SimpleDownload(const std::string& url, + const std::string& save_path, + const int timeout = -1); } // namespace curl_utils From 86d4c01280542716e8cc54151ce378001e91ed63 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 16:00:45 +0800 Subject: [PATCH 59/73] list engines --- engine/services/engine_service.cc | 1 + engine/utils/config_yaml_utils.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 80321e18d..26e4427a0 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -1210,5 +1210,6 @@ bool EngineService::IsRemoteEngine(const std::string& engine_name) const { cpp::result, std::string> EngineService::GetSupportedEngineNames() { + return config_yaml_utils::kDefaultSupportedEngines; return 
file_manager_utils::GetCortexConfig().supportedEngines; } diff --git a/engine/utils/config_yaml_utils.h b/engine/utils/config_yaml_utils.h index fab535a88..f41b00e54 100644 --- a/engine/utils/config_yaml_utils.h +++ b/engine/utils/config_yaml_utils.h @@ -24,7 +24,7 @@ constexpr const auto kDefaultCorsEnabled = true; const std::vector kDefaultEnabledOrigins{ "http://localhost:39281", "http://127.0.0.1:39281", "http://0.0.0.0:39281"}; constexpr const auto kDefaultNoProxy = "example.com,::1,localhost,127.0.0.1"; -const std::vector kDefaultSupportedEngines{kLlamaEngine}; +const std::vector kDefaultSupportedEngines{kLlamaEngine, kVllmEngine}; struct CortexConfig { std::string logFolderPath; From ec8b36d76dbbedea6f5ce5e8cdd5765f5e86f681 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 17:17:30 +0800 Subject: [PATCH 60/73] load and unload model --- .../extensions/python-engines/vllm_engine.cc | 192 ++++++++++++------ .../extensions/python-engines/vllm_engine.h | 8 +- engine/services/engine_service.cc | 34 +++- engine/services/model_service.cc | 39 ++-- 4 files changed, 186 insertions(+), 87 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 4229c32df..6dbda426b 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -4,6 +4,10 @@ #include "utils/logging_utils.h" #include "utils/system_info_utils.h" +namespace { +// figure out port of current running process +const constexpr int CORTEX_PORT = 3928; + static std::pair CreateResponse( const std::string& msg, int code) { @@ -24,6 +28,10 @@ static std::pair CreateResponse( return {status, res}; } +} // namespace + +// cortex_port + 0 is always used (by cortex itself) +VllmEngine::VllmEngine() : port_offsets_{true} {} VllmEngine::~VllmEngine() { // NOTE: what happens if we can't kill subprocess? @@ -51,18 +59,15 @@ std::vector VllmEngine::GetVariants() { return variants; } -// NOTE: doesn't do anything void VllmEngine::Load(EngineLoadOption opts) { - CTL_WRN("EngineLoadOption is ignored"); + version_ = opts.engine_path; // engine path actually contains version info + if (version_[0] == 'v') + version_ = version_.substr(1); return; }; -// NOTE: doesn't do anything -void VllmEngine::Unload(EngineUnloadOption opts) { - return; -}; +void VllmEngine::Unload(EngineUnloadOption opts) {}; -// cortex.llamacpp interface void VllmEngine::HandleChatCompletion( std::shared_ptr json_body, std::function&& callback) { @@ -93,73 +98,142 @@ void VllmEngine::LoadModel( { std::unique_lock write_lock(mutex); if (model_process_map.find(model) != model_process_map.end()) { - // check if model is still alive - if (model_process_map[model].IsAlive()) { + auto proc = model_process_map[model]; + + if (proc.IsAlive()) { auto [status, error] = CreateResponse("Model already loaded!", 409); callback(std::move(status), std::move(error)); return; } else { - // if model has exited, try to load model again + // if model has exited, try to load model again? 
CTL_WRN("Model " << model << " has exited unexpectedly"); model_process_map.erase(model); + port_offsets_[proc.port - CORTEX_PORT] = false; // free the port } } } - // pid_t pid; - // try { - // // https://docs.astral.sh/uv/reference/cli/#uv-run - // std::vector command = - // python_utils::BuildUvCommand("run", model_dir.string()); - // for (const auto& item : py_cfg.entrypoint) - // command.push_back(item); - - // const std::string stdout_path = (model_dir / "stdout.txt").string(); - // const std::string stderr_path = (model_dir / "stderr.txt").string(); - - // // create empty stdout.txt and stderr.txt for redirection - // if (!std::filesystem::exists(stdout_path)) - // std::ofstream(stdout_path).flush(); - // if (!std::filesystem::exists(stderr_path)) - // std::ofstream(stderr_path).flush(); - - // auto result = - // cortex::process::SpawnProcess(command, stdout_path, stderr_path); - // if (result.has_error()) { - // throw std::runtime_error(result.error()); - // } - - // PythonSubprocess py_proc; - // py_proc.proc_info = result.value(); - // py_proc.port = py_cfg.port; - // py_proc.start_time = std::chrono::system_clock::now().time_since_epoch() / - // std::chrono::milliseconds(1); - - // pid = py_proc.proc_info.pid; - - // std::unique_lock write_lock(mutex); - // model_process_map[model] = py_proc; - - // } catch (const std::exception& e) { - // auto e_msg = e.what(); - // auto [status, error] = CreateResponse(e_msg, k500InternalServerError); - // callback(std::move(status), std::move(error)); - // return; - // } - - // auto [status, res] = CreateResponse( - // "Model loaded successfully with pid: " + std::to_string(pid), k200OK); - // callback(std::move(status), std::move(res)); - - // CTL_WRN("Not implemented"); - // throw std::runtime_error("Not implemented"); + pid_t pid; + try { + namespace fs = std::filesystem; + + const auto model_path = file_manager_utils::GetCortexDataPath() / "models" / + kHuggingFaceHost / model; + + auto env_dir = python_utils::GetEnvsPath() / "vllm" / version_; + if (!fs::exists(env_dir)) + throw std::runtime_error(env_dir.string() + " does not exist"); + + int offset = 1; + for (;; offset++) { + // add this guard to prevent endless loop + if (offset >= 100) + throw std::runtime_error("Unable to find an available port"); + + if (port_offsets_.size() <= offset) + port_offsets_.push_back(false); + + // check if port is used + if (!port_offsets_[offset]) + break; + } + const int port = CORTEX_PORT + offset; + + // https://docs.astral.sh/uv/reference/cli/#uv-run + // TODO: pass more args + // TOOD: figure out how to set env vars + // TOOD: set logging config + std::vector cmd = + python_utils::BuildUvCommand("run", env_dir.string()); + cmd.push_back("vllm"); + cmd.push_back("serve"); + cmd.push_back(model_path.string()); + cmd.push_back("--port"); + cmd.push_back(std::to_string(port)); + + auto result = cortex::process::SpawnProcess(cmd); + if (result.has_error()) { + throw std::runtime_error(result.error()); + } + + python_utils::PythonSubprocess py_proc; + py_proc.proc_info = result.value(); + py_proc.port = port; + py_proc.start_time = std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + + pid = py_proc.proc_info.pid; + + std::unique_lock write_lock(mutex); + model_process_map[model] = py_proc; + + } catch (const std::exception& e) { + auto e_msg = e.what(); + auto [status, error] = CreateResponse(e_msg, 500); + callback(std::move(status), std::move(error)); + return; + } + + auto [status, res] = CreateResponse( + "Model 
loaded successfully with pid: " + std::to_string(pid), 200); + callback(std::move(status), std::move(res)); }; void VllmEngine::UnloadModel( std::shared_ptr json_body, std::function&& callback) { - CTL_WRN("Not implemented"); - throw std::runtime_error("Not implemented"); + if (!json_body->isMember("model")) { + auto [status, error] = CreateResponse("Missing required field: model", 400); + callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + + // check if model has started + { + std::shared_lock read_lock(mutex); + if (model_process_map.find(model) == model_process_map.end()) { + const std::string msg = "Model " + model + " has not been loaded yet."; + auto [status, error] = CreateResponse(msg, 400); + callback(std::move(status), std::move(error)); + return; + } + } + + // we know that model has started + { + std::unique_lock write_lock(mutex); + auto proc = model_process_map[model]; + + // TODO: we can use vLLM health check endpoint + // check if subprocess is still alive + // NOTE: is this step necessary? the subprocess could have terminated + // after .IsAlive() and before .Kill() later. + if (!proc.IsAlive()) { + model_process_map.erase(model); + port_offsets_[proc.port - CORTEX_PORT] = false; // free the port + + const std::string msg = "Model " + model + " stopped running."; + auto [status, error] = CreateResponse(msg, 400); + callback(std::move(status), std::move(error)); + return; + } + + // subprocess is alive. we kill it here. + if (!model_process_map[model].Kill()) { + const std::string msg = "Unable to kill process of model " + model; + auto [status, error] = CreateResponse(msg, 500); + callback(std::move(status), std::move(error)); + return; + } + + model_process_map.erase(model); + port_offsets_[proc.port - CORTEX_PORT] = false; // free the port + } + + auto [status, res] = CreateResponse("Unload model successfully", 200); + callback(std::move(status), std::move(res)); }; void VllmEngine::GetModelStatus( diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index a6024185e..c3e073aae 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -5,12 +5,18 @@ class VllmEngine : public EngineI { private: + std::string version_; + + // port_offsets_[i] == true means cortex_port + i is used + // otherwise, cortex_port + i is not used + std::vector port_offsets_; + mutable std::shared_mutex mutex; std::unordered_map model_process_map; public: - VllmEngine() {}; + VllmEngine(); ~VllmEngine(); static std::vector GetVariants(); diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 26e4427a0..60d52846b 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -677,18 +677,23 @@ cpp::result EngineService::IsEngineVariantReady( cpp::result EngineService::GetDefaultEngineVariant(const std::string& engine) { auto ne = cortex::engine::NormalizeEngine(engine); - // current we don't support other engine - if (ne != kLlamaRepo) { - return cpp::fail("Engine " + engine + " is not supported yet!"); - } auto config = file_manager_utils::GetCortexConfig(); - auto variant = config.llamacppVariant; - auto version = config.llamacppVersion; - - if (variant.empty() || version.empty()) { - return cpp::fail("Default engine variant for " + engine + - " is not set yet!"); + std::string variant, version; + if (engine == kLlamaEngine) { + variant = 
config.llamacppVariant; + version = config.llamacppVersion; + if (variant.empty() || version.empty()) + return cpp::fail("Default engine version and variant for " + engine + + " is not set yet!"); + } else if (engine == kVllmEngine) { + variant = ""; + version = config.vllmVersion; + if (version.empty()) + return cpp::fail("Default engine version for " + engine + + " is not set yet!"); + } else { + return cpp::fail("Engine " + engine + " is not supported yet!"); } return DefaultEngineVariant{ @@ -789,7 +794,14 @@ cpp::result EngineService::LoadEngine( // check for vLLM engine if (engine_name == kVllmEngine) { auto engine = new VllmEngine(); - EngineI::EngineLoadOption load_opts{}; + EngineI::EngineLoadOption load_opts; + + auto result = GetDefaultEngineVariant(engine_name); + if (result.has_error()) + return cpp::fail(result.error()); + + // we set version to engine_path + load_opts.engine_path = result.value().version; engine->Load(load_opts); engines_[engine_name].engine = engine; return {}; diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index bccc5f842..d5a4c4a6f 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -322,7 +322,7 @@ cpp::result ModelService::DownloadHfModelAsync( return cpp::fail("Please delete the model before downloading again"); auto download_task = GetCloneRepoDownloadTask( - author_id, model_id, "main", {"huggingface.co", author_id, model_id}, + author_id, model_id, "main", {kHuggingFaceHost, author_id, model_id}, unique_model_id); if (download_task.has_error()) return download_task; @@ -617,13 +617,14 @@ cpp::result ModelService::StartModel( inference_svc_->LoadModel(std::make_shared(json_data)); auto status_code = status["status_code"].asInt(); - if (status == drogon::k200OK) { - return StartModelResult{true, "vLLM engine ignores all params override"}; - } else if (status == drogon::k409Conflict) { + if (status_code == drogon::k200OK) { + return StartModelResult{true, ""}; + } else if (status_code == drogon::k409Conflict) { CTL_INF("Model '" + model_handle + "' is already loaded"); - return StartModelResult{.success = true, .warning = ""}; + return StartModelResult{true, ""}; } else { - return cpp::fail("Model failed to start: " + data["message"].asString()); + return cpp::fail("Model failed to start: " + + data["message"].asString()); } } @@ -789,17 +790,23 @@ cpp::result ModelService::StopModel( bypass_stop_check_set_.end()); std::string engine_name = ""; if (!bypass_check) { - auto model_entry = db_service_->GetModelInfo(model_handle); - if (model_entry.has_error()) { - CTL_WRN("Error: " + model_entry.error()); - return cpp::fail(model_entry.error()); + auto result = db_service_->GetModelInfo(model_handle); + if (result.has_error()) { + CTL_WRN("Error: " + result.error()); + return cpp::fail(result.error()); + } + + const auto model_entry = result.value(); + if (model_entry.engine == kVllmEngine) { + engine_name = kVllmEngine; + } else { + yaml_handler.ModelConfigFromFile( + fmu::ToAbsoluteCortexDataPath( + fs::path(model_entry.path_to_model_yaml)) + .string()); + auto mc = yaml_handler.GetModelConfig(); + engine_name = mc.engine; } - yaml_handler.ModelConfigFromFile( - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string()); - auto mc = yaml_handler.GetModelConfig(); - engine_name = mc.engine; } if (bypass_check) { engine_name = kLlamaEngine; From 92261100143b80ff3d29f9a7b6d38eaa126d1af6 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 
17:33:48 +0800 Subject: [PATCH 61/73] retrieve cortex port from yaml file --- .../extensions/python-engines/vllm_engine.cc | 34 ++++++------------- .../extensions/python-engines/vllm_engine.h | 1 + 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 6dbda426b..42d38c489 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -5,33 +5,21 @@ #include "utils/system_info_utils.h" namespace { -// figure out port of current running process -const constexpr int CORTEX_PORT = 3928; - static std::pair CreateResponse( const std::string& msg, int code) { - Json::Value status, res; - const bool has_error = code != 200; - - status["is_done"] = true; - status["has_error"] = has_error; - status["is_stream"] = false; status["status_code"] = code; - - if (has_error) { - CTL_ERR(msg); - res["error"] = msg; - } else { - res["status"] = msg; - } - + status["has_error"] = code != 200; + res["message"] = msg; return {status, res}; } } // namespace -// cortex_port + 0 is always used (by cortex itself) -VllmEngine::VllmEngine() : port_offsets_{true} {} +VllmEngine::VllmEngine() + : cortex_port_{std::stoi( + file_manager_utils::GetCortexConfig().apiServerPort)}, + port_offsets_{true} // cortex_port + 0 is always used (by cortex itself) +{} VllmEngine::~VllmEngine() { // NOTE: what happens if we can't kill subprocess? @@ -108,7 +96,7 @@ void VllmEngine::LoadModel( // if model has exited, try to load model again? CTL_WRN("Model " << model << " has exited unexpectedly"); model_process_map.erase(model); - port_offsets_[proc.port - CORTEX_PORT] = false; // free the port + port_offsets_[proc.port - cortex_port_] = false; // free the port } } } @@ -137,7 +125,7 @@ void VllmEngine::LoadModel( if (!port_offsets_[offset]) break; } - const int port = CORTEX_PORT + offset; + const int port = cortex_port_ + offset; // https://docs.astral.sh/uv/reference/cli/#uv-run // TODO: pass more args @@ -212,7 +200,7 @@ void VllmEngine::UnloadModel( // after .IsAlive() and before .Kill() later. if (!proc.IsAlive()) { model_process_map.erase(model); - port_offsets_[proc.port - CORTEX_PORT] = false; // free the port + port_offsets_[proc.port - cortex_port_] = false; // free the port const std::string msg = "Model " + model + " stopped running."; auto [status, error] = CreateResponse(msg, 400); @@ -229,7 +217,7 @@ void VllmEngine::UnloadModel( } model_process_map.erase(model); - port_offsets_[proc.port - CORTEX_PORT] = false; // free the port + port_offsets_[proc.port - cortex_port_] = false; // free the port } auto [status, res] = CreateResponse("Unload model successfully", 200); diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index c3e073aae..c41d7de4a 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -6,6 +6,7 @@ class VllmEngine : public EngineI { private: std::string version_; + int cortex_port_; // port_offsets_[i] == true means cortex_port + i is used // otherwise, cortex_port + i is not used From eeccd3a8adb4ec2d30824eb5a37f40cf21ec5bf1 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Mar 2025 11:19:48 +0800 Subject: [PATCH 62/73] add env vars support. 
log stdout and stderr --- .../extensions/python-engines/vllm_engine.cc | 15 ++++- engine/services/engine_service.cc | 10 +-- engine/utils/process/utils.cc | 64 ++++++++++++++++++- engine/utils/process/utils.h | 7 +- 4 files changed, 86 insertions(+), 10 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 42d38c489..23fa85ed6 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -1,4 +1,5 @@ #include "vllm_engine.h" +#include #include "services/engine_service.h" #include "utils/curl_utils.h" #include "utils/logging_utils.h" @@ -138,8 +139,20 @@ void VllmEngine::LoadModel( cmd.push_back(model_path.string()); cmd.push_back("--port"); cmd.push_back(std::to_string(port)); + cmd.push_back("--served-model-name"); + cmd.push_back(model); - auto result = cortex::process::SpawnProcess(cmd); + const auto stdout_file = env_dir / "stdout.log"; + const auto stderr_file = env_dir / "stderr.log"; + + // create empty files for redirection + if (!std::filesystem::exists(stdout_file)) + std::ofstream(stdout_file).flush(); + if (!std::filesystem::exists(stderr_file)) + std::ofstream(stderr_file).flush(); + + auto result = cortex::process::SpawnProcess(cmd, stdout_file.string(), + stderr_file.string()); if (result.has_error()) { throw std::runtime_error(result.error()); } diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 60d52846b..ec675ffde 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -406,16 +406,16 @@ cpp::result EngineService::DownloadVllm( concrete_version = version_value.asString(); } CTL_INF("Download vLLM " << concrete_version); + namespace fs = std::filesystem; const auto vllm_path = python_utils::GetEnvsPath() / "vllm" / concrete_version; - std::filesystem::create_directories(vllm_path); - const auto vllm_path_str = vllm_path.string(); + fs::create_directories(vllm_path); // initialize venv - if (!std::filesystem::exists(vllm_path / ".venv")) { + if (!fs::exists(vllm_path / ".venv")) { std::vector cmd = - python_utils::BuildUvCommand("venv", vllm_path_str); + python_utils::BuildUvCommand("venv", vllm_path.string()); cmd.push_back("--relocatable"); auto result = cortex::process::SpawnProcess(cmd); if (result.has_error()) @@ -429,7 +429,7 @@ cpp::result EngineService::DownloadVllm( // install vLLM { std::vector cmd = - python_utils::BuildUvCommand("pip", vllm_path_str); + python_utils::BuildUvCommand("pip", vllm_path.string()); cmd.push_back("install"); cmd.push_back("vllm==" + concrete_version); auto result = cortex::process::SpawnProcess(cmd); diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 8cd0adc64..ac90b1c09 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -11,6 +11,44 @@ extern char** environ; // environment variables #include #endif +namespace { +// retrieve current env vars, make a copy, then add new env vars from input +std::vector BuildEnvVars( + const std::unordered_map& new_env_vars) { +#if defined(_WIN32) + throw std::runtime_error("Not implemented"); +#endif + + // parse current env var to an unordered map + std::unordered_map env_vars_map; + for (int i = 0; environ[i] != nullptr; i++) { + std::string env_var{environ[i]}; + auto split_idx = env_var.find("="); + + if (split_idx == std::string::npos) { + throw std::runtime_error( + "Error while parsing current environment variables"); + } + + 
env_vars_map[env_var.substr(0, split_idx)] = env_var.substr(split_idx + 1); + } + + // add new env vars. it will override existing env vars + for (const auto& [key, value] : new_env_vars) { + env_vars_map[key] = value; + } + + // convert back to key=value format + std::vector env_vars_vector; + for (const auto& [key, value] : env_vars_map) { + env_vars_vector.push_back(key + "=" + value); + } + + return env_vars_vector; +} + +} // namespace + namespace cortex::process { std::string ConstructWindowsCommandLine(const std::vector& args) { @@ -42,7 +80,10 @@ std::vector ConvertToArgv(const std::vector& args) { cpp::result SpawnProcess( const std::vector& command, const std::string& stdout_file, - const std::string& stderr_file) { + const std::string& stderr_file, + std::optional>> + env_vars) { std::stringstream ss; for (const auto item : command) { ss << item << " "; @@ -191,6 +232,8 @@ cpp::result SpawnProcess( posix_spawn_file_actions_destroy(action_ptr); throw std::runtime_error("Unable to add stdout to file action"); } + } else { + CTL_WRN(stdout_file + " does not exist"); } } @@ -203,18 +246,33 @@ cpp::result SpawnProcess( posix_spawn_file_actions_destroy(action_ptr); throw std::runtime_error("Unable to add stderr to file action"); } + } else { + CTL_WRN(stderr_file + " does not exist"); } } } + char** envp; + // we put these 2 here so that its lifetime lasts entire function + std::vector env_vars_vector; + std::vector env_vars_; + if (env_vars.has_value()) { + env_vars_vector = BuildEnvVars(env_vars.value()); + env_vars_ = ConvertToArgv(env_vars_vector); + envp = env_vars_.data(); + } else { + envp = environ; // simply inherit current env + } + // Use posix_spawn for cross-platform compatibility + // NOTE: posix_spawn() returns after fork() step. it means that we may + // need to keep argv and envp data alive until exec() step finishes. auto spawn_result = posix_spawn(&pid, // pid output command[0].c_str(), // executable path action_ptr, // file actions NULL, // spawn attributes argv.data(), // argument vector - environ // environment (inherit) - ); + envp); // environment // NOTE: it seems like it's ok to destroy this immediately before // subprocess terminates. 
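A minimal caller-side sketch of the optional env_vars parameter this patch adds to SpawnProcess (illustrative only, not part of the patch; the variable name and values are arbitrary). Entries in the map override inherited environment variables of the same name, since BuildEnvVars above merges them on top of the current environ:

    // assumes the declarations pulled in via utils/process/utils.h
    std::vector<std::string> cmd{"sh", "-c", "echo $UV_CACHE_DIR"};
    std::unordered_map<std::string, std::string> extra_env{
        {"UV_CACHE_DIR", "/tmp/uv-cache"}};  // arbitrary example override
    // note: callers in this patch create the redirection files beforehand;
    // missing files are skipped with a CTL_WRN
    auto spawn = cortex::process::SpawnProcess(cmd, "stdout.log", "stderr.log",
                                               extra_env);
    if (spawn.has_error()) {
      CTL_ERR(spawn.error());
    } else {
      cortex::process::WaitProcess(spawn.value());
    }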
diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index 19b821cef..db1ac7460 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -12,7 +12,9 @@ using pid_t = DWORD; #include #endif +#include #include +#include #include #include "utils/result.hpp" @@ -36,7 +38,10 @@ std::vector ConvertToArgv(const std::vector& args); cpp::result SpawnProcess( const std::vector& command, - const std::string& stdout_file = "", const std::string& stderr_file = ""); + const std::string& stdout_file = "", const std::string& stderr_file = "", + std::optional>> + env_vars = {}); bool IsProcessAlive(ProcessInfo& proc_info); bool WaitProcess(ProcessInfo& proc_info); bool KillProcess(ProcessInfo& proc_info); From 6fe7ae877433799aa9094e6b4d07f8aedb386434 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Mar 2025 11:41:02 +0800 Subject: [PATCH 63/73] add GetModelStatus and GetModels --- .../extensions/python-engines/vllm_engine.cc | 86 ++++++++++++++++--- 1 file changed, 76 insertions(+), 10 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 23fa85ed6..fc5603281 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -129,9 +129,6 @@ void VllmEngine::LoadModel( const int port = cortex_port_ + offset; // https://docs.astral.sh/uv/reference/cli/#uv-run - // TODO: pass more args - // TOOD: figure out how to set env vars - // TOOD: set logging config std::vector cmd = python_utils::BuildUvCommand("run", env_dir.string()); cmd.push_back("vllm"); @@ -146,11 +143,13 @@ void VllmEngine::LoadModel( const auto stderr_file = env_dir / "stderr.log"; // create empty files for redirection + // TODO: add limit on file size? if (!std::filesystem::exists(stdout_file)) std::ofstream(stdout_file).flush(); if (!std::filesystem::exists(stderr_file)) std::ofstream(stderr_file).flush(); + // TODO: may want to wait until model is ready i.e. 
health check endpoint auto result = cortex::process::SpawnProcess(cmd, stdout_file.string(), stderr_file.string()); if (result.has_error()) { @@ -240,21 +239,89 @@ void VllmEngine::UnloadModel( void VllmEngine::GetModelStatus( std::shared_ptr json_body, std::function&& callback) { - CTL_WRN("Not implemented"); - throw std::runtime_error("Not implemented"); + + if (!json_body->isMember("model")) { + auto [status, error] = CreateResponse("Missing required field: model", 400); + callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + // check if model has started + { + std::shared_lock read_lock(mutex); + if (model_process_map.find(model) == model_process_map.end()) { + const std::string msg = "Model " + model + " has not been loaded yet."; + auto [status, error] = CreateResponse(msg, 400); + callback(std::move(status), std::move(error)); + return; + } + } + + // we know that model has started + // TODO: just use health check endpoint + { + std::unique_lock write_lock(mutex); + + // check if subprocess is still alive + if (!model_process_map[model].IsAlive()) { + CTL_WRN("Model " << model << " has exited unexpectedly."); + model_process_map.erase(model); + const std::string msg = "Model " + model + " stopped running."; + auto [status, error] = CreateResponse(msg, 400); + callback(std::move(status), std::move(error)); + return; + } + } + + Json::Value res, status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(res)); }; -// For backward compatible checking bool VllmEngine::IsSupported(const std::string& f) { return true; }; -// Get list of running models void VllmEngine::GetModels( std::shared_ptr jsonBody, std::function&& callback) { - CTL_WRN("Not implemented"); - throw std::runtime_error("Not implemented"); + Json::Value res, model_list(Json::arrayValue), status; + { + std::unique_lock write_lock(mutex); + for (auto& [model_name, py_proc] : model_process_map) { + // TODO: check using health endpoint + if (!py_proc.IsAlive()) { + CTL_WRN("Model " << model_name << " has exited unexpectedly."); + model_process_map.erase(model_name); + continue; + } + + Json::Value val; + val["id"] = model_name; + val["engine"] = kVllmEngine; + val["start_time"] = py_proc.start_time; + val["port"] = py_proc.port; + val["object"] = "model"; + // TODO + // val["ram"]; + // val["vram"]; + model_list.append(val); + } + } + + res["object"] = "list"; + res["data"] = model_list; + + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + + callback(std::move(status), std::move(res)); }; bool VllmEngine::SetFileLogger(int max_log_lines, const std::string& log_path) { @@ -266,7 +333,6 @@ void VllmEngine::SetLogLevel(trantor::Logger::LogLevel logLevel) { throw std::runtime_error("Not implemented"); }; -// Stop inflight chat completion in stream mode void VllmEngine::StopInferencing(const std::string& model_id) { CTL_WRN("Not implemented"); throw std::runtime_error("Not implemented"); From 074a04a003bb2836841b1baf1c83549e4846c44e Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Mar 2025 11:54:01 +0800 Subject: [PATCH 64/73] fix typo --- engine/services/engine_service.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index ec675ffde..c80925616 100644 --- 
a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -680,7 +680,7 @@ EngineService::GetDefaultEngineVariant(const std::string& engine) { auto config = file_manager_utils::GetCortexConfig(); std::string variant, version; - if (engine == kLlamaEngine) { + if (engine == kLlamaRepo) { variant = config.llamacppVariant; version = config.llamacppVersion; if (variant.empty() || version.empty()) From 368a4f3188d3375cf76929b0eb87aaa7f4dbfc56 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Mar 2025 16:35:45 +0800 Subject: [PATCH 65/73] add non-stream chat completions --- .../extensions/python-engines/vllm_engine.cc | 97 +++++++++++++------ .../extensions/python-engines/vllm_engine.h | 4 +- engine/services/model_service.cc | 4 + 3 files changed, 76 insertions(+), 29 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index fc5603281..3946d2717 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -24,8 +24,8 @@ VllmEngine::VllmEngine() VllmEngine::~VllmEngine() { // NOTE: what happens if we can't kill subprocess? - std::unique_lock write_lock(mutex); - for (auto& [model_name, py_proc] : model_process_map) { + std::unique_lock write_lock(mutex_); + for (auto& [model_name, py_proc] : model_process_map_) { if (py_proc.IsAlive()) py_proc.Kill(); } @@ -60,15 +60,58 @@ void VllmEngine::Unload(EngineUnloadOption opts) {}; void VllmEngine::HandleChatCompletion( std::shared_ptr json_body, std::function&& callback) { - CTL_WRN("Not implemented"); - throw std::runtime_error("Not implemented"); + + // request validation should be in controller + if (!json_body->isMember("model")) { + auto [status, error] = + CreateResponse("Missing required fields: model", 400); + callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + int port; + // check if model has started + // TODO: use health check instead + { + std::shared_lock read_lock(mutex_); + if (model_process_map_.find(model) == model_process_map_.end()) { + const std::string msg = "Model " + model + " has not been loaded yet."; + auto [status, error] = CreateResponse(msg, 400); + callback(std::move(status), std::move(error)); + return; + } + port = model_process_map_[model].port; + } + + bool stream = (*json_body)["stream"].asBool(); + if (stream) { + auto [status, res] = CreateResponse("stream=true is not yet supported", 400); + callback(std::move(status), std::move(res)); + } else { + const std::string url = + "http://127.0.0.1:" + std::to_string(port) + "/v1/chat/completions"; + auto result = curl_utils::SimplePostJson(url, json_body->toStyledString()); + + if (result.has_error()) { + auto [status, res] = CreateResponse(result.error(), 400); + callback(std::move(status), std::move(res)); + } + + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(result.value())); + } }; void VllmEngine::HandleEmbedding( std::shared_ptr json_body, std::function&& callback) { - CTL_WRN("Not implemented"); - throw std::runtime_error("Not implemented"); + auto [status, res] = CreateResponse("embedding is not yet supported", 400); + callback(std::move(status), std::move(res)); }; void VllmEngine::LoadModel( @@ -85,9 +128,9 @@ void VllmEngine::LoadModel( const std::string model = (*json_body)["model"].asString(); { - 
std::unique_lock write_lock(mutex); - if (model_process_map.find(model) != model_process_map.end()) { - auto proc = model_process_map[model]; + std::unique_lock write_lock(mutex_); + if (model_process_map_.find(model) != model_process_map_.end()) { + auto proc = model_process_map_[model]; if (proc.IsAlive()) { auto [status, error] = CreateResponse("Model already loaded!", 409); @@ -96,7 +139,7 @@ void VllmEngine::LoadModel( } else { // if model has exited, try to load model again? CTL_WRN("Model " << model << " has exited unexpectedly"); - model_process_map.erase(model); + model_process_map_.erase(model); port_offsets_[proc.port - cortex_port_] = false; // free the port } } @@ -164,8 +207,8 @@ void VllmEngine::LoadModel( pid = py_proc.proc_info.pid; - std::unique_lock write_lock(mutex); - model_process_map[model] = py_proc; + std::unique_lock write_lock(mutex_); + model_process_map_[model] = py_proc; } catch (const std::exception& e) { auto e_msg = e.what(); @@ -192,8 +235,8 @@ void VllmEngine::UnloadModel( // check if model has started { - std::shared_lock read_lock(mutex); - if (model_process_map.find(model) == model_process_map.end()) { + std::shared_lock read_lock(mutex_); + if (model_process_map_.find(model) == model_process_map_.end()) { const std::string msg = "Model " + model + " has not been loaded yet."; auto [status, error] = CreateResponse(msg, 400); callback(std::move(status), std::move(error)); @@ -203,15 +246,15 @@ void VllmEngine::UnloadModel( // we know that model has started { - std::unique_lock write_lock(mutex); - auto proc = model_process_map[model]; + std::unique_lock write_lock(mutex_); + auto proc = model_process_map_[model]; // TODO: we can use vLLM health check endpoint // check if subprocess is still alive // NOTE: is this step necessary? the subprocess could have terminated // after .IsAlive() and before .Kill() later. if (!proc.IsAlive()) { - model_process_map.erase(model); + model_process_map_.erase(model); port_offsets_[proc.port - cortex_port_] = false; // free the port const std::string msg = "Model " + model + " stopped running."; @@ -221,14 +264,14 @@ void VllmEngine::UnloadModel( } // subprocess is alive. we kill it here. 
- if (!model_process_map[model].Kill()) { + if (!model_process_map_[model].Kill()) { const std::string msg = "Unable to kill process of model " + model; auto [status, error] = CreateResponse(msg, 500); callback(std::move(status), std::move(error)); return; } - model_process_map.erase(model); + model_process_map_.erase(model); port_offsets_[proc.port - cortex_port_] = false; // free the port } @@ -249,8 +292,8 @@ void VllmEngine::GetModelStatus( const std::string model = (*json_body)["model"].asString(); // check if model has started { - std::shared_lock read_lock(mutex); - if (model_process_map.find(model) == model_process_map.end()) { + std::shared_lock read_lock(mutex_); + if (model_process_map_.find(model) == model_process_map_.end()) { const std::string msg = "Model " + model + " has not been loaded yet."; auto [status, error] = CreateResponse(msg, 400); callback(std::move(status), std::move(error)); @@ -261,12 +304,12 @@ void VllmEngine::GetModelStatus( // we know that model has started // TODO: just use health check endpoint { - std::unique_lock write_lock(mutex); + std::unique_lock write_lock(mutex_); // check if subprocess is still alive - if (!model_process_map[model].IsAlive()) { + if (!model_process_map_[model].IsAlive()) { CTL_WRN("Model " << model << " has exited unexpectedly."); - model_process_map.erase(model); + model_process_map_.erase(model); const std::string msg = "Model " + model + " stopped running."; auto [status, error] = CreateResponse(msg, 400); callback(std::move(status), std::move(error)); @@ -291,12 +334,12 @@ void VllmEngine::GetModels( std::function&& callback) { Json::Value res, model_list(Json::arrayValue), status; { - std::unique_lock write_lock(mutex); - for (auto& [model_name, py_proc] : model_process_map) { + std::unique_lock write_lock(mutex_); + for (auto& [model_name, py_proc] : model_process_map_) { // TODO: check using health endpoint if (!py_proc.IsAlive()) { CTL_WRN("Model " << model_name << " has exited unexpectedly."); - model_process_map.erase(model_name); + model_process_map_.erase(model_name); continue; } diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index c41d7de4a..b13255fe3 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -12,9 +12,9 @@ class VllmEngine : public EngineI { // otherwise, cortex_port + i is not used std::vector port_offsets_; - mutable std::shared_mutex mutex; + mutable std::shared_mutex mutex_; std::unordered_map - model_process_map; + model_process_map_; public: VllmEngine(); diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index c75ed7504..accc9787e 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -1252,6 +1252,10 @@ std::string ModelService::GetEngineByModelId( CTL_WRN("Error: " + model_entry.error()); return ""; } + + if (model_entry.value().engine == kVllmEngine) + return kVllmEngine; + config::YamlHandler yaml_handler; yaml_handler.ModelConfigFromFile( fmu::ToAbsoluteCortexDataPath( From 807b201831a845d24f86579b402c7b98f625d441 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 1 Apr 2025 19:45:21 +0800 Subject: [PATCH 66/73] add uninstall cmd --- .../extensions/python-engines/python_utils.cc | 45 ++++++------------- .../extensions/python-engines/python_utils.h | 9 ++-- engine/services/engine_service.cc | 20 ++++++--- 3 files changed, 32 insertions(+), 42 deletions(-) diff --git 
a/engine/extensions/python-engines/python_utils.cc b/engine/extensions/python-engines/python_utils.cc index 07297801e..005c36b7c 100644 --- a/engine/extensions/python-engines/python_utils.cc +++ b/engine/extensions/python-engines/python_utils.cc @@ -20,11 +20,21 @@ std::filesystem::path GetUvPath() { const auto bin_name = system_info->os == kWindowsOs ? "uv.exe" : "uv"; return GetPythonEnginesPath() / "bin" / bin_name; } +bool UvCleanCache() { + auto cmd = UvBuildCommand("cache"); + cmd.push_back("clean"); + auto result = cortex::process::SpawnProcess(cmd); + if (result.has_error()) { + CTL_INF(result.error()); + return false; + } + return cortex::process::WaitProcess(result.value()); +} -bool IsUvInstalled() { +bool UvIsInstalled() { return std::filesystem::exists(GetUvPath()); } -cpp::result InstallUv() { +cpp::result UvInstall() { const auto py_bin_path = GetPythonEnginesPath() / "bin"; std::filesystem::create_directories(py_bin_path); @@ -75,7 +85,7 @@ cpp::result InstallUv() { // this Python installation. // we can add this once we allow passing custom env var to SpawnProcess(). // https://docs.astral.sh/uv/reference/cli/#uv-python-install - std::vector command = BuildUvCommand("python"); + std::vector command = UvBuildCommand("python"); command.push_back("install"); command.push_back("3.10"); @@ -92,7 +102,7 @@ cpp::result InstallUv() { return {}; } -std::vector BuildUvCommand(const std::string& action, +std::vector UvBuildCommand(const std::string& action, const std::string& directory) { // use our own cache dir so that when users delete cortexcpp/, everything is deleted. const auto cache_dir = GetPythonEnginesPath() / "cache" / "uv"; @@ -106,31 +116,4 @@ std::vector BuildUvCommand(const std::string& action, return command; } -// cpp::result UvDownloadDeps( -// const std::filesystem::path& model_dir) { -// if (!IsUvInstalled()) -// return cpp::fail( -// "uv is not installed. Please run `cortex engines install python`."); - -// std::vector command = BuildUvCommand("sync", model_dir.string()); - -// // script mode. 1st argument is path to .py script -// if (!std::filesystem::exists(model_dir / "pyproject.toml")) { -// config::PythonModelConfig py_cfg; -// py_cfg.ReadFromYaml((model_dir / "model.yml").string()); -// command.push_back("--script"); -// command.push_back(py_cfg.entrypoint[0]); -// } - -// auto result = cortex::process::SpawnProcess(command); -// if (result.has_error()) -// return cpp::fail("Fail to install Python dependencies. 
" + result.error()); - -// if (!cortex::process::WaitProcess(result.value())) { -// return cpp::fail("Fail to install Python dependencies."); -// } - -// return {}; -// } - } // namespace python_utils diff --git a/engine/extensions/python-engines/python_utils.h b/engine/extensions/python-engines/python_utils.h index 97b2d3f15..5206eb7f1 100644 --- a/engine/extensions/python-engines/python_utils.h +++ b/engine/extensions/python-engines/python_utils.h @@ -14,12 +14,11 @@ std::filesystem::path GetEnvsPath(); std::filesystem::path GetUvPath(); // UV-related functions -bool IsUvInstalled(); -cpp::result InstallUv(); -std::vector BuildUvCommand(const std::string& action, +bool UvIsInstalled(); +cpp::result UvInstall(); +std::vector UvBuildCommand(const std::string& action, const std::string& directory = ""); -// cpp::result UvDownloadDeps( -// const std::filesystem::path& yaml_path); +bool UvCleanCache(); struct PythonSubprocess { cortex::process::ProcessInfo proc_info; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index e0205919f..4da119c3d 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -201,6 +201,16 @@ cpp::result EngineService::UninstallEngineVariant( } else { return cpp::fail("No variant provided"); } + } else if (ne == kVllmEngine) { + // variant is ignored for vLLM + if (version == std::nullopt) { + path_to_remove = python_utils::GetEnvsPath() / "vllm"; + + // we only clean uv cache when all vLLM versions are deleted + python_utils::UvCleanCache(); + } else { + path_to_remove = python_utils::GetEnvsPath() / "vllm" / version.value(); + } } else { return cpp::fail("Not implemented for engine " + ne); } @@ -394,8 +404,8 @@ cpp::result EngineService::DownloadVllm( // NOTE: everything below is not async // to make it async, we have to run everything in a thread (spawning and waiting // for subprocesses) - if (!python_utils::IsUvInstalled()) { - auto result = python_utils::InstallUv(); + if (!python_utils::UvIsInstalled()) { + auto result = python_utils::UvInstall(); if (result.has_error()) return result; } @@ -421,21 +431,20 @@ cpp::result EngineService::DownloadVllm( // initialize venv if (!fs::exists(vllm_path / ".venv")) { std::vector cmd = - python_utils::BuildUvCommand("venv", vllm_path.string()); + python_utils::UvBuildCommand("venv", vllm_path.string()); cmd.push_back("--relocatable"); auto result = cortex::process::SpawnProcess(cmd); if (result.has_error()) return cpp::fail(result.error()); // TODO: check return code - // NOTE: these are not async cortex::process::WaitProcess(result.value()); } // install vLLM { std::vector cmd = - python_utils::BuildUvCommand("pip", vllm_path.string()); + python_utils::UvBuildCommand("pip", vllm_path.string()); cmd.push_back("install"); cmd.push_back("vllm==" + concrete_version); auto result = cortex::process::SpawnProcess(cmd); @@ -444,7 +453,6 @@ cpp::result EngineService::DownloadVllm( // TODO: check return code // one reason this may fail is that the requested version does not exist - // NOTE: these are not async cortex::process::WaitProcess(result.value()); } From d38eca8a47d295a4f6c640d2dc942c0866ec66cd Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 1 Apr 2025 21:35:27 +0800 Subject: [PATCH 67/73] support streaming --- .../extensions/python-engines/vllm_engine.cc | 98 +++++++++++++++++-- .../extensions/python-engines/vllm_engine.h | 4 + engine/services/engine_service.cc | 1 + 3 files changed, 95 insertions(+), 8 deletions(-) diff --git 
a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 3946d2717..9564c13a4 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -14,13 +14,47 @@ static std::pair CreateResponse( res["message"] = msg; return {status, res}; } + +// this is mostly copied from local_engine.cc +struct StreamContext { + std::shared_ptr> callback; + bool need_stop; + + static size_t write_callback(char* ptr, size_t size, size_t nmemb, + void* userdata) { + auto* ctx = static_cast(userdata); + size_t data_length = size * nmemb; + if (data_length <= 6) + return data_length; + + std::string chunk{ptr, data_length}; + CTL_INF(chunk); + Json::Value status; + status["is_stream"] = true; + status["has_error"] = false; + status["status_code"] = 200; + Json::Value chunk_json; + chunk_json["data"] = chunk; + + if (chunk.find("[DONE]") != std::string::npos) { + status["is_done"] = true; + ctx->need_stop = false; + } else { + status["is_done"] = false; + } + + (*ctx->callback)(std::move(status), std::move(chunk_json)); + return data_length; + }; +}; + } // namespace VllmEngine::VllmEngine() : cortex_port_{std::stoi( file_manager_utils::GetCortexConfig().apiServerPort)}, - port_offsets_{true} // cortex_port + 0 is always used (by cortex itself) -{} + port_offsets_{true}, // cortex_port + 0 is always used (by cortex itself) + queue_{2 /* threadNum */, "vLLM engine"} {} VllmEngine::~VllmEngine() { // NOTE: what happens if we can't kill subprocess? @@ -84,14 +118,62 @@ void VllmEngine::HandleChatCompletion( port = model_process_map_[model].port; } + const std::string url = + "http://127.0.0.1:" + std::to_string(port) + "/v1/chat/completions"; + const std::string json_str = json_body->toStyledString(); + bool stream = (*json_body)["stream"].asBool(); if (stream) { - auto [status, res] = CreateResponse("stream=true is not yet supported", 400); - callback(std::move(status), std::move(res)); + queue_.runTaskInQueue([url = std::move(url), json_str = std::move(json_str), + callback = std::move(callback)] { + CURL* curl = curl_easy_init(); + if (!curl) { + auto [status, res] = CreateResponse("Internal server error", 500); + callback(std::move(status), std::move(res)); + } + + struct curl_slist* headers = nullptr; + headers = curl_slist_append(headers, "Content-Type: application/json"); + + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_str.c_str()); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, json_str.length()); + curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1L); + + StreamContext ctx; + ctx.callback = + std::make_shared>( + callback); + ctx.need_stop = true; + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, + StreamContext::write_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &ctx); + + CURLcode res = curl_easy_perform(curl); + if (res != CURLE_OK) { + auto msg = curl_easy_strerror(res); + auto [status, res] = CreateResponse(msg, 500); + callback(std::move(status), std::move(res)); + } + + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + if (ctx.need_stop) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + callback(std::move(status), Json::Value{}); + } + + return; + }); } else { - const std::string url = - "http://127.0.0.1:" + std::to_string(port) + 
"/v1/chat/completions"; - auto result = curl_utils::SimplePostJson(url, json_body->toStyledString()); + // non-streaming + auto result = curl_utils::SimplePostJson(url, json_str); if (result.has_error()) { auto [status, res] = CreateResponse(result.error(), 400); @@ -173,7 +255,7 @@ void VllmEngine::LoadModel( // https://docs.astral.sh/uv/reference/cli/#uv-run std::vector cmd = - python_utils::BuildUvCommand("run", env_dir.string()); + python_utils::UvBuildCommand("run", env_dir.string()); cmd.push_back("vllm"); cmd.push_back("serve"); cmd.push_back(model_path.string()); diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index b13255fe3..d7724b703 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -2,6 +2,7 @@ #include "common/engine_servicei.h" #include "cortex-common/EngineI.h" #include "python_utils.h" +#include "trantor/utils/ConcurrentTaskQueue.h" class VllmEngine : public EngineI { private: @@ -16,6 +17,9 @@ class VllmEngine : public EngineI { std::unordered_map model_process_map_; + // TODO: will use cortex's main TaskQueue once llama.cpp PR is merged + trantor::ConcurrentTaskQueue queue_; + public: VllmEngine(); ~VllmEngine(); diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 4da119c3d..9df6b74a2 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -433,6 +433,7 @@ cpp::result EngineService::DownloadVllm( std::vector cmd = python_utils::UvBuildCommand("venv", vllm_path.string()); cmd.push_back("--relocatable"); + cmd.push_back("--seed"); auto result = cortex::process::SpawnProcess(cmd); if (result.has_error()) return cpp::fail(result.error()); From 7e002cd4a11b3d2572d819194e76b58996ba9aab Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 1 Apr 2025 21:52:04 +0800 Subject: [PATCH 68/73] fix cortex run --- engine/cli/commands/chat_completion_cmd.cc | 6 +++++- engine/cli/commands/run_cmd.cc | 17 ++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/engine/cli/commands/chat_completion_cmd.cc b/engine/cli/commands/chat_completion_cmd.cc index 77ee4fca3..6b52464f3 100644 --- a/engine/cli/commands/chat_completion_cmd.cc +++ b/engine/cli/commands/chat_completion_cmd.cc @@ -137,7 +137,11 @@ void ChatCompletionCmd::Exec(const std::string& host, int port, new_data["content"] = user_input; histories_.push_back(std::move(new_data)); - Json::Value json_data = mc.ToJson(); + // vLLM doesn't support params used model config + Json::Value json_data; + if (mc.engine != kVllmEngine) { + json_data = mc.ToJson(); + } json_data["engine"] = mc.engine; Json::Value msgs_array(Json::arrayValue); diff --git a/engine/cli/commands/run_cmd.cc b/engine/cli/commands/run_cmd.cc index c01d3d806..25f3ae45d 100644 --- a/engine/cli/commands/run_cmd.cc +++ b/engine/cli/commands/run_cmd.cc @@ -84,11 +84,18 @@ void RunCmd::Exec(bool run_detach, CLI_LOG("Error: " + model_entry.error()); return; } - yaml_handler.ModelConfigFromFile( - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string()); - auto mc = yaml_handler.GetModelConfig(); + + config::ModelConfig mc; + if (model_entry.value().engine == kVllmEngine) { + // vLLM engine doesn't have model config + mc.engine = kVllmEngine; + } else { + yaml_handler.ModelConfigFromFile( + fmu::ToAbsoluteCortexDataPath( + fs::path(model_entry.value().path_to_model_yaml)) + .string()); + mc = yaml_handler.GetModelConfig(); 
+ } // Check if engine existed. If not, download it { From 1ebbbdb8cb0aaa70c14f6a644716a5b924fdc192 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 2 Apr 2025 20:31:49 +0800 Subject: [PATCH 69/73] wait for vLLM server to be up --- .../extensions/python-engines/vllm_engine.cc | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 9564c13a4..e8192e569 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -280,15 +280,27 @@ void VllmEngine::LoadModel( if (result.has_error()) { throw std::runtime_error(result.error()); } + auto proc_info = result.value(); + pid = proc_info.pid; + + // wait for server to be up + while (true) { + CTL_INF("Wait for vLLM server to be up. Sleep for 5s"); + std::this_thread::sleep_for(std::chrono::seconds(5)); + if (!cortex::process::IsProcessAlive(proc_info)) + throw std::runtime_error("vLLM subprocess fails to start"); + + const auto url = "http://127.0.0.1:" + std::to_string(port) + "/health"; + if (curl_utils::SimpleGet(url).has_value()) + break; + } python_utils::PythonSubprocess py_proc; - py_proc.proc_info = result.value(); + py_proc.proc_info = proc_info; py_proc.port = port; py_proc.start_time = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1); - pid = py_proc.proc_info.pid; - std::unique_lock write_lock(mutex_); model_process_map_[model] = py_proc; From b5d83156cc63b5f44a0ebc5f57ae1f709de86fda Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 2 Apr 2025 20:45:52 +0800 Subject: [PATCH 70/73] use health check for some stuff --- .../extensions/python-engines/vllm_engine.cc | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index e8192e569..3cc30e37f 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -106,7 +106,6 @@ void VllmEngine::HandleChatCompletion( const std::string model = (*json_body)["model"].asString(); int port; // check if model has started - // TODO: use health check instead { std::shared_lock read_lock(mutex_); if (model_process_map_.find(model) == model_process_map_.end()) { @@ -274,7 +273,6 @@ void VllmEngine::LoadModel( if (!std::filesystem::exists(stderr_file)) std::ofstream(stderr_file).flush(); - // TODO: may want to wait until model is ready i.e. health check endpoint auto result = cortex::process::SpawnProcess(cmd, stdout_file.string(), stderr_file.string()); if (result.has_error()) { @@ -284,6 +282,7 @@ void VllmEngine::LoadModel( pid = proc_info.pid; // wait for server to be up + // NOTE: should we add a timeout to avoid endless loop? while (true) { CTL_INF("Wait for vLLM server to be up. Sleep for 5s"); std::this_thread::sleep_for(std::chrono::seconds(5)); @@ -343,7 +342,6 @@ void VllmEngine::UnloadModel( std::unique_lock write_lock(mutex_); auto proc = model_process_map_[model]; - // TODO: we can use vLLM health check endpoint // check if subprocess is still alive // NOTE: is this step necessary? the subprocess could have terminated // after .IsAlive() and before .Kill() later. 
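One possible way to bound the startup wait introduced in [PATCH 69/73] above, where the NOTE asks whether the loop needs a timeout (a sketch under that assumption, not part of the patch; the probe count is arbitrary). It reuses IsProcessAlive and curl_utils::SimpleGet exactly as the patch does, and throws so the surrounding try/catch in LoadModel turns the failure into a 500 response:

    constexpr int kMaxHealthProbes = 60;  // illustrative cap, ~5 minutes
    for (int i = 0;; i++) {
      if (i >= kMaxHealthProbes)
        throw std::runtime_error("vLLM server did not become healthy in time");
      CTL_INF("Wait for vLLM server to be up. Sleep for 5s");
      std::this_thread::sleep_for(std::chrono::seconds(5));
      if (!cortex::process::IsProcessAlive(proc_info))
        throw std::runtime_error("vLLM subprocess fails to start");
      const auto url = "http://127.0.0.1:" + std::to_string(port) + "/health";
      if (curl_utils::SimpleGet(url).has_value())
        break;  // /health returned 200, server is ready
    }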
@@ -396,27 +394,32 @@ void VllmEngine::GetModelStatus( } // we know that model has started - // TODO: just use health check endpoint { std::unique_lock write_lock(mutex_); + auto py_proc = model_process_map_[model]; + + // health check endpoint + const auto url = + "http://127.0.0.1:" + std::to_string(py_proc.port) + "/health"; + if (curl_utils::SimpleGet(url).has_value()) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), Json::Value{}); + } else { + // try to kill the subprocess to free resources, in case the server hangs + // instead of subprocess has died. + py_proc.Kill(); - // check if subprocess is still alive - if (!model_process_map_[model].IsAlive()) { CTL_WRN("Model " << model << " has exited unexpectedly."); model_process_map_.erase(model); const std::string msg = "Model " + model + " stopped running."; auto [status, error] = CreateResponse(msg, 400); callback(std::move(status), std::move(error)); - return; } } - - Json::Value res, status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = 200; - callback(std::move(status), std::move(res)); }; bool VllmEngine::IsSupported(const std::string& f) { @@ -430,8 +433,13 @@ void VllmEngine::GetModels( { std::unique_lock write_lock(mutex_); for (auto& [model_name, py_proc] : model_process_map_) { - // TODO: check using health endpoint - if (!py_proc.IsAlive()) { + const auto url = + "http://127.0.0.1:" + std::to_string(py_proc.port) + "/health"; + if (curl_utils::SimpleGet(url).has_error()) { + // try to kill the subprocess to free resources, in case the server hangs + // instead of subprocess has died. + py_proc.Kill(); + CTL_WRN("Model " << model_name << " has exited unexpectedly."); model_process_map_.erase(model_name); continue; From 5feda51361d7cfe0baab558e0b8582cd72fe5ce3 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 2 Apr 2025 21:19:48 +0800 Subject: [PATCH 71/73] add some notes. support embeddings. support some extra vLLM args --- .../extensions/python-engines/vllm_engine.cc | 65 ++++++++++++++++++- engine/services/inference_service.cc | 4 +- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 3cc30e37f..5bdab068a 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -1,3 +1,9 @@ +// Note on subprocess lifecycle +// In LoadModel(), we will wait until /health returns 200. Thus, in subsequent +// calls to the subprocess, if the server is working normally, /health is +// guaranteed to return 200. If it doesn't, it either means the subprocess has +// died or the server hangs (for whatever reason). 
+ #include "vllm_engine.h" #include #include "services/engine_service.h" @@ -82,6 +88,7 @@ std::vector VllmEngine::GetVariants() { return variants; } +// TODO: once llama-server is merged, check if checking 'v' is still needed void VllmEngine::Load(EngineLoadOption opts) { version_ = opts.engine_path; // engine path actually contains version info if (version_[0] == 'v') @@ -95,7 +102,7 @@ void VllmEngine::HandleChatCompletion( std::shared_ptr json_body, std::function&& callback) { - // request validation should be in controller + // NOTE: request validation should be in controller if (!json_body->isMember("model")) { auto [status, error] = CreateResponse("Missing required fields: model", 400); @@ -188,11 +195,49 @@ void VllmEngine::HandleChatCompletion( } }; +// NOTE: we don't have an option to pass --task embed to vLLM spawn yet void VllmEngine::HandleEmbedding( std::shared_ptr json_body, std::function&& callback) { - auto [status, res] = CreateResponse("embedding is not yet supported", 400); - callback(std::move(status), std::move(res)); + + if (!json_body->isMember("model")) { + auto [status, error] = + CreateResponse("Missing required fields: model", 400); + callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + int port; + // check if model has started + { + std::shared_lock read_lock(mutex_); + if (model_process_map_.find(model) == model_process_map_.end()) { + const std::string msg = "Model " + model + " has not been loaded yet."; + auto [status, error] = CreateResponse(msg, 400); + callback(std::move(status), std::move(error)); + return; + } + port = model_process_map_[model].port; + } + + const std::string url = + "http://127.0.0.1:" + std::to_string(port) + "/v1/embeddings"; + const std::string json_str = json_body->toStyledString(); + + auto result = curl_utils::SimplePostJson(url, json_str); + + if (result.has_error()) { + auto [status, res] = CreateResponse(result.error(), 400); + callback(std::move(status), std::move(res)); + } + + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(result.value())); }; void VllmEngine::LoadModel( @@ -213,6 +258,10 @@ void VllmEngine::LoadModel( if (model_process_map_.find(model) != model_process_map_.end()) { auto proc = model_process_map_[model]; + // NOTE: each vLLM instance can only serve 1 task. It means that the + // following logic will not allow serving the same model for 2 different + // tasks at the same time. + // To support it, we also need to know how vLLM decides the default task. if (proc.IsAlive()) { auto [status, error] = CreateResponse("Model already loaded!", 409); callback(std::move(status), std::move(error)); @@ -263,6 +312,16 @@ void VllmEngine::LoadModel( cmd.push_back("--served-model-name"); cmd.push_back(model); + // NOTE: we might want to adjust max-model-len automatically, since vLLM + // may OOM for large models as it tries to allocate full context length. 
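  // Illustrative example (not from the patch; model name and value are made up):
  // a load request body of {"model": "...", "task": "embed", "max-model-len": "8192"}
  // would be forwarded by the loop below as `--task embed --max-model-len 8192`
  // on the `vllm serve` command line assembled above.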
+ const std::string EXTRA_ARGS[] = {"task", "max-model-len"}; + for (const auto arg : EXTRA_ARGS) { + if (json_body->isMember(arg)) { + cmd.push_back("--" + arg); + cmd.push_back((*json_body)[arg].asString()); + } + } + const auto stdout_file = env_dir / "stdout.log"; const auto stderr_file = env_dir / "stderr.log"; diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index bd79a6ce5..f1d38e76a 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -119,7 +119,9 @@ cpp::result InferenceService::HandleEmbedding( std::shared_ptr q, std::shared_ptr json_body) { std::string engine_type; if (!HasFieldInReq(json_body, "engine")) { - engine_type = kLlamaRepo; + auto engine_type_maybe = + GetEngineByModelId((*json_body)["model"].asString()); + engine_type = engine_type_maybe.empty() ? kLlamaRepo : engine_type_maybe; } else { engine_type = (*(json_body)).get("engine", kLlamaRepo).asString(); } From 5eea3452e4b3f4a552acf27be3468d7f557fd1d7 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 2 Apr 2025 21:26:43 +0800 Subject: [PATCH 72/73] remove old tests. some chores --- engine/e2e-test/api/engines/test_api_engine.py | 10 ---------- .../api/engines/test_api_engine_install_nightly.py | 4 ---- .../e2e-test/cli/engines/test_cli_engine_install.py | 11 ----------- .../cli/engines/test_cli_engine_install_nightly.py | 11 ----------- engine/extensions/python-engines/python_utils.cc | 2 +- engine/extensions/python-engines/vllm_engine.cc | 1 + 6 files changed, 2 insertions(+), 37 deletions(-) diff --git a/engine/e2e-test/api/engines/test_api_engine.py b/engine/e2e-test/api/engines/test_api_engine.py index 22fadf5d0..dbdf2dbe9 100644 --- a/engine/e2e-test/api/engines/test_api_engine.py +++ b/engine/e2e-test/api/engines/test_api_engine.py @@ -52,16 +52,6 @@ async def test_engines_install_uninstall_llamacpp_should_be_successful(self): response = requests.delete("http://localhost:3928/v1/engines/llama-cpp/install") assert response.status_code == 200 - @pytest.mark.asyncio - async def test_engines_install_uninstall_python_should_be_successful(self): - response = requests.post("http://localhost:3928/v1/engines/python-engine/install") - assert response.status_code == 200 - await wait_for_websocket_download_success_event(timeout=None) - time.sleep(30) - - response = requests.delete("http://localhost:3928/v1/engines/python-engine/install") - assert response.status_code == 200 - @pytest.mark.asyncio async def test_engines_install_uninstall_llamacpp_with_only_version_should_be_failed(self): # install first diff --git a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py index 50dbbeee5..e92afb14b 100644 --- a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py +++ b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py @@ -22,10 +22,6 @@ def test_engines_install_llamacpp_should_be_successful(self): response = requests.post("http://localhost:3928/v1/engines/llama-cpp/install") assert response.status_code == 200 - def test_engines_install_python_should_be_successful(self): - response = requests.post("http://localhost:3928/v1/engines/python-engine/install") - assert response.status_code == 200 - def test_engines_install_llamacpp_specific_version_and_variant(self): data = {"version": latest_pre_release_tag, "variant": "linux-amd64-avx"} response = requests.post( diff --git a/engine/e2e-test/cli/engines/test_cli_engine_install.py 
b/engine/e2e-test/cli/engines/test_cli_engine_install.py index ca298c828..370ebe3f3 100644 --- a/engine/e2e-test/cli/engines/test_cli_engine_install.py +++ b/engine/e2e-test/cli/engines/test_cli_engine_install.py @@ -31,17 +31,6 @@ def test_engines_install_llamacpp_should_be_successfully(self): assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" - def test_engines_install_python_should_be_successfully(self): - exit_code, output, error = run( - "Install Engine", - ["engines", "install", "python-engine"], - timeout=None, - capture=False, - ) - response = requests.get("http://127.0.0.1:3928/v1/engines/python-engine") - assert len(response.json()) > 0 - assert exit_code == 0, f"Install engine failed with error: {error}" - @pytest.mark.skipif(reason="Ignore onnx-runtime test") def test_engines_install_onnx_on_macos_should_be_failed(self): exit_code, output, error = run( diff --git a/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py b/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py index 68f09aaf3..42835c4a0 100644 --- a/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py +++ b/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py @@ -31,17 +31,6 @@ def test_engines_install_llamacpp_should_be_successfully(self): assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" - def test_engines_install_python_should_be_successfully(self): - exit_code, output, error = run( - "Install Engine", - ["engines", "install", "python-engine"], - timeout=None, - capture=False, - ) - response = requests.get("http://127.0.0.1:3928/v1/engines/python-engine") - assert len(response.json()) > 0 - assert exit_code == 0, f"Install engine failed with error: {error}" - @pytest.mark.skipif(reason="Ignore onnx-runtime test") def test_engines_install_onnx_on_macos_should_be_failed(self): exit_code, output, error = run( diff --git a/engine/extensions/python-engines/python_utils.cc b/engine/extensions/python-engines/python_utils.cc index 005c36b7c..965b4c324 100644 --- a/engine/extensions/python-engines/python_utils.cc +++ b/engine/extensions/python-engines/python_utils.cc @@ -39,7 +39,7 @@ cpp::result UvInstall() { std::filesystem::create_directories(py_bin_path); // NOTE: do we need a mechanism to update uv, or just pin uv version with cortex release? 
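Pinning uv per cortex release, as the NOTE above suggests, keeps installs reproducible; the trade-off is that picking up a newer uv requires shipping a cortex release. A sketch of how a pinned version could be assembled into a GitHub release download URL; the target-triple asset names follow uv's published releases but are assumptions here, not the exact logic in python_utils.cc, and should be verified against https://github.com/astral-sh/uv/releases:

#include <iostream>
#include <sstream>
#include <string>

// Build a download URL for a pinned uv release from OS and architecture.
// The triples below mirror uv's release asset naming, taken as an assumption.
static std::string UvDownloadUrl(const std::string& version,
                                 const std::string& os,
                                 const std::string& arch) {
  std::ostringstream triple;
  if (os == "linux") {
    triple << (arch == "arm64" ? "aarch64" : "x86_64") << "-unknown-linux-gnu";
  } else if (os == "mac") {
    triple << (arch == "arm64" ? "aarch64" : "x86_64") << "-apple-darwin";
  } else {
    triple << "x86_64-pc-windows-msvc";
  }

  std::ostringstream url;
  url << "https://github.com/astral-sh/uv/releases/download/" << version
      << "/uv-" << triple.str() << (os == "windows" ? ".zip" : ".tar.gz");
  return url.str();
}

int main() {
  // Keeping the version in one constant is what makes the pin easy to bump.
  const std::string uv_version = "0.6.11";
  std::cout << UvDownloadUrl(uv_version, "linux", "amd64") << '\n';
  return 0;
}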
- const std::string uv_version = "0.6.3"; + const std::string uv_version = "0.6.11"; // build download url based on system info std::stringstream fname_stream; diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 5bdab068a..b05e651c5 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -81,6 +81,7 @@ std::vector VllmEngine::GetVariants() { std::vector variants; for (const auto& entry : fs::directory_iterator(vllm_path)) { const auto name = "linux-amd64-cuda"; // arbitrary + // TODO: after llama-server is merged, check if we need to add "v" const auto version_str = "v" + entry.path().filename().string(); const EngineVariantResponse variant{name, version_str, kVllmEngine}; variants.push_back(variant); From 2bde26a62235c5286c82923b87807c56e66361ee Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 2 Apr 2025 21:30:49 +0800 Subject: [PATCH 73/73] remove unused function --- engine/services/inference_service.cc | 8 -------- 1 file changed, 8 deletions(-) diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index f1d38e76a..86d452c75 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -4,14 +4,6 @@ #include "utils/function_calling/common.h" #include "utils/jinja_utils.h" -static InferResult GetUnsupportedResponse(const std::string& msg) { - Json::Value res, stt; - res["message"] = msg; - stt["status_code"] = drogon::k400BadRequest; - LOG_WARN << msg; - return std::make_pair(stt, res); -} - cpp::result InferenceService::HandleChatCompletion( std::shared_ptr q, std::shared_ptr json_body) { std::string engine_type;
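For requests that omit "engine", patch 71 changed HandleEmbedding to look the engine up from the model id before falling back to the default. A condensed sketch of that resolution chain; the plain map stands in for GetEngineByModelId, and the literal "llama-cpp" is only a stand-in for the kLlamaRepo constant:

#include <iostream>
#include <string>
#include <unordered_map>

// Fallback chain when a request may omit the "engine" field:
// 1. explicit engine in the request, 2. engine recorded for the model id,
// 3. the default engine. The map stands in for the model database lookup.
static std::string ResolveEngine(
    const std::string& requested_engine, const std::string& model_id,
    const std::unordered_map<std::string, std::string>& model_to_engine) {
  if (!requested_engine.empty()) return requested_engine;
  if (auto it = model_to_engine.find(model_id); it != model_to_engine.end())
    return it->second;
  return "llama-cpp";  // stand-in for the default engine constant
}

int main() {
  const std::unordered_map<std::string, std::string> model_to_engine = {
      {"my-vllm-model", "vllm"},  // placeholder entry
  };
  std::cout << ResolveEngine("", "my-vllm-model", model_to_engine) << '\n';  // vllm
  std::cout << ResolveEngine("", "unknown-model", model_to_engine) << '\n';  // llama-cpp
  std::cout << ResolveEngine("python", "my-vllm-model", model_to_engine) << '\n';  // python
  return 0;
}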