From 60b13bb4297c9d16cdbf98269577c1d097c13598 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 14 Feb 2025 09:14:51 +0800 Subject: [PATCH 01/73] wip: download uv --- engine/services/engine_service.cc | 77 +++++++++++++++++++++++++++++++ engine/services/engine_service.h | 7 +++ engine/utils/engine_constants.h | 2 +- engine/utils/process/utils.cc | 12 ++--- 4 files changed, 91 insertions(+), 7 deletions(-) diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 93cd8605c..da603bbd2 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -223,10 +223,87 @@ cpp::result EngineService::UninstallEngineVariant( } } +cpp::result EngineService::DownloadPythonUv(const std::string& version) { + const std::string engine_name = kPythonEngine; + const std::string python_bin_path = file_manager_utils::GetEnginesContainerPath() / + engine_name / "bin"; + std::filesystem::create_directories(python_bin_path); + + const std::string uv_version = "0.5.30"; + + // NOTE: only works on MacOS and Linux + auto on_finished = [this, engine_name, python_bin_path, uv_version](const DownloadTask& finishedTask) { + // try to unzip the downloaded file + const std::string installer_path = finishedTask.items[0].localPath.string(); + CTL_INF("UV install script path: " << installer_path); + CTL_INF("Version: " << uv_version); + + // https://docs.astral.sh/uv/configuration/installer/ + // TODO: move env var mod logic to SpawnProcess() + // using env to set env vars + // should we download from here instead? https://github.com/astral-sh/uv/releases + std::vector command{"env", + "UV_UNMANAGED_INSTALL=" + python_bin_path, + "sh", + installer_path, + "-q"}; + const auto pid = cortex::process::SpawnProcess(command); + if (pid == -1) { + CTL_ERR("Failed to install uv"); + } + // wait for subprocess to finish + // TODO: need to check return status if successful + waitpid(pid, NULL, 0); + + std::filesystem::remove(installer_path); + + auto create_res = EngineService::UpsertEngine( + engine_name, + kLocal, "", "", uv_version, "", "Default", ""); + + if (create_res.has_value()) { + CTL_ERR("Failed to create engine entry: " << create_res->engine_name); + } else { + CTL_INF("Engine entry created successfully"); + } + + }; + + const std::string url = "https://astral.sh/uv/" + uv_version + "/install.sh"; + auto downloadTask = + DownloadTask{.id = "uv", + .type = DownloadType::Engine, + .items = {DownloadItem{ + .id = "uv", + .downloadUrl = url, + .localPath = python_bin_path + "/install.sh", + }}}; + + auto add_task_result = download_service_->AddTask(downloadTask, on_finished); + if (add_task_result.has_error()) { + return cpp::fail(add_task_result.error()); + } + return {}; +} + cpp::result EngineService::DownloadEngine( const std::string& engine, const std::string& version, const std::optional variant_name) { + if (engine == kLlamaRepo) { + return DownloadLlamaCpp(version, variant_name); + } else if (engine == kPythonEngine) { + return DownloadPythonUv(version); + } + // raise error here? + return {}; +} + +cpp::result EngineService::DownloadLlamaCpp( + const std::string& version, + const std::optional variant_name) { + + const std::string engine = kLlamaRepo; auto normalized_version = version == "latest" ? 
"latest" : string_utils::RemoveSubstring(version, "v"); diff --git a/engine/services/engine_service.h b/engine/services/engine_service.h index f98037bab..6cce1761b 100644 --- a/engine/services/engine_service.h +++ b/engine/services/engine_service.h @@ -164,6 +164,13 @@ class EngineService : public EngineServiceI { const std::string& engine, const std::string& version = "latest", const std::optional variant_name = std::nullopt); + cpp::result DownloadLlamaCpp( + const std::string& version = "latest", + const std::optional variant_name = std::nullopt); + + cpp::result DownloadPythonUv( + const std::string& version = "latest"); + cpp::result DownloadCuda(const std::string& engine, bool async = false); diff --git a/engine/utils/engine_constants.h b/engine/utils/engine_constants.h index 35368c519..3cad230bc 100644 --- a/engine/utils/engine_constants.h +++ b/engine/utils/engine_constants.h @@ -1,7 +1,7 @@ #pragma once constexpr const auto kLlamaEngine = "llama-cpp"; -constexpr const auto kPythonEngine = "python-engine"; +constexpr const auto kPythonEngine = "python"; constexpr const auto kOpenAiEngine = "openai"; constexpr const auto kAnthropicEngine = "anthropic"; diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index fef425803..1b80f856d 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -80,12 +80,12 @@ pid_t SpawnProcess(const std::vector& command) { auto argv = ConvertToArgv(command); // Use posix_spawn for cross-platform compatibility - auto spawn_result = posix_spawn(&pid, // pid output - command[0].c_str(), // executable path - NULL, // file actions - NULL, // spawn attributes - argv.data(), // argument vector - environ // environment (inherit) + auto spawn_result = posix_spawnp(&pid, // pid output + command[0].c_str(), // executable path + NULL, // file actions + NULL, // spawn attributes + argv.data(), // argument vector + environ // environment (inherit) ); if (spawn_result != 0) { From f9817c8833303b33eb2a4f406405e27718ee623a Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 14 Feb 2025 14:44:32 +0800 Subject: [PATCH 02/73] fix: has_value -> has_error --- engine/services/engine_service.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 42d61aab2..8d85f1079 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -261,7 +261,7 @@ cpp::result EngineService::DownloadPythonUv(const std::string engine_name, kLocal, "", "", uv_version, "", "Default", ""); - if (create_res.has_value()) { + if (create_res.has_error()) { CTL_ERR("Failed to create engine entry: " << create_res->engine_name); } else { CTL_INF("Engine entry created successfully"); From 2dbc29625712c236a4a6f2ef0f67291fb5b40406 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 18 Feb 2025 17:05:00 +0800 Subject: [PATCH 03/73] move uv stuff to python_engine. 
use uv to start process --- .../extensions/python-engine/python_engine.cc | 202 ++++++++++++------ .../extensions/python-engine/python_engine.h | 6 + engine/services/engine_service.cc | 73 +------ 3 files changed, 147 insertions(+), 134 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index d34f75c08..a1e8cec48 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -64,6 +64,63 @@ static size_t WriteCallback(char* ptr, size_t size, size_t nmemb, } // namespace +cpp::result DownloadUv(std::shared_ptr download_service) { + const std::string py_bin_path = file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; + std::filesystem::create_directories(py_bin_path); + + const std::string uv_version = "0.5.31"; + + // NOTE: only works on MacOS and Linux + auto on_finished = [py_bin_path, uv_version](const DownloadTask& finishedTask) { + // try to unzip the downloaded file + const std::string installer_path = finishedTask.items[0].localPath.string(); + CTL_INF("UV install script path: " << installer_path); + CTL_INF("Version: " << uv_version); + + // https://docs.astral.sh/uv/configuration/installer/ + // TODO: move env var mod logic to SpawnProcess() + // using env to set env vars + // should we download from here instead? https://github.com/astral-sh/uv/releases + std::vector command{"env", + "UV_UNMANAGED_INSTALL=" + py_bin_path, + "sh", + installer_path, + "-q"}; + const auto pid = cortex::process::SpawnProcess(command); + if (pid == -1) { + CTL_ERR("Failed to install uv"); + } + // wait for subprocess to finish + // TODO: need to check return status if successful + waitpid(pid, NULL, 0); + std::filesystem::remove(installer_path); + }; + + const std::string url = "https://astral.sh/uv/" + uv_version + "/install.sh"; + auto downloadTask = + DownloadTask{.id = "uv", + .type = DownloadType::Engine, + .items = {DownloadItem{ + .id = "uv", + .downloadUrl = url, + .localPath = py_bin_path + "/install.sh", + }}}; + + auto add_task_result = download_service->AddTask(downloadTask, on_finished); + if (add_task_result.has_error()) { + return cpp::fail(add_task_result.error()); + } + return {}; +} + +std::string GetUvPath() { + return file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; +} + +bool IsUvInstalled() { + return std::filesystem::exists(GetUvPath()); +} + PythonEngine::PythonEngine() : q_(4 /*n_parallel*/, "python_engine") {} PythonEngine::~PythonEngine() { @@ -237,74 +294,85 @@ void PythonEngine::LoadModel( return; } - if (!LoadModelConfig(model, model_path)) { - Json::Value error; - error["error"] = "Failed to load model configuration"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - callback(std::move(status), std::move(error)); - return; - } - auto model_config = models_[model]; - auto model_folder_path = model_config.files[0]; - auto data_folder_path = - std::filesystem::path(model_folder_path) / std::filesystem::path("venv"); + // loads yaml into models_ + // if (!LoadModelConfig(model, model_path)) { + // Json::Value error; + // error["error"] = "Failed to load model configuration"; + // Json::Value status; + // status["is_done"] = true; + // status["has_error"] = true; + // status["is_stream"] = false; + // status["status_code"] = k500InternalServerError; + // 
callback(std::move(status), std::move(error)); + // return; + // } + // auto model_config = models_[model]; + // auto model_folder_path = model_config.files[0]; + // CTL_INF(__func__ << ": model_folder_path=" << model_folder_path); + + // auto data_folder_path = + // std::filesystem::path(model_folder_path) / std::filesystem::path("venv"); try { -#if defined(_WIN32) - auto executable = std::filesystem::path(data_folder_path) / - std::filesystem::path("Scripts"); -#else - auto executable = - std::filesystem::path(data_folder_path) / std::filesystem::path("bin"); -#endif - - auto executable_str = - (executable / std::filesystem::path(model_config.command[0])).string(); - auto command = model_config.command; - command[0] = executable_str; - command.push_back((std::filesystem::path(model_folder_path) / - std::filesystem::path(model_config.script)) - .string()); - std::list args{"--port", - model_config.port, - "--log_path", - (file_manager_utils::GetCortexLogPath() / - std::filesystem::path(model_config.log_path)) - .string(), - "--log_level", - model_config.log_level}; - if (!model_config.extra_params.isNull() && - model_config.extra_params.isObject()) { - for (const auto& key : model_config.extra_params.getMemberNames()) { - const Json::Value& value = model_config.extra_params[key]; - - // Convert key to string with -- prefix - std::string param_key = "--" + key; - - // Handle different JSON value types - if (value.isString()) { - args.emplace_back(param_key); - args.emplace_back(value.asString()); - } else if (value.isInt()) { - args.emplace_back(param_key); - args.emplace_back(std::to_string(value.asInt())); - } else if (value.isDouble()) { - args.emplace_back(param_key); - args.emplace_back(std::to_string(value.asDouble())); - } else if (value.isBool()) { - // For boolean, only add the flag if true - if (value.asBool()) { - args.emplace_back(param_key); - } - } - } - } - - // Add the parsed arguments to the command - command.insert(command.end(), args.begin(), args.end()); +// #if defined(_WIN32) +// auto executable = std::filesystem::path(data_folder_path) / +// std::filesystem::path("Scripts"); +// #else +// auto executable = +// std::filesystem::path(data_folder_path) / std::filesystem::path("bin"); +// #endif + +// auto executable_str = +// (executable / std::filesystem::path(model_config.command[0])).string(); +// auto command = model_config.command; +// command[0] = executable_str; +// command.push_back((std::filesystem::path(model_folder_path) / +// std::filesystem::path(model_config.script)) +// .string()); +// std::list args{"--port", +// model_config.port, +// "--log_path", +// (file_manager_utils::GetCortexLogPath() / +// std::filesystem::path(model_config.log_path)) +// .string(), +// "--log_level", +// model_config.log_level}; +// if (!model_config.extra_params.isNull() && +// model_config.extra_params.isObject()) { +// for (const auto& key : model_config.extra_params.getMemberNames()) { +// const Json::Value& value = model_config.extra_params[key]; + +// // Convert key to string with -- prefix +// std::string param_key = "--" + key; + +// // Handle different JSON value types +// if (value.isString()) { +// args.emplace_back(param_key); +// args.emplace_back(value.asString()); +// } else if (value.isInt()) { +// args.emplace_back(param_key); +// args.emplace_back(std::to_string(value.asInt())); +// } else if (value.isDouble()) { +// args.emplace_back(param_key); +// args.emplace_back(std::to_string(value.asDouble())); +// } else if (value.isBool()) { +// // For boolean, only 
add the flag if true +// if (value.asBool()) { +// args.emplace_back(param_key); +// } +// } +// } +// } + + // // Add the parsed arguments to the command + // command.insert(command.end(), args.begin(), args.end()); + + std::string uv_path = GetUvPath(); + std::string entrypoint_path = std::filesystem::path(model_path).parent_path() / "main.py"; + std::vector command{uv_path, "run", entrypoint_path}; + + // TODO: what happens if the process exits? + // what should be expected from the subprocess + // TODO: stdout/stderr of subprocess pid = cortex::process::SpawnProcess(command); process_map_[model] = pid; if (pid == -1) { diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 70a9b9829..76d82c961 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -17,6 +17,7 @@ #include "utils/process_status_utils.h" #include "utils/curl_utils.h" #include "utils/process/utils.h" +#include "services/download_service.h" // Helper for CURL response namespace python_engine { @@ -31,6 +32,11 @@ struct CurlResponse { std::string error_message; }; +// UV-related functions +cpp::result DownloadUv(std::shared_ptr download_service); +std::string GetUvPath(); +bool IsUvInstalled(); + class PythonEngine : public EngineI { private: // Model configuration diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 8d85f1079..56c52c14f 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -223,69 +223,6 @@ cpp::result EngineService::UninstallEngineVariant( } } -cpp::result EngineService::DownloadPythonUv(const std::string& version) { - const std::string engine_name = kPythonEngine; - const std::string python_bin_path = file_manager_utils::GetEnginesContainerPath() / - engine_name / "bin"; - std::filesystem::create_directories(python_bin_path); - - const std::string uv_version = "0.5.30"; - - // NOTE: only works on MacOS and Linux - auto on_finished = [this, engine_name, python_bin_path, uv_version](const DownloadTask& finishedTask) { - // try to unzip the downloaded file - const std::string installer_path = finishedTask.items[0].localPath.string(); - CTL_INF("UV install script path: " << installer_path); - CTL_INF("Version: " << uv_version); - - // https://docs.astral.sh/uv/configuration/installer/ - // TODO: move env var mod logic to SpawnProcess() - // using env to set env vars - // should we download from here instead? 
https://github.com/astral-sh/uv/releases - std::vector command{"env", - "UV_UNMANAGED_INSTALL=" + python_bin_path, - "sh", - installer_path, - "-q"}; - const auto pid = cortex::process::SpawnProcess(command); - if (pid == -1) { - CTL_ERR("Failed to install uv"); - } - // wait for subprocess to finish - // TODO: need to check return status if successful - waitpid(pid, NULL, 0); - - std::filesystem::remove(installer_path); - - auto create_res = EngineService::UpsertEngine( - engine_name, - kLocal, "", "", uv_version, "", "Default", ""); - - if (create_res.has_error()) { - CTL_ERR("Failed to create engine entry: " << create_res->engine_name); - } else { - CTL_INF("Engine entry created successfully"); - } - - }; - - const std::string url = "https://astral.sh/uv/" + uv_version + "/install.sh"; - auto downloadTask = - DownloadTask{.id = "uv", - .type = DownloadType::Engine, - .items = {DownloadItem{ - .id = "uv", - .downloadUrl = url, - .localPath = python_bin_path + "/install.sh", - }}}; - - auto add_task_result = download_service_->AddTask(downloadTask, on_finished); - if (add_task_result.has_error()) { - return cpp::fail(add_task_result.error()); - } - return {}; -} - cpp::result EngineService::DownloadEngine( const std::string& engine, const std::string& version, const std::optional variant_name) { @@ -293,10 +230,10 @@ cpp::result EngineService::DownloadEngine( if (engine == kLlamaRepo) { return DownloadLlamaCpp(version, variant_name); } else if (engine == kPythonEngine) { - return DownloadPythonUv(version); + // ignore version and variant_name + return python_engine::DownloadUv(download_service_); } - // raise error here? - return {}; + return cpp::fail("Unknown engine " + engine); } cpp::result EngineService::DownloadLlamaCpp( @@ -988,9 +925,11 @@ cpp::result EngineService::IsEngineReady( return true; } - // End hard code // Check for python engine if (engine == kPythonEngine) { + if (!python_engine::IsUvInstalled()) { + return cpp::fail("Python engine is not ready. 
Please run `cortex engines install python`"); + } return true; } From eec24bd101f89b2b9593f0182fc64117588242d2 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 18 Feb 2025 18:31:53 +0800 Subject: [PATCH 04/73] redirect stdout/stderr --- .../extensions/python-engine/python_engine.cc | 62 ++++--------------- engine/utils/process/utils.cc | 39 +++++++++++- engine/utils/process/utils.h | 6 +- 3 files changed, 52 insertions(+), 55 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index a1e8cec48..3e01ab26e 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -321,59 +321,19 @@ void PythonEngine::LoadModel( // std::filesystem::path(data_folder_path) / std::filesystem::path("bin"); // #endif -// auto executable_str = -// (executable / std::filesystem::path(model_config.command[0])).string(); -// auto command = model_config.command; -// command[0] = executable_str; -// command.push_back((std::filesystem::path(model_folder_path) / -// std::filesystem::path(model_config.script)) -// .string()); -// std::list args{"--port", -// model_config.port, -// "--log_path", -// (file_manager_utils::GetCortexLogPath() / -// std::filesystem::path(model_config.log_path)) -// .string(), -// "--log_level", -// model_config.log_level}; -// if (!model_config.extra_params.isNull() && -// model_config.extra_params.isObject()) { -// for (const auto& key : model_config.extra_params.getMemberNames()) { -// const Json::Value& value = model_config.extra_params[key]; - -// // Convert key to string with -- prefix -// std::string param_key = "--" + key; - -// // Handle different JSON value types -// if (value.isString()) { -// args.emplace_back(param_key); -// args.emplace_back(value.asString()); -// } else if (value.isInt()) { -// args.emplace_back(param_key); -// args.emplace_back(std::to_string(value.asInt())); -// } else if (value.isDouble()) { -// args.emplace_back(param_key); -// args.emplace_back(std::to_string(value.asDouble())); -// } else if (value.isBool()) { -// // For boolean, only add the flag if true -// if (value.asBool()) { -// args.emplace_back(param_key); -// } -// } -// } -// } - - // // Add the parsed arguments to the command - // command.insert(command.end(), args.begin(), args.end()); - - std::string uv_path = GetUvPath(); - std::string entrypoint_path = std::filesystem::path(model_path).parent_path() / "main.py"; - std::vector command{uv_path, "run", entrypoint_path}; + const std::filesystem::path model_dir = std::filesystem::path(model_path).parent_path(); + std::vector command{GetUvPath(), "run", model_dir / "main.py"}; // TODO: what happens if the process exits? 
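One way to address the TODO above (an aside, not part of this patch — just a sketch): the spawned uv/Python server can be polled for liveness with a non-blocking waitpid(), or with the repo's existing process_status_utils::IsProcessRunning() helper that python_engine.h already includes. Names below (`pid`, `model`) follow the surrounding hunk.

    // Sketch: non-blocking check whether the spawned server has already exited.
    int child_status = 0;
    pid_t ret = waitpid(pid, &child_status, WNOHANG);
    if (ret == pid) {
      // Child is gone; WIFEXITED/WEXITSTATUS expose how it terminated.
      if (WIFEXITED(child_status)) {
        CTL_ERR("Python server for " << model << " exited with code "
                                     << WEXITSTATUS(child_status));
      }
    } else if (ret == 0) {
      // Still running.
    }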
- // what should be expected from the subprocess - // TODO: stdout/stderr of subprocess - pid = cortex::process::SpawnProcess(command); + const std::string stdout_path = model_dir / "stdout.txt"; + const std::string stderr_path = model_dir / "stderr.txt"; + + // create empty stdout.txt and stderr.txt for redirection + if (!std::filesystem::exists(stdout_path)) std::ofstream(stdout_path).flush(); + if (!std::filesystem::exists(stderr_path)) std::ofstream(stderr_path).flush(); + + pid = cortex::process::SpawnProcess(command, stdout_path, stderr_path); + process_map_[model] = pid; if (pid == -1) { std::unique_lock lock(models_mutex_); diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 1b80f856d..94433367b 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -1,8 +1,10 @@ #include "utils/process/utils.h" #include "utils/logging_utils.h" +#include #if defined(__APPLE__) || defined(__linux__) extern char **environ; // environment variables +#include #endif namespace cortex::process { @@ -34,7 +36,9 @@ std::vector ConvertToArgv(const std::vector& args) { return argv; } -pid_t SpawnProcess(const std::vector& command) { +pid_t SpawnProcess(const std::vector& command, + const std::optional stdout_file, + const std::optional stderr_file) { try { #if defined(_WIN32) // Windows process creation @@ -79,15 +83,46 @@ pid_t SpawnProcess(const std::vector& command) { // Convert command vector to char*[] auto argv = ConvertToArgv(command); + // redirect stdout and stderr + // caller should make sure the redirect files exist. + posix_spawn_file_actions_t *action_ptr = NULL; + + if (stdout_file.has_value() || stderr_file.has_value()) { + posix_spawn_file_actions_t action; + posix_spawn_file_actions_init(&action); + action_ptr = &action; + + if (stdout_file.has_value()) { + std::string stdout_file_val = stdout_file.value(); + if (std::filesystem::exists(stdout_file_val)) { + posix_spawn_file_actions_addopen(&action, STDOUT_FILENO, + stdout_file_val.data(), + O_WRONLY | O_APPEND, 0); + } + } + + if (stderr_file.has_value()) { + std::string stderr_file_val = stderr_file.value(); + if (std::filesystem::exists(stderr_file_val)) { + posix_spawn_file_actions_addopen(&action, STDERR_FILENO, + stderr_file_val.data(), + O_WRONLY | O_APPEND, 0); + } + } + } + // Use posix_spawn for cross-platform compatibility auto spawn_result = posix_spawnp(&pid, // pid output command[0].c_str(), // executable path - NULL, // file actions + action_ptr, // file actions NULL, // spawn attributes argv.data(), // argument vector environ // environment (inherit) ); + // NOTE: only destroy this when process ends? 
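On the NOTE above, a sketch of the usual lifetime (an aside, not part of this patch): POSIX only reads the file-actions object while posix_spawnp() sets up the child, so it can be destroyed as soon as the call returns; it just needs to be declared at function scope so that `action_ptr` still points at live storage when the spawn happens. The names below reuse the hunk above.

    posix_spawn_file_actions_t action;             // function scope, valid at spawn time
    posix_spawn_file_actions_t* action_ptr = NULL;
    if (stdout_file.has_value() || stderr_file.has_value()) {
      posix_spawn_file_actions_init(&action);
      action_ptr = &action;
      // ... posix_spawn_file_actions_addopen() calls for STDOUT_FILENO / STDERR_FILENO ...
    }

    auto spawn_result = posix_spawnp(&pid, command[0].c_str(), action_ptr, NULL,
                                     argv.data(), environ);
    if (action_ptr != NULL) {
      posix_spawn_file_actions_destroy(action_ptr);  // safe once posix_spawnp() has returned
    }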
+ // posix_spawn_file_actions_destroy(action_pr); + if (spawn_result != 0) { throw std::runtime_error("Failed to spawn process"); } diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index 9332607e9..54f34e919 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -20,6 +20,8 @@ std::string ConstructWindowsCommandLine(const std::vector& args); std::vector ConvertToArgv(const std::vector& args); -pid_t SpawnProcess(const std::vector& command); +pid_t SpawnProcess(const std::vector& command, + const std::optional stdout_file = {}, + const std::optional stderr_file = {}); -} \ No newline at end of file +} From 26fdbd399ec7fe43d6f23b4877916d5774c4ff99 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 18 Feb 2025 20:05:26 +0800 Subject: [PATCH 05/73] simplify code --- .../extensions/python-engine/python_engine.cc | 41 +++++-------------- engine/services/model_service.cc | 37 ++--------------- 2 files changed, 14 insertions(+), 64 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 3e01ab26e..8c6f6a7b7 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -266,11 +266,10 @@ void PythonEngine::GetModels( void PythonEngine::LoadModel( std::shared_ptr json_body, std::function&& callback) { - // TODO: handle a case that can spawn process but the process spawn fail. - pid_t pid; - if (!json_body->isMember("model") || !json_body->isMember("model_path")) { + + if (!json_body->isMember("model") || !json_body->isMember("model_dir")) { Json::Value error; - error["error"] = "Missing required fields: model or model_path"; + error["error"] = "Missing required fields: model or model_dir"; Json::Value status; status["is_done"] = true; status["has_error"] = true; @@ -280,8 +279,11 @@ void PythonEngine::LoadModel( return; } + namespace fs = std::filesystem; + const std::string& model = (*json_body)["model"].asString(); - const std::string& model_path = (*json_body)["model_path"].asString(); + const fs::path model_dir = (*json_body)["model_dir"].asString(); + if (models_.find(model) != models_.end()) { Json::Value error; error["error"] = "Model already loaded!"; @@ -294,6 +296,9 @@ void PythonEngine::LoadModel( return; } + // TODO: handle a case that can spawn process but the process spawn fail. + pid_t pid; + // loads yaml into models_ // if (!LoadModelConfig(model, model_path)) { // Json::Value error; @@ -310,18 +315,7 @@ void PythonEngine::LoadModel( // auto model_folder_path = model_config.files[0]; // CTL_INF(__func__ << ": model_folder_path=" << model_folder_path); - // auto data_folder_path = - // std::filesystem::path(model_folder_path) / std::filesystem::path("venv"); try { -// #if defined(_WIN32) -// auto executable = std::filesystem::path(data_folder_path) / -// std::filesystem::path("Scripts"); -// #else -// auto executable = -// std::filesystem::path(data_folder_path) / std::filesystem::path("bin"); -// #endif - - const std::filesystem::path model_dir = std::filesystem::path(model_path).parent_path(); std::vector command{GetUvPath(), "run", model_dir / "main.py"}; // TODO: what happens if the process exits? 
@@ -336,20 +330,7 @@ void PythonEngine::LoadModel( process_map_[model] = pid; if (pid == -1) { - std::unique_lock lock(models_mutex_); - if (models_.find(model) != models_.end()) { - models_.erase(model); - } - - Json::Value error; - error["error"] = "Fail to spawn process with pid -1"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - callback(std::move(status), std::move(error)); - return; + throw std::runtime_error("Fail to spawn process with pid -1"); } } catch (const std::exception& e) { std::unique_lock lock(models_mutex_); diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 6dc1642fb..15d5a8dc6 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -805,38 +805,13 @@ cpp::result ModelService::StartModel( // Check if Python model first if (mc.engine == kPythonEngine) { - - config::PythonModelConfig python_model_config; - python_model_config.ReadFromYaml( - - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string()); - // Start all depends model - auto depends = python_model_config.depends; - for (auto& depend : depends) { - Json::Value temp; - auto res = StartModel(depend, temp, false); - if (res.has_error()) { - CTL_WRN("Error: " + res.error()); - for (auto& depend : depends) { - if (depend != model_handle) { - StopModel(depend); - } - } - return cpp::fail("Model failed to start dependency '" + depend + - "' : " + res.error()); - } - } + const std::string model_yaml_path = model_entry.value().path_to_model_yaml; json_data["model"] = model_handle; - json_data["model_path"] = - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string(); + json_data["model_dir"] = fmu::ToAbsoluteCortexDataPath( + fs::path(model_yaml_path).parent_path()).string(); json_data["engine"] = mc.engine; assert(!!inference_svc_); - // Check if python engine auto ir = inference_svc_->LoadModel(std::make_shared(json_data)); @@ -848,12 +823,6 @@ cpp::result ModelService::StartModel( } else if (status == drogon::k409Conflict) { CTL_INF("Model '" + model_handle + "' is already loaded"); return StartModelResult{.success = true, .warning = ""}; - } else { - // only report to user the error - for (auto& depend : depends) { - - StopModel(depend); - } } CTL_ERR("Model failed to start with status code: " << status); return cpp::fail("Model failed to start: " + From 3ba79942dfdf9596b5d345b9288e2ef14965b33d Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 09:08:39 +0800 Subject: [PATCH 06/73] rename python engine interface --- engine/cortex-common/{cortexpythoni.h => python_enginei.h} | 7 +++---- engine/services/engine_service.h | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) rename engine/cortex-common/{cortexpythoni.h => python_enginei.h} (87%) diff --git a/engine/cortex-common/cortexpythoni.h b/engine/cortex-common/python_enginei.h similarity index 87% rename from engine/cortex-common/cortexpythoni.h rename to engine/cortex-common/python_enginei.h index 06a79838f..54e79bf2a 100644 --- a/engine/cortex-common/cortexpythoni.h +++ b/engine/cortex-common/python_enginei.h @@ -5,9 +5,9 @@ #include "json/value.h" -class CortexPythonEngineI { +class PythonEngineI { public: - virtual ~CortexPythonEngineI() {} + virtual ~PythonEngineI() {} virtual bool IsSupported(const std::string& f) = 0; @@ -17,6 +17,5 @@ class CortexPythonEngineI { virtual void 
HandlePythonFileExecutionRequest( std::shared_ptr json_body, - std::function&& callback) = 0; + std::function&& callback) = 0; }; - diff --git a/engine/services/engine_service.h b/engine/services/engine_service.h index 6cce1761b..a8d5415a0 100644 --- a/engine/services/engine_service.h +++ b/engine/services/engine_service.h @@ -9,7 +9,7 @@ #include "common/engine_servicei.h" #include "cortex-common/EngineI.h" -#include "cortex-common/cortexpythoni.h" +#include "cortex-common/python_enginei.h" #include "cortex-common/remote_enginei.h" #include "database/engines.h" #include "services/database_service.h" @@ -37,7 +37,7 @@ struct EngineUpdateResult { } }; -using EngineV = std::variant; +using EngineV = std::variant; class EngineService : public EngineServiceI { private: From 5e7125f09afb7d0cad4aadff5346a1300c51d396 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 09:31:24 +0800 Subject: [PATCH 07/73] use PythonEngineI --- engine/cortex-common/python_enginei.h | 20 +- .../extensions/python-engine/python_engine.cc | 717 +----------------- .../extensions/python-engine/python_engine.h | 69 +- engine/services/inference_service.cc | 74 +- engine/services/inference_service.h | 4 + 5 files changed, 69 insertions(+), 815 deletions(-) diff --git a/engine/cortex-common/python_enginei.h b/engine/cortex-common/python_enginei.h index 54e79bf2a..31dc76c80 100644 --- a/engine/cortex-common/python_enginei.h +++ b/engine/cortex-common/python_enginei.h @@ -9,13 +9,23 @@ class PythonEngineI { public: virtual ~PythonEngineI() {} - virtual bool IsSupported(const std::string& f) = 0; + // virtual bool IsSupported(const std::string& f) = 0; - virtual void ExecutePythonFile(std::string binary_execute_path, - std::string file_execution_path, - std::string python_library_path) = 0; + // virtual void ExecutePythonFile(std::string binary_execute_path, + // std::string file_execution_path, + // std::string python_library_path) = 0; - virtual void HandlePythonFileExecutionRequest( + // virtual void HandlePythonFileExecutionRequest( + // std::shared_ptr json_body, + // std::function&& callback) = 0; + + virtual void LoadModel( + std::shared_ptr json_body, + std::function&& callback) = 0; + + virtual void HandleRequest( + const std::string& model, + const std::vector& path_parts, std::shared_ptr json_body, std::function&& callback) = 0; }; diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 8c6f6a7b7..7ab970127 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -64,7 +64,7 @@ static size_t WriteCallback(char* ptr, size_t size, size_t nmemb, } // namespace -cpp::result DownloadUv(std::shared_ptr download_service) { +cpp::result DownloadUv(std::shared_ptr& download_service) { const std::string py_bin_path = file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; std::filesystem::create_directories(py_bin_path); @@ -127,142 +127,6 @@ PythonEngine::~PythonEngine() { curl_global_cleanup(); } -config::PythonModelConfig* PythonEngine::GetModelConfig( - const std::string& model) { - std::shared_lock lock(models_mutex_); - auto it = models_.find(model); - if (it != models_.end()) { - return &it->second; - } - return nullptr; -} - -bool PythonEngine::TerminateModelProcess(const std::string& model) { - auto it = process_map_.find(model); - if (it == process_map_.end()) { - LOG_ERROR << "No process found for model: " << model - << ", removing from list running models."; - 
models_.erase(model); - return false; - } - -#if defined(_WIN32) - HANDLE hProcess = OpenProcess(PROCESS_TERMINATE, FALSE, it->second); - if (hProcess == NULL) { - LOG_ERROR << "Failed to open process"; - return false; - } - - bool terminated = TerminateProcess(hProcess, 0) == TRUE; - CloseHandle(hProcess); - - if (terminated) { - process_map_.erase(it); - return true; - } - -#elif defined(__APPLE__) || defined(__linux__) - int result = kill(it->second, SIGTERM); - if (result == 0) { - process_map_.erase(it); - return true; - } -#endif - - return false; -} - -CurlResponse PythonEngine::MakeGetRequest(const std::string& model, - const std::string& path) { - auto const& config = models_[model]; - std::string full_url = "http://localhost:" + config.port + path; - CurlResponse response; - - auto result = curl_utils::SimpleRequest(full_url, RequestType::GET); - if (result.has_error()) { - response.error = true; - response.error_message = result.error(); - } else { - response.body = result.value(); - } - return response; -} - -CurlResponse PythonEngine::MakeDeleteRequest(const std::string& model, - const std::string& path) { - auto const& config = models_[model]; - std::string full_url = "http://localhost:" + config.port + path; - CurlResponse response; - - auto result = curl_utils::SimpleRequest(full_url, RequestType::DEL); - - if (result.has_error()) { - response.error = true; - response.error_message = result.error(); - } else { - response.body = result.value(); - } - - return response; -} - -CurlResponse PythonEngine::MakePostRequest(const std::string& model, - const std::string& path, - const std::string& body) { - auto const& config = models_[model]; - std::string full_url = "http://localhost:" + config.port + path; - - CurlResponse response; - auto result = curl_utils::SimpleRequest(full_url, RequestType::POST, body); - - if (result.has_error()) { - response.error = true; - response.error_message = result.error(); - } else { - response.body = result.value(); - } - return response; -} - -bool PythonEngine::LoadModelConfig(const std::string& model, - const std::string& yaml_path) { - try { - config::PythonModelConfig config; - config.ReadFromYaml(yaml_path); - std::unique_lock lock(models_mutex_); - models_[model] = config; - } catch (const std::exception& e) { - LOG_ERROR << "Failed to load model config: " << e.what(); - return false; - } - - return true; -} - -void PythonEngine::GetModels( - std::shared_ptr json_body, - std::function&& callback) { - - Json::Value response_json; - Json::Value model_array(Json::arrayValue); - - for (const auto& pair : models_) { - auto val = pair.second.ToJson(); - model_array.append(val); - } - - response_json["object"] = "list"; - response_json["data"] = model_array; - - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(response_json)); -} - void PythonEngine::LoadModel( std::shared_ptr json_body, std::function&& callback) { @@ -296,25 +160,8 @@ void PythonEngine::LoadModel( return; } - // TODO: handle a case that can spawn process but the process spawn fail. 
pid_t pid; - // loads yaml into models_ - // if (!LoadModelConfig(model, model_path)) { - // Json::Value error; - // error["error"] = "Failed to load model configuration"; - // Json::Value status; - // status["is_done"] = true; - // status["has_error"] = true; - // status["is_stream"] = false; - // status["status_code"] = k500InternalServerError; - // callback(std::move(status), std::move(error)); - // return; - // } - // auto model_config = models_[model]; - // auto model_folder_path = model_config.files[0]; - // CTL_INF(__func__ << ": model_folder_path=" << model_folder_path); - try { std::vector command{GetUvPath(), "run", model_dir / "main.py"}; @@ -360,568 +207,24 @@ void PythonEngine::LoadModel( callback(std::move(status), std::move(response)); } -void PythonEngine::UnloadModel( - std::shared_ptr json_body, - std::function&& callback) { - if (!json_body->isMember("model")) { - Json::Value error; - error["error"] = "Missing required field: model"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - auto model = (*json_body)["model"].asString(); +void PythonEngine::HandleRequest( + const std::string& model, + const std::vector& path_parts, + std::shared_ptr json_body, + std::function&& callback) { - { - if (TerminateModelProcess(model)) { - std::unique_lock lock(models_mutex_); - models_.erase(model); - } else { - Json::Value error; - error["error"] = "Fail to terminate process with id: " + - std::to_string(process_map_[model]); - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - } - - Json::Value response; - response["status"] = "Model unloaded successfully"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - callback(std::move(status), std::move(response)); -} - -void PythonEngine::HandleChatCompletion( - std::shared_ptr json_body, - std::function&& callback) { - LOG_WARN << "Does not support yet!"; -} - -CurlResponse PythonEngine::MakeStreamPostRequest( - const std::string& model, const std::string& path, const std::string& body, - const std::function& callback) { - auto const& config = models_[model]; - CURL* curl = curl_easy_init(); - CurlResponse response; - - if (!curl) { - response.error = true; - response.error_message = "Failed to initialize CURL"; - return response; - } - - std::string full_url = "http://localhost:" + config.port + path; - - struct curl_slist* headers = nullptr; - headers = curl_slist_append(headers, "Content-Type: application/json"); - headers = curl_slist_append(headers, "Accept: text/event-stream"); - headers = curl_slist_append(headers, "Cache-Control: no-cache"); - headers = curl_slist_append(headers, "Connection: keep-alive"); - - StreamContext context{ - std::make_shared>( - callback), - ""}; - - curl_easy_setopt(curl, CURLOPT_URL, full_url.c_str()); - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - curl_easy_setopt(curl, CURLOPT_POST, 1L); - curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str()); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, StreamWriteCallback); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, &context); - curl_easy_setopt(curl, CURLOPT_TRANSFER_ENCODING, 1L); - - CURLcode res = curl_easy_perform(curl); 
- - if (res != CURLE_OK) { - response.error = true; - response.error_message = curl_easy_strerror(res); - - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = true; - status["status_code"] = 500; - - Json::Value error; - error["error"] = response.error_message; - callback(std::move(status), std::move(error)); - } - - curl_slist_free_all(headers); - curl_easy_cleanup(curl); - return response; -} - -void PythonEngine::HandleInference( - std::shared_ptr json_body, - std::function&& callback) { - if (json_body && !json_body->isMember("model")) { - Json::Value error; - error["error"] = "Missing required field: model is required!"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - std::string method = "post"; - std::string path = "/inference"; - auto transform_request = (*json_body).get("transform_request", "").asString(); - auto transform_response = - (*json_body).get("transform_response", "").asString(); - auto model = (*json_body)["model"].asString(); - auto& body = (*json_body)["body"]; - - if (models_.find(model) == models_.end()) { - Json::Value error; - error["error"] = "Model '" + model + "' is not loaded!"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - // Transform Request - std::string transformed_request; - if (!transform_request.empty()) { - - try { - // Validate JSON body - if (!body || body.isNull()) { - throw std::runtime_error("Invalid or null JSON body"); - } - - // Render with error handling - try { - transformed_request = renderer_.Render(transform_request, body); - - } catch (const std::exception& e) { - throw std::runtime_error("Template rendering error: " + - std::string(e.what())); - } - } catch (const std::exception& e) { - // Log error and potentially rethrow or handle accordingly - LOG_WARN << "Error in TransformRequest: " << e.what(); - LOG_WARN << "Using original request body"; - transformed_request = body.toStyledString(); - } - } else { - transformed_request = body.toStyledString(); - } - - // End Transform request - - CurlResponse response; - if (method == "post") { - if (body.isMember("stream") && body["stream"].asBool()) { - q_.runTaskInQueue( - [this, model, path, transformed_request, cb = std::move(callback)] { - MakeStreamPostRequest(model, path, transformed_request, cb); - }); - - return; - } else { - response = MakePostRequest(model, path, transformed_request); - } - - } else if (method == "get") { - response = MakeGetRequest(model, path); - } else if (method == "delete") { - response = MakeDeleteRequest(model, path); - } else { - Json::Value error; - error["error"] = - "method not supported! 
Supported methods are: post, get, delete"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - if (response.error) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - Json::Value error; - error["error"] = response.error_message; - callback(std::move(status), std::move(error)); - return; - } + // get port Json::Value response_json; - Json::Reader reader; - if (!reader.parse(response.body, response_json)) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - Json::Value error; - error["error"] = "Failed to parse response"; - callback(std::move(status), std::move(error)); - return; - } - - if (!transform_response.empty()) { - // Transform Response - std::string response_str; - try { - // Validate JSON body - if (!response_json || response_json.isNull()) { - throw std::runtime_error("Invalid or null JSON body"); - } - // Render with error handling - try { - response_str = renderer_.Render(transform_response, response_json); - } catch (const std::exception& e) { - throw std::runtime_error("Template rendering error: " + - std::string(e.what())); - } - } catch (const std::exception& e) { - // Log error and potentially rethrow or handle accordingly - LOG_WARN << "Error in TransformRequest: " << e.what(); - LOG_WARN << "Using original request body"; - response_str = response_json.toStyledString(); - } - - Json::Reader reader_final; - Json::Value response_json_final; - if (!reader_final.parse(response_str, response_json_final)) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - Json::Value error; - error["error"] = "Failed to parse response"; - callback(std::move(status), std::move(error)); - return; - } - - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(response_json_final)); - } else { - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(response_json)); - } -} - -Json::Value PythonEngine::GetRemoteModels() { - return Json::Value(); -} - -void PythonEngine::StopInferencing(const std::string& model_id) {} - -void PythonEngine::HandleRouteRequest( - std::shared_ptr json_body, - std::function&& callback) { - if (!json_body->isMember("model") || !json_body->isMember("method") || - !json_body->isMember("path")) { - Json::Value error; - error["error"] = - "Missing required field: model, method and path are required!"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - auto method = (*json_body)["method"].asString(); - auto path = (*json_body)["path"].asString(); - auto transform_request = (*json_body).get("transform_request", "").asString(); - auto transform_response = - (*json_body).get("transform_response", "").asString(); - auto model = (*json_body)["model"].asString(); - auto& 
body = (*json_body)["body"]; - - if (models_.find(model) == models_.end()) { - Json::Value error; - error["error"] = "Model '" + model + "' is not loaded!"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - // Transform Request - std::string transformed_request; - if (!transform_request.empty()) { - - try { - // Validate JSON body - if (!body || body.isNull()) { - throw std::runtime_error("Invalid or null JSON body"); - } - - // Render with error handling - try { - transformed_request = renderer_.Render(transform_request, *json_body); - } catch (const std::exception& e) { - throw std::runtime_error("Template rendering error: " + - std::string(e.what())); - } - } catch (const std::exception& e) { - // Log error and potentially rethrow or handle accordingly - LOG_WARN << "Error in TransformRequest: " << e.what(); - LOG_WARN << "Using original request body"; - transformed_request = body.toStyledString(); - } - } else { - transformed_request = body.toStyledString(); - } - - // End Transform request - - CurlResponse response; - if (method == "post") { - response = MakePostRequest(model, path, transformed_request); - } else if (method == "get") { - response = MakeGetRequest(model, path); - } else if (method == "delete") { - response = MakeDeleteRequest(model, path); - } else { - Json::Value error; - error["error"] = - "method not supported! Supported methods are: post, get, delete"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - if (response.error) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - Json::Value error; - error["error"] = response.error_message; - callback(std::move(status), std::move(error)); - return; - } - - Json::Value response_json; - Json::Reader reader; - if (!reader.parse(response.body, response_json)) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - Json::Value error; - error["error"] = "Failed to parse response"; - callback(std::move(status), std::move(error)); - return; - } - - if (!transform_response.empty()) { - // Transform Response - std::string response_str; - try { - // Validate JSON body - if (!response_json || response_json.isNull()) { - throw std::runtime_error("Invalid or null JSON body"); - } - // Render with error handling - try { - response_str = renderer_.Render(transform_response, response_json); - } catch (const std::exception& e) { - throw std::runtime_error("Template rendering error: " + - std::string(e.what())); - } - } catch (const std::exception& e) { - // Log error and potentially rethrow or handle accordingly - LOG_WARN << "Error in TransformRequest: " << e.what(); - LOG_WARN << "Using original request body"; - response_str = response_json.toStyledString(); - } - - Json::Reader reader_final; - Json::Value response_json_final; - if (!reader_final.parse(response_str, response_json_final)) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - Json::Value error; - error["error"] = 
"Failed to parse response"; - callback(std::move(status), std::move(error)); - return; - } - - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(response_json_final)); - } else { - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(response_json)); - } -} - -void PythonEngine::GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) { - if (!json_body->isMember("model")) { - Json::Value error; - error["error"] = "Missing required field: model"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(error)); - return; - } - - auto model = json_body->get("model", "").asString(); - auto model_config = models_[model]; - auto health_endpoint = model_config.heath_check; - auto pid = process_map_[model]; - auto is_process_live = process_status_utils::IsProcessRunning(pid); - auto response_health = MakeGetRequest(model, health_endpoint.path); - - if (response_health.error && is_process_live) { - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - Json::Value message; - message["message"] = "model '"+model+"' is loading"; - callback(std::move(status), std::move(message)); - return; - } - else if(response_health.error && !is_process_live){ - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - Json::Value message; - message["message"] = response_health.error_message; - callback(std::move(status), std::move(message)); - return; - } - - Json::Value response; - response["model"] = model; - response["model_loaded"] = true; - response["model_data"] = model_config.ToJson(); + response_json["object"] = "list"; Json::Value status; status["is_done"] = true; status["has_error"] = false; status["is_stream"] = false; status["status_code"] = k200OK; - callback(std::move(status), std::move(response)); -} -// Implement remaining virtual functions -void PythonEngine::HandleEmbedding( - std::shared_ptr, - std::function&& callback) { - callback(Json::Value(), Json::Value()); -} - -bool PythonEngine::IsSupported(const std::string& f) { - if (f == "HandleChatCompletion" || f == "LoadModel" || f == "UnloadModel" || - f == "GetModelStatus" || f == "GetModels" || f == "SetFileLogger" || - f == "SetLogLevel") { - return true; - } - return false; -} - -bool PythonEngine::SetFileLogger(int max_log_lines, - const std::string& log_path) { - if (!async_file_logger_) { - async_file_logger_ = std::make_unique(); - } - - async_file_logger_->setFileName(log_path); - async_file_logger_->setMaxLines(max_log_lines); // Keep last 100000 lines - async_file_logger_->startLogging(); - trantor::Logger::setOutputFunction( - [&](const char* msg, const uint64_t len) { - if (async_file_logger_) - async_file_logger_->output_(msg, len); - }, - [&]() { - if (async_file_logger_) - async_file_logger_->flush(); - }); - freopen(log_path.c_str(), "w", stderr); - freopen(log_path.c_str(), "w", stdout); - return true; -} - -void PythonEngine::SetLogLevel(trantor::Logger::LogLevel log_level) { - trantor::Logger::setLogLevel(log_level); + 
callback(std::move(status), std::move(response_json)); } -void PythonEngine::Load(EngineLoadOption opts) { - // Develop register model here on loading engine -}; - -void PythonEngine::Unload(EngineUnloadOption opts) { - for (const auto& pair : models_) { - TerminateModelProcess(pair.first); - } -}; - -} // namespace python_engine \ No newline at end of file +} // namespace python_engine diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 76d82c961..bf993bcbe 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -10,7 +10,7 @@ #include "config/model_config.h" #include "trantor/utils/ConcurrentTaskQueue.h" -#include "cortex-common/EngineI.h" +#include "cortex-common/python_enginei.h" #include "extensions/template_renderer.h" #include "utils/file_logger.h" #include "utils/file_manager_utils.h" @@ -33,11 +33,11 @@ struct CurlResponse { }; // UV-related functions -cpp::result DownloadUv(std::shared_ptr download_service); +cpp::result DownloadUv(std::shared_ptr& download_service); std::string GetUvPath(); bool IsUvInstalled(); -class PythonEngine : public EngineI { +class PythonEngine : public PythonEngineI { private: // Model configuration @@ -49,69 +49,18 @@ class PythonEngine : public EngineI { std::unordered_map process_map_; trantor::ConcurrentTaskQueue q_; - // Helper functions - CurlResponse MakePostRequest(const std::string& model, - const std::string& path, - const std::string& body); - CurlResponse MakeGetRequest(const std::string& model, - const std::string& path); - CurlResponse MakeDeleteRequest(const std::string& model, - const std::string& path); - CurlResponse MakeStreamPostRequest( - const std::string& model, const std::string& path, - const std::string& body, - const std::function& callback); - - // Process manager functions - bool TerminateModelProcess(const std::string& model); - - // Internal model management - bool LoadModelConfig(const std::string& model, const std::string& yaml_path); - config::PythonModelConfig* GetModelConfig(const std::string& model); - public: PythonEngine(); ~PythonEngine(); - void Load(EngineLoadOption opts) override; - - void Unload(EngineUnloadOption opts) override; - - // Main interface implementations - void GetModels( - std::shared_ptr json_body, - std::function&& callback) override; - - void HandleChatCompletion( - std::shared_ptr json_body, - std::function&& callback) override; - void LoadModel( - std::shared_ptr json_body, - std::function&& callback) override; - - void UnloadModel( - std::shared_ptr json_body, - std::function&& callback) override; + std::shared_ptr json_body, + std::function&& callback) override; - void GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) override; - - // Other required virtual functions - void HandleEmbedding( - std::shared_ptr json_body, - std::function&& callback) override; - bool IsSupported(const std::string& feature) override; - bool SetFileLogger(int max_log_lines, const std::string& log_path) override; - void SetLogLevel(trantor::Logger::LogLevel logLevel) override; - void HandleRouteRequest( - std::shared_ptr json_body, - std::function&& callback) override; - void HandleInference( + void HandleRequest( + const std::string& model, + const std::vector& path_parts, std::shared_ptr json_body, std::function&& callback) override; - Json::Value GetRemoteModels() override; - void StopInferencing(const std::string& model_id) override; }; -} // namespace 
python_engine \ No newline at end of file +} // namespace python_engine diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index 4ea9ebdfd..07bd3a306 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -182,6 +182,30 @@ cpp::result InferenceService::HandleRouteRequest( return {}; } +InferResult InferenceService::HandlePython( + const std::string& model, const std::vector& path_parts, + std::shared_ptr json_body) { + + Json::Value stt, res; + + auto engine_result = engine_service_->GetLoadedEngine(kPythonEngine); + if (engine_result.has_error()) { + res["message"] = "Python engine is not loaded yet"; + stt["status_code"] = drogon::k400BadRequest; + LOG_WARN << "Python engine is not loaded yet"; + return std::make_pair(stt, res); + } + + auto cb = [&stt, &res](Json::Value s, Json::Value r) { + stt = s; + res = r; + }; + std::get(engine_result.value()) + ->HandleRequest(model, path_parts, json_body, cb); + + return std::make_pair(stt, res); +} + InferResult InferenceService::LoadModel( std::shared_ptr json_body) { std::string engine_type; @@ -204,17 +228,20 @@ InferResult InferenceService::LoadModel( } // might need mutex here - auto engine_result = engine_service_->GetLoadedEngine(engine_type); + auto engine = engine_service_->GetLoadedEngine(engine_type).value(); auto cb = [&stt, &r](Json::Value status, Json::Value res) { stt = status; r = res; }; - if (std::holds_alternative(engine_result.value())) { - std::get(engine_result.value()) + if (std::holds_alternative(engine)) { + std::get(engine) + ->LoadModel(json_body, std::move(cb)); + } else if (std::holds_alternative(engine)) { + std::get(engine) ->LoadModel(json_body, std::move(cb)); } else { - std::get(engine_result.value()) + std::get(engine) ->LoadModel(json_body, std::move(cb)); } if (!engine_service_->IsRemoteEngine(engine_type)) { @@ -340,47 +367,8 @@ InferResult InferenceService::FineTuning( Json::Value r; Json::Value stt; - // TODO: namh refactor this - // if (engines_.find(ne) == engines_.end()) { - // try { - // std::string abs_path = - // (getenv("ENGINE_PATH") - // ? 
getenv("ENGINE_PATH") - // : file_manager_utils::GetCortexDataPath().string()) + - // kPythonRuntimeLibPath; - // engines_[ne].dl = std::make_unique(abs_path, "engine"); - // } catch (const cortex_cpp::dylib::load_error& e) { - // - // LOG_ERROR << "Could not load engine: " << e.what(); - // engines_.erase(ne); - // - // Json::Value res; - // r["message"] = "Could not load engine " + ne; - // stt["status_code"] = drogon::k500InternalServerError; - // return std::make_pair(stt, r); - // } - // - // auto func = - // engines_[ne].dl->get_function("get_engine"); - // engines_[ne].engine = func(); - // LOG_INFO << "Loaded engine: " << ne; - // } - // - // LOG_TRACE << "Start to fine-tuning"; - // auto& en = std::get(engines_[ne].engine); - // if (en->IsSupported("HandlePythonFileExecutionRequest")) { - // en->HandlePythonFileExecutionRequest( - // json_body, [&r, &stt](Json::Value status, Json::Value res) { - // r = res; - // stt = status; - // }); - // } else { - // LOG_WARN << "Method is not supported yet"; r["message"] = "Method is not supported yet"; stt["status_code"] = drogon::k500InternalServerError; - // return std::make_pair(stt, r); - // } - // LOG_TRACE << "Done fine-tuning"; return std::make_pair(stt, r); } diff --git a/engine/services/inference_service.h b/engine/services/inference_service.h index 726275bba..874ce8c85 100644 --- a/engine/services/inference_service.h +++ b/engine/services/inference_service.h @@ -48,6 +48,10 @@ class InferenceService { cpp::result HandleRouteRequest( std::shared_ptr q, std::shared_ptr json_body); + InferResult HandlePython( + const std::string& model, const std::vector& path_parts, + std::shared_ptr json_body); + InferResult LoadModel(std::shared_ptr json_body); InferResult UnloadModel(const std::string& engine, From c5da0ee70e61e36a30d5018c5001f43ce95ac9ff Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 10:04:12 +0800 Subject: [PATCH 08/73] more checks to match all EngineV variants --- .../extensions/python-engine/python_engine.cc | 21 ++++++++++ .../extensions/python-engine/python_engine.h | 17 ++++++-- engine/services/inference_service.cc | 39 +++++++++++++++---- 3 files changed, 66 insertions(+), 11 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 7ab970127..0b8efa6e9 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -227,4 +227,25 @@ void PythonEngine::HandleRequest( callback(std::move(status), std::move(response_json)); } +void PythonEngine::UnloadModel( + std::shared_ptr json_body, + std::function&& callback) { + + assert(false && "Not implemented"); +} + +void PythonEngine::GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) { + + assert(false && "Not implemented"); +} + +void PythonEngine::GetModels( + std::shared_ptr jsonBody, + std::function&& callback) { + + assert(false && "Not implemented"); +} + } // namespace python_engine diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index bf993bcbe..717bb1b4e 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -56,11 +56,20 @@ class PythonEngine : public PythonEngineI { void LoadModel( std::shared_ptr json_body, std::function&& callback) override; + void UnloadModel( + std::shared_ptr json_body, + std::function&& callback) override; + void GetModelStatus( + std::shared_ptr json_body, 
+ std::function&& callback) override; + void GetModels( + std::shared_ptr jsonBody, + std::function&& callback) override; void HandleRequest( - const std::string& model, - const std::vector& path_parts, - std::shared_ptr json_body, - std::function&& callback) override; + const std::string& model, + const std::vector& path_parts, + std::shared_ptr json_body, + std::function&& callback) override; }; } // namespace python_engine diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index 07bd3a306..aac314399 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -4,6 +4,14 @@ #include "utils/function_calling/common.h" #include "utils/jinja_utils.h" +static InferResult GetUnsupportedResponse(const std::string& msg) { + Json::Value res, stt; + res["message"] = msg; + stt["status_code"] = drogon::k400BadRequest; + LOG_WARN << msg; + return std::make_pair(stt, res); +} + cpp::result InferenceService::HandleChatCompletion( std::shared_ptr q, std::shared_ptr json_body) { std::string engine_type; @@ -38,7 +46,7 @@ cpp::result InferenceService::HandleChatCompletion( LOG_WARN << "Engine is not loaded yet"; return cpp::fail(std::make_pair(stt, res)); } - + if (!model_id.empty()) { if (auto model_service = model_service_.lock()) { auto metadata_ptr = model_service->GetCachedModelMetadata(model_id); @@ -84,6 +92,9 @@ cpp::result InferenceService::HandleChatCompletion( if (std::holds_alternative(engine_result.value())) { std::get(engine_result.value()) ->HandleChatCompletion(json_body, std::move(cb)); + } else if (std::holds_alternative(engine_result.value())) { + return cpp::fail(GetUnsupportedResponse( + "Python engine does not support Chat completion")); } else { std::get(engine_result.value()) ->HandleChatCompletion(json_body, std::move(cb)); @@ -117,6 +128,9 @@ cpp::result InferenceService::HandleEmbedding( if (std::holds_alternative(engine_result.value())) { std::get(engine_result.value()) ->HandleEmbedding(json_body, std::move(cb)); + } else if (std::holds_alternative(engine_result.value())) { + return cpp::fail(GetUnsupportedResponse( + "Python engine does not support Embedding")); } else { std::get(engine_result.value()) ->HandleEmbedding(json_body, std::move(cb)); @@ -274,11 +288,15 @@ InferResult InferenceService::UnloadModel(const std::string& engine_name, stt = status; r = res; }; - if (std::holds_alternative(engine_result.value())) { - std::get(engine_result.value()) + auto engine = engine_result.value(); + if (std::holds_alternative(engine)) { + std::get(engine) + ->UnloadModel(std::make_shared(json_body), std::move(cb)); + } else if (std::holds_alternative(engine)) { + std::get(engine) ->UnloadModel(std::make_shared(json_body), std::move(cb)); } else { - std::get(engine_result.value()) + std::get(engine) ->UnloadModel(std::make_shared(json_body), std::move(cb)); } @@ -312,11 +330,15 @@ InferResult InferenceService::GetModelStatus( stt = status; r = res; }; - if (std::holds_alternative(engine_result.value())) { - std::get(engine_result.value()) + auto engine = engine_result.value(); + if (std::holds_alternative(engine)) { + std::get(engine) + ->GetModelStatus(json_body, std::move(cb)); + } else if (std::holds_alternative(engine)) { + std::get(engine) ->GetModelStatus(json_body, std::move(cb)); } else { - std::get(engine_result.value()) + std::get(engine) ->GetModelStatus(json_body, std::move(cb)); } @@ -348,6 +370,9 @@ InferResult InferenceService::GetModels( if (e->IsSupported("GetModels")) { 
e->GetModels(json_body, std::move(cb)); } + } else if (std::holds_alternative(loaded_engine)) { + std::get(loaded_engine) + ->GetModels(json_body, std::move(cb)); } else { std::get(loaded_engine) ->GetModels(json_body, std::move(cb)); From 3c097fbdecd25e0ca40a9b0f8c045eba1ecda670 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 11:30:12 +0800 Subject: [PATCH 09/73] improve Python load model --- engine/cortex-common/python_enginei.h | 18 +- .../extensions/python-engine/python_engine.cc | 177 +++++++----------- .../extensions/python-engine/python_engine.h | 16 +- 3 files changed, 84 insertions(+), 127 deletions(-) diff --git a/engine/cortex-common/python_enginei.h b/engine/cortex-common/python_enginei.h index 31dc76c80..481b6b146 100644 --- a/engine/cortex-common/python_enginei.h +++ b/engine/cortex-common/python_enginei.h @@ -11,17 +11,19 @@ class PythonEngineI { // virtual bool IsSupported(const std::string& f) = 0; - // virtual void ExecutePythonFile(std::string binary_execute_path, - // std::string file_execution_path, - // std::string python_library_path) = 0; - - // virtual void HandlePythonFileExecutionRequest( - // std::shared_ptr json_body, - // std::function&& callback) = 0; - + // model management virtual void LoadModel( std::shared_ptr json_body, std::function&& callback) = 0; + virtual void UnloadModel( + std::shared_ptr json_body, + std::function&& callback) = 0; + virtual void GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) = 0; + virtual void GetModels( + std::shared_ptr jsonBody, + std::function&& callback) = 0; virtual void HandleRequest( const std::string& model, diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 0b8efa6e9..49b2835a5 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -11,57 +11,6 @@ constexpr const int k400BadRequest = 400; constexpr const int k409Conflict = 409; constexpr const int k500InternalServerError = 500; constexpr const int kFileLoggerOption = 0; - -size_t StreamWriteCallback(char* ptr, size_t size, size_t nmemb, - void* userdata) { - auto* context = static_cast(userdata); - std::string chunk(ptr, size * nmemb); - - context->buffer += chunk; - - // Process complete lines - size_t pos; - while ((pos = context->buffer.find('\n')) != std::string::npos) { - std::string line = context->buffer.substr(0, pos); - context->buffer = context->buffer.substr(pos + 1); - LOG_DEBUG << "line: " << line; - - // Skip empty lines - if (line.empty() || line == "\r") - continue; - - if (line == "data: [DONE]") { - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = true; - status["status_code"] = 200; - (*context->callback)(std::move(status), Json::Value()); - break; - } - - // Parse the JSON - Json::Value chunk_json; - chunk_json["data"] = line + "\n\n"; - Json::Reader reader; - - Json::Value status; - status["is_done"] = false; - status["has_error"] = false; - status["is_stream"] = true; - status["status_code"] = 200; - (*context->callback)(std::move(status), std::move(chunk_json)); - } - - return size * nmemb; -} - -static size_t WriteCallback(char* ptr, size_t size, size_t nmemb, - std::string* data) { - data->append(ptr, size * nmemb); - return size * nmemb; -} - } // namespace cpp::result DownloadUv(std::shared_ptr& download_service) { @@ -127,18 +76,35 @@ PythonEngine::~PythonEngine() { curl_global_cleanup(); } +static std::pair 
CreateResponse( + const std::string& msg, int code) { + + Json::Value status, res; + const bool has_error = code != k200OK; + + status["is_done"] = true; + status["has_error"] = has_error; + status["is_stream"] = false; + status["status_code"] = code; + + if (has_error) { + CTL_ERR(msg); + res["error"] = msg; + } + else { + res["status"] = msg; + } + + return {status, res}; +} + void PythonEngine::LoadModel( std::shared_ptr json_body, std::function&& callback) { if (!json_body->isMember("model") || !json_body->isMember("model_dir")) { - Json::Value error; - error["error"] = "Missing required fields: model or model_dir"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; + auto [status, error] = CreateResponse( + "Missing required fields: model or model_dir", k400BadRequest); callback(std::move(status), std::move(error)); return; } @@ -148,24 +114,34 @@ void PythonEngine::LoadModel( const std::string& model = (*json_body)["model"].asString(); const fs::path model_dir = (*json_body)["model_dir"].asString(); - if (models_.find(model) != models_.end()) { - Json::Value error; - error["error"] = "Model already loaded!"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k409Conflict; + if (model_process_map.find(model) != model_process_map.end()) { + auto [status, error] = CreateResponse( + "Model already loaded!", k409Conflict); callback(std::move(status), std::move(error)); return; } pid_t pid; - try { - std::vector command{GetUvPath(), "run", model_dir / "main.py"}; + auto model_config = YAML::LoadFile(model_dir / "model.yml"); + if (!model_config["entrypoint"]) + throw std::runtime_error("`entrypoint` is not defined in model.yml"); + if (!model_config["port"]) + throw std::runtime_error("`port` is not defined in model.yaml"); + + const std::string entrypoint = model_config["entrypoint"].as(); + const int port = model_config["port"].as(); + + // NOTE: model_dir / entrypoint assumes a Python script + // TODO: figure out if we can support arbitrary CLI (but still launch by uv) + std::vector command{GetUvPath(), "run", model_dir / entrypoint}; + + auto extra_args_node = model_config["extra_args"]; + if (extra_args_node && extra_args_node.IsSequence()) { + for (int i = 0; i < extra_args_node.size(); i++) + command.push_back(extra_args_node[i].as()); + } - // TODO: what happens if the process exits? const std::string stdout_path = model_dir / "stdout.txt"; const std::string stderr_path = model_dir / "stderr.txt"; @@ -173,58 +149,25 @@ void PythonEngine::LoadModel( if (!std::filesystem::exists(stdout_path)) std::ofstream(stdout_path).flush(); if (!std::filesystem::exists(stderr_path)) std::ofstream(stderr_path).flush(); + // TODO: what happens if the process starts, but exits? 
pid = cortex::process::SpawnProcess(command, stdout_path, stderr_path); - - process_map_[model] = pid; if (pid == -1) { throw std::runtime_error("Fail to spawn process with pid -1"); } - } catch (const std::exception& e) { - std::unique_lock lock(models_mutex_); - if (models_.find(model) != models_.end()) { - models_.erase(model); - } + std::unique_lock write_lock(mutex); + model_process_map[model] = {pid, port}; - Json::Value error; - error["error"] = e.what(); - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; + } catch (const std::exception& e) { + auto e_msg = e.what(); + auto [status, error] = CreateResponse(e_msg, k500InternalServerError); callback(std::move(status), std::move(error)); return; } - Json::Value response; - response["status"] = - "Model loaded successfully with pid: " + std::to_string(pid); - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - callback(std::move(status), std::move(response)); -} - -void PythonEngine::HandleRequest( - const std::string& model, - const std::vector& path_parts, - std::shared_ptr json_body, - std::function&& callback) { - - // get port - - Json::Value response_json; - response_json["object"] = "list"; - - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(response_json)); + auto [status, res] = CreateResponse( + "Model loaded successfully with pid: " + std::to_string(pid), + k200OK); + callback(std::move(status), std::move(res)); } void PythonEngine::UnloadModel( @@ -248,4 +191,14 @@ void PythonEngine::GetModels( assert(false && "Not implemented"); } +void PythonEngine::HandleRequest( + const std::string& model, + const std::vector& path_parts, + std::shared_ptr json_body, + std::function&& callback) { + + assert(false && "Not implemented"); + // get port +} + } // namespace python_engine diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 717bb1b4e..553f49b9b 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -39,14 +39,16 @@ bool IsUvInstalled(); class PythonEngine : public PythonEngineI { private: - // Model configuration + // extensions::TemplateRenderer renderer_; + // std::unique_ptr async_file_logger_; - // Thread-safe model config storage - mutable std::shared_mutex models_mutex_; - std::unordered_map models_; - extensions::TemplateRenderer renderer_; - std::unique_ptr async_file_logger_; - std::unordered_map process_map_; + struct PythonSubprocess { + pid_t pid; + int port; + }; + + mutable std::shared_mutex mutex; + std::unordered_map model_process_map; trantor::ConcurrentTaskQueue q_; public: From 84db8b0857857bad11d66fe751c8ab09f401b3e5 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 14:40:15 +0800 Subject: [PATCH 10/73] consolidate process-related functions --- engine/utils/process/utils.cc | 69 ++++++++++++++++++++++++++++++++++- engine/utils/process/utils.h | 2 + 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 94433367b..624b62262 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -2,8 +2,11 @@ #include "utils/logging_utils.h" 
#include -#if defined(__APPLE__) || defined(__linux__) +#ifdef _WIN32 +#include +#elif defined(__APPLE__) || defined(__linux__) extern char **environ; // environment variables +#include #include #endif @@ -138,4 +141,66 @@ pid_t SpawnProcess(const std::vector& command, } } -} // namespace cortex::process \ No newline at end of file +bool IsProcessAlive(pid_t pid) { +#ifdef _WIN32 + // Windows implementation + HANDLE snapshot = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0); + if (snapshot == INVALID_HANDLE_VALUE) { + return false; + } + + PROCESSENTRY32 processEntry = {0}; + processEntry.dwSize = sizeof(processEntry); + + if (Process32First(snapshot, &processEntry)) { + do { + if (processEntry.th32ProcessID == pid) { + CloseHandle(snapshot); + return true; + } + } while (Process32Next(snapshot, &processEntry)); + } + + CloseHandle(snapshot); + return false; + +#elif defined(__APPLE__) || defined(__linux__) + // Unix-like systems (Linux and macOS) implementation + if (pid <= 0) { + return false; + } + + // Try to send signal 0 to the process + // This doesn't actually send a signal but checks if we can send signals to the process + int result = kill(pid, 0); + + if (result == 0) { + return true; // Process exists and we have permission to send it signals + } + + return errno != ESRCH; // ESRCH means "no such process" +#else +#error "Unsupported platform" +#endif +} + +bool KillProcess(pid_t pid) { +#if defined(_WIN32) + HANDLE hProcess = OpenProcess(PROCESS_TERMINATE, FALSE, pid); + if (hProcess == NULL) { + LOG_ERROR << "Failed to open process"; + return false; + } + + bool is_success = TerminateProcess(hProcess, 0) == TRUE; + CloseHandle(hProcess); + return is_success; +#elif defined(__APPLE__) || defined(__linux__) + // NOTE: should we use SIGKILL here to be consistent with Windows? + return kill(pid, SIGTERM) == 0; +#else +#error "Unsupported platform" +#endif +} + +} // namespace cortex::process diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index 54f34e919..813d53750 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -23,5 +23,7 @@ std::vector ConvertToArgv(const std::vector& args); pid_t SpawnProcess(const std::vector& command, const std::optional stdout_file = {}, const std::optional stderr_file = {}); +bool IsProcessAlive(pid_t pid); +bool KillProcess(pid_t pid); } From 8ee815c8804b61cd7b350e278a635ec608f8c57f Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 14:41:10 +0800 Subject: [PATCH 11/73] update PythonModelConfig. 
add UnloadModel --- engine/config/model_config.h | 347 ++++-------------- engine/controllers/models.cc | 2 +- .../extensions/python-engine/python_engine.cc | 98 ++++- .../extensions/python-engine/python_engine.h | 3 + engine/services/model_service.cc | 66 +--- 5 files changed, 155 insertions(+), 361 deletions(-) diff --git a/engine/config/model_config.h b/engine/config/model_config.h index 1d51cfb01..85335c37b 100644 --- a/engine/config/model_config.h +++ b/engine/config/model_config.h @@ -478,108 +478,41 @@ struct Endpoint { struct PythonModelConfig { // General Metadata - std::string id; - std::string model; std::string name; int version; - // Inference Parameters - Endpoint load_model; - Endpoint destroy; - Endpoint inference; - Endpoint heath_check; - std::vector extra_endpoints; - // Model Load Parameters - std::string port; - std::string script; - std::string log_path; - std::string log_level; - std::string environment; - std::vector command; // New command field - std::vector files; - std::vector depends; std::string engine; - Json::Value extra_params; // Accept dynamic extra parameters + std::string entrypoint; + int port; + std::vector extra_args; // Method to convert C++ struct to YAML void ToYaml(const std::string& filepath) const { YAML::Emitter out; out << YAML::BeginMap; - out << YAML::Key << "id" << YAML::Value << id; - out << YAML::Key << "model" << YAML::Value << model; + // General Metadata out << YAML::Key << "name" << YAML::Value << name; out << YAML::Key << "version" << YAML::Value << version; - // Inference Parameters - out << YAML::Key << "load_model" << YAML::Value << YAML::BeginMap; - out << YAML::Key << "method" << YAML::Value << load_model.method; - out << YAML::Key << "path" << YAML::Value << load_model.path; - out << YAML::Key << "transform_request" << YAML::Value - << load_model.transform_request; - out << YAML::Key << "transform_response" << YAML::Value - << load_model.transform_response; - out << YAML::EndMap; - - out << YAML::Key << "destroy" << YAML::Value << YAML::BeginMap; - out << YAML::Key << "method" << YAML::Value << destroy.method; - out << YAML::Key << "path" << YAML::Value << destroy.path; - out << YAML::EndMap; - - out << YAML::Key << "inference" << YAML::Value << YAML::BeginMap; - out << YAML::Key << "method" << YAML::Value << inference.method; - out << YAML::Key << "path" << YAML::Value << inference.path; - out << YAML::EndMap; - - out << YAML::Key << "extra_endpoints" << YAML::Value << YAML::BeginSeq; - for (const auto& endpoint : extra_endpoints) { - out << YAML::BeginMap; - out << YAML::Key << "method" << YAML::Value << endpoint.method; - out << YAML::Key << "path" << YAML::Value << endpoint.path; - out << YAML::EndMap; - } - out << YAML::EndSeq; - // Model Load Parameters + out << YAML::Key << "engine" << YAML::Value << engine; + out << YAML::Key << "entrypoint" << YAML::Value << entrypoint; out << YAML::Key << "port" << YAML::Value << port; - out << YAML::Key << "script" << YAML::Value << script; - out << YAML::Key << "log_path" << YAML::Value << log_path; - out << YAML::Key << "log_level" << YAML::Value << log_level; - out << YAML::Key << "environment" << YAML::Value << environment; - - // Serialize command as YAML list - out << YAML::Key << "command" << YAML::Value << YAML::BeginSeq; - for (const auto& cmd : command) { - out << cmd; - } - out << YAML::EndSeq; - // Serialize files as YAML list - out << YAML::Key << "files" << YAML::Value << YAML::BeginSeq; - for (const auto& file : files) { - out << file; - } - out << YAML::EndSeq; - - // 
Serialize command as YAML list - out << YAML::Key << "depends" << YAML::Value << YAML::BeginSeq; - for (const auto& depend : depends) { - out << depend; + // Extra Arguments + if (!extra_args.empty()) { + out << YAML::Key << "extra_args" << YAML::Value << YAML::BeginSeq; + for (const auto& arg : extra_args) { + out << arg; + } + out << YAML::EndSeq; } - out << YAML::EndSeq; - - out << YAML::Key << "engine" << YAML::Value << engine; - // Serialize extra_params as YAML - out << YAML::Key << "extra_params" << YAML::Value << YAML::BeginMap; - for (Json::ValueConstIterator iter = extra_params.begin(); - iter != extra_params.end(); ++iter) { - out << YAML::Key << iter.key().asString() << YAML::Value - << iter->asString(); - } out << YAML::EndMap; + // Write to file std::ofstream fout(filepath); if (!fout.is_open()) { throw std::runtime_error("Failed to open file for writing: " + filepath); @@ -589,218 +522,82 @@ struct PythonModelConfig { // Method to populate struct from YAML file void ReadFromYaml(const std::string& filePath) { - YAML::Node config = YAML::LoadFile(filePath); - - if (config["id"]) - id = config["id"].as(); - if (config["model"]) - model = config["model"].as(); - if (config["name"]) - name = config["name"].as(); - if (config["version"]) - version = config["version"].as(); - - // Inference Parameters - - auto ip = config; - if (ip["load_model"]) { - load_model.method = ip["load_model"]["method"].as(); - load_model.path = ip["load_model"]["path"].as(); - load_model.transform_request = - ip["load_model"]["transform_request"].as(); - load_model.transform_response = - ip["load_model"]["transform_response"].as(); - } - if (ip["destroy"]) { - destroy.method = ip["destroy"]["method"].as(); - destroy.path = ip["destroy"]["path"].as(); - } - if (ip["inference"]) { - inference.method = ip["inference"]["method"].as(); - inference.path = ip["inference"]["path"].as(); - } - if (ip["extra_endpoints"] && ip["extra_endpoints"].IsSequence()) { - for (const auto& endpoint : ip["extra_endpoints"]) { - Endpoint e; - e.method = endpoint["method"].as(); - e.path = endpoint["path"].as(); - extra_endpoints.push_back(e); - } - } - - // Model Load Parameters - - auto mlp = config; - if (mlp["port"]) - port = mlp["port"].as(); - if (mlp["script"]) - script = mlp["script"].as(); - if (mlp["log_path"]) - log_path = mlp["log_path"].as(); - if (mlp["log_level"]) - log_level = mlp["log_level"].as(); - if (mlp["environment"]) - environment = mlp["environment"].as(); - if (mlp["engine"]) - engine = mlp["engine"].as(); - - if (mlp["command"] && mlp["command"].IsSequence()) { - for (const auto& cmd : mlp["command"]) { - command.push_back(cmd.as()); + try { + YAML::Node config = YAML::LoadFile(filePath); + + // General Metadata + if (config["name"]) name = config["name"].as(); + if (config["version"]) version = config["version"].as(); + + // Model Load Parameters + if (config["engine"]) engine = config["engine"].as(); + if (config["entrypoint"]) entrypoint = config["entrypoint"].as(); + if (config["port"]) port = config["port"].as(); + + // Extra Arguments + if (config["extra_args"] && config["extra_args"].IsSequence()) { + extra_args.clear(); + for (const auto& arg : config["extra_args"]) { + extra_args.push_back(arg.as()); + } } } - - if (mlp["files"] && mlp["files"].IsSequence()) { - for (const auto& file : mlp["files"]) { - files.push_back(file.as()); - } + catch (const YAML::Exception& e) { + throw std::runtime_error("Error parsing YAML file: " + std::string(e.what())); } - - if (mlp["depends"] && 
mlp["depends"].IsSequence()) { - for (const auto& depend : mlp["depends"]) { - depends.push_back(depend.as()); - } - } - - if (mlp["extra_params"]) { - for (YAML::const_iterator it = mlp["extra_params"].begin(); - it != mlp["extra_params"].end(); ++it) { - extra_params[it->first.as()] = - it->second.as(); - } + catch (const std::exception& e) { + throw std::runtime_error("Error reading YAML file: " + std::string(e.what())); } } // Method to convert the struct to JSON Json::Value ToJson() const { - Json::Value root; - - root["id"] = id; - root["model"] = model; - root["name"] = name; - root["version"] = version; - - // Inference Parameters - root["load_model"]["method"] = load_model.method; - root["load_model"]["path"] = load_model.path; - root["load_model"]["transform_request"] = load_model.transform_request; - root["load_model"]["transform_response"] = load_model.transform_response; - - root["destroy"]["method"] = destroy.method; - root["destroy"]["path"] = destroy.path; - - root["inference"]["method"] = inference.method; - root["inference"]["path"] = inference.path; - - for (const auto& endpoint : extra_endpoints) { - Json::Value e; - e["method"] = endpoint.method; - e["path"] = endpoint.path; - root["extra_endpoints"].append(e); - } - - // Model Load Parameters - root["port"] = port; - root["log_path"] = log_path; - root["log_level"] = log_level; - root["environment"] = environment; - root["script"] = script; - - // Serialize command as JSON array - for (const auto& cmd : command) { - root["command"].append(cmd); - } + Json::Value json; - for (const auto& file : files) { - root["files"].append(file); - } + // Add basic string fields + json["name"] = name; + json["version"] = version; + json["engine"] = engine; + json["entrypoint"] = entrypoint; + json["port"] = port; - for (const auto& depend : depends) { - root["depends"].append(depend); + // Add extra_args array + if (!extra_args.empty()) { + Json::Value args(Json::arrayValue); + for (const auto& arg : extra_args) { + args.append(arg); + } + json["extra_args"] = args; } - root["engine"] = engine; - root["extra_params"] = extra_params; // Serialize the JSON value directly - - return root; + return json; } // Method to populate struct from JSON void FromJson(const Json::Value& root) { - - if (root.isMember("id")) - id = root["id"].asString(); - if (root.isMember("model")) - model = root["model"].asString(); - if (root.isMember("name")) - name = root["name"].asString(); - if (root.isMember("version")) - version = root["version"].asInt(); - - // Inference Parameters - - const Json::Value& ip = root; - if (ip.isMember("load_model")) { - load_model.method = ip["load_model"]["method"].asString(); - load_model.path = ip["load_model"]["path"].asString(); - load_model.transform_request = - ip["load_model"]["transform_request"].asString(); - load_model.transform_response = - ip["load_model"]["transform_response"].asString(); - } - if (ip.isMember("destroy")) { - destroy.method = ip["destroy"]["method"].asString(); - destroy.path = ip["destroy"]["path"].asString(); - } - if (ip.isMember("inference")) { - inference.method = ip["inference"]["method"].asString(); - inference.path = ip["inference"]["path"].asString(); - } - if (ip.isMember("extra_endpoints")) { - for (const auto& endpoint : ip["extra_endpoints"]) { - Endpoint e; - e.method = endpoint["method"].asString(); - e.path = endpoint["path"].asString(); - extra_endpoints.push_back(e); - } - } - - // Model Load Parameters - - const Json::Value& mlp = root; - if (mlp.isMember("port")) - port = 
mlp["port"].asString(); - if (mlp.isMember("log_path")) - log_path = mlp["log_path"].asString(); - if (mlp.isMember("log_level")) - log_level = mlp["log_level"].asString(); - if (mlp.isMember("environment")) - environment = mlp["environment"].asString(); - if (mlp.isMember("engine")) - engine = mlp["engine"].asString(); - if (mlp.isMember("script")) - script = mlp["script"].asString(); - - if (mlp.isMember("command")) { - for (const auto& cmd : mlp["command"]) { - command.push_back(cmd.asString()); - } - } - - if (mlp.isMember("files")) { - for (const auto& file : mlp["files"]) { - files.push_back(file.asString()); - } + if (!root.isObject()) { + throw std::runtime_error("Input JSON must be an object"); } - - if (mlp.isMember("depends")) { - for (const auto& depend : mlp["depends"]) { - depends.push_back(depend.asString()); + try { + // Basic fields + name = root.get("name", name).asString(); + version = root.get("version", version).asInt(); + engine = root.get("engine", engine).asString(); + entrypoint = root.get("entrypoint", entrypoint).asString(); + port = root.get("port", port).asInt(); + + // Extra args array + extra_args.clear(); + const Json::Value& args = root["extra_args"]; + if (args.isArray()) { + for (const auto& arg : args) { + extra_args.push_back(arg.asString()); + } } - } - - if (mlp.isMember("extra_params")) { - extra_params = mlp["extra_params"]; // Directly assign the JSON value + } catch (const Json::Exception& e) { + throw std::runtime_error("Error parsing JSON: " + std::string(e.what())); + } catch (const std::exception& e) { + throw std::runtime_error("Error processing JSON data: " + std::string(e.what())); } } }; diff --git a/engine/controllers/models.cc b/engine/controllers/models.cc index ac1f55d8f..1f0bb38ce 100644 --- a/engine/controllers/models.cc +++ b/engine/controllers/models.cc @@ -309,7 +309,7 @@ void Models::GetModel(const HttpRequestPtr& req, fs::path(model_entry.value().path_to_model_yaml)) .string()); ret = python_model_config.ToJson(); - ret["id"] = python_model_config.model; + ret["id"] = python_model_config.name; ret["object"] = "model"; ret["result"] = "OK"; auto resp = cortex_utils::CreateCortexHttpJsonResponse(ret); diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 49b2835a5..51d047310 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -65,11 +65,17 @@ cpp::result DownloadUv(std::shared_ptr& down std::string GetUvPath() { return file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; } - bool IsUvInstalled() { return std::filesystem::exists(GetUvPath()); } +bool PythonEngine::PythonSubprocess::IsAlive() { + return cortex::process::IsProcessAlive(pid); +} +bool PythonEngine::PythonSubprocess::Kill() { + return cortex::process::KillProcess(pid); +} + PythonEngine::PythonEngine() : q_(4 /*n_parallel*/, "python_engine") {} PythonEngine::~PythonEngine() { @@ -111,7 +117,7 @@ void PythonEngine::LoadModel( namespace fs = std::filesystem; - const std::string& model = (*json_body)["model"].asString(); + const std::string model = (*json_body)["model"].asString(); const fs::path model_dir = (*json_body)["model_dir"].asString(); if (model_process_map.find(model) != model_process_map.end()) { @@ -123,24 +129,14 @@ void PythonEngine::LoadModel( pid_t pid; try { - auto model_config = YAML::LoadFile(model_dir / "model.yml"); - if (!model_config["entrypoint"]) - throw std::runtime_error("`entrypoint` is not 
defined in model.yml"); - if (!model_config["port"]) - throw std::runtime_error("`port` is not defined in model.yaml"); - - const std::string entrypoint = model_config["entrypoint"].as(); - const int port = model_config["port"].as(); + config::PythonModelConfig py_cfg; + py_cfg.ReadFromYaml(model_dir / "model.yml"); // NOTE: model_dir / entrypoint assumes a Python script // TODO: figure out if we can support arbitrary CLI (but still launch by uv) - std::vector command{GetUvPath(), "run", model_dir / entrypoint}; - - auto extra_args_node = model_config["extra_args"]; - if (extra_args_node && extra_args_node.IsSequence()) { - for (int i = 0; i < extra_args_node.size(); i++) - command.push_back(extra_args_node[i].as()); - } + std::vector command{GetUvPath(), "run", model_dir / py_cfg.entrypoint}; + for (const auto& item : py_cfg.extra_args) + command.push_back(item); const std::string stdout_path = model_dir / "stdout.txt"; const std::string stderr_path = model_dir / "stderr.txt"; @@ -155,7 +151,7 @@ void PythonEngine::LoadModel( throw std::runtime_error("Fail to spawn process with pid -1"); } std::unique_lock write_lock(mutex); - model_process_map[model] = {pid, port}; + model_process_map[model] = {pid, py_cfg.port}; } catch (const std::exception& e) { auto e_msg = e.what(); @@ -174,7 +170,56 @@ void PythonEngine::UnloadModel( std::shared_ptr json_body, std::function&& callback) { - assert(false && "Not implemented"); + if (!json_body->isMember("model")) { + auto [status, error] = CreateResponse("Missing required field: model", k400BadRequest); + callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + + // check if model has started + { + std::shared_lock read_lock(mutex); + + if (model_process_map.find(model) == model_process_map.end()) { + const std::string msg = "Model " + model + " has not been loaded yet."; + auto [status, error] = CreateResponse(msg, k400BadRequest); + callback(std::move(status), std::move(error)); + return; + } + } + + // we know that model has started + { + std::unique_lock write_lock(mutex); + + // check if subprocess is still alive + if (!model_process_map[model].IsAlive()) { + const std::string msg = "Model " + model + " stopped running."; + auto [status, error] = CreateResponse(msg, k400BadRequest); + + // NOTE: do we need to do any other cleanup for subprocesses? + model_process_map.erase(model); + + callback(std::move(status), std::move(error)); + return; + } + + // subprocess is alive. we kill it here. + if (!model_process_map[model].Kill()) { + const std::string msg = "Unable to kill process of model " + model; + auto [status, error] = CreateResponse(msg, k500InternalServerError); + callback(std::move(status), std::move(error)); + return; + } + + // NOTE: do we need to do any other cleanup for subprocesses? 
+ model_process_map.erase(model); + } + + auto [status, res] = CreateResponse("Unload model successfully", k200OK); + callback(std::move(status), std::move(res)); } void PythonEngine::GetModelStatus( @@ -188,7 +233,20 @@ void PythonEngine::GetModels( std::shared_ptr jsonBody, std::function&& callback) { - assert(false && "Not implemented"); + Json::Value res, model_list(Json::arrayValue), status; + for (const auto& item : model_process_map) { + model_list.append(Json::Value{item.first}); + } + + res["object"] = "list"; + res["data"] = model_list; + + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = k200OK; + + callback(std::move(status), std::move(res)); } void PythonEngine::HandleRequest( diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 553f49b9b..71fd170e7 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -45,6 +45,9 @@ class PythonEngine : public PythonEngineI { struct PythonSubprocess { pid_t pid; int port; + + bool IsAlive(); + bool Kill(); }; mutable std::shared_mutex mutex; diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 15d5a8dc6..00bb464dc 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -506,57 +506,8 @@ ModelService::DownloadModelFromCortexsoAsync( config::YamlHandler yaml_handler; yaml_handler.ModelConfigFromFile(model_yml_item->localPath.string()); auto mc = yaml_handler.GetModelConfig(); - if (mc.engine == kPythonEngine) { // process for Python engine - config::PythonModelConfig python_model_config; - python_model_config.ReadFromYaml(model_yml_item->localPath.string()); - python_model_config.files.push_back( - model_yml_item->localPath.parent_path().string()); - python_model_config.ToYaml(model_yml_item->localPath.string()); - // unzip venv.zip - auto model_folder = model_yml_item->localPath.parent_path(); - auto venv_path = model_folder / std::filesystem::path("venv"); - if (!std::filesystem::exists(venv_path)) { - std::filesystem::create_directories(venv_path); - } - auto venv_zip = model_folder / std::filesystem::path("venv.zip"); - if (std::filesystem::exists(venv_zip)) { - if (archive_utils::ExtractArchive(venv_zip.string(), - venv_path.string())) { - std::filesystem::remove_all(venv_zip); - CTL_INF("Successfully extract venv.zip"); - // If extract success create pyvenv.cfg - std::ofstream pyvenv_cfg(venv_path / - std::filesystem::path("pyvenv.cfg")); -#ifdef _WIN32 - pyvenv_cfg << "home = " - << (venv_path / std::filesystem::path("Scripts")).string() - << std::endl; - pyvenv_cfg << "executable = " - << (venv_path / std::filesystem::path("Scripts") / - std::filesystem::path("python.exe")) - .string() - << std::endl; -#else - pyvenv_cfg << "home = " - << (venv_path / std::filesystem::path("bin/")).string() - << std::endl; - pyvenv_cfg - << "executable = " - << (venv_path / std::filesystem::path("bin/python")).string() - << std::endl; -#endif - // Close the file - pyvenv_cfg.close(); - // Add executable permission to python - set_permission_utils::SetExecutePermissionsRecursive(venv_path); - } else { - CTL_ERR("Failed to extract venv.zip"); - }; - } else { - CTL_ERR( - "venv.zip not found in model folder: " << model_folder.string()); - } + if (mc.engine == kPythonEngine) { // process for Python engine } else { mc.model = unique_model_id; @@ -986,21 +937,6 @@ cpp::result ModelService::StopModel( 
engine_name = kLlamaEngine; } - // Update for python engine - if (engine_name == kPythonEngine) { - auto model_entry = db_service_->GetModelInfo(model_handle); - config::PythonModelConfig python_model_config; - python_model_config.ReadFromYaml( - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string()); - // Stop all depends model - auto depends = python_model_config.depends; - for (auto& depend : depends) { - StopModel(depend); - } - } - // assert(inference_svc_); auto ir = inference_svc_->UnloadModel(engine_name, model_handle); From 29f53444a4ce4b00624c8a2d2e14c0e9f04a2e0d Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 15:09:22 +0800 Subject: [PATCH 12/73] implement PythonEngine::GetModels --- engine/extensions/python-engine/python_engine.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 51d047310..d15355ed3 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -234,8 +234,16 @@ void PythonEngine::GetModels( std::function&& callback) { Json::Value res, model_list(Json::arrayValue), status; - for (const auto& item : model_process_map) { - model_list.append(Json::Value{item.first}); + { + std::shared_lock read_lock(mutex); + for (const auto& [model_name, py_proc] : model_process_map) { + Json::Value val; + val["id"] = model_name; + val["engine"] = kPythonEngine; + val["port"] = py_proc.port; + val["object"] = "model"; + model_list.append(val); + } } res["object"] = "list"; From 7949dccd9816353f96ec185c8e2ea86e8409cbb8 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 15:46:38 +0800 Subject: [PATCH 13/73] implement getModelStatus. 
add some notes --- .../extensions/python-engine/python_engine.cc | 59 ++++++++++++++++--- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index d15355ed3..113b68bf7 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -120,11 +120,15 @@ void PythonEngine::LoadModel( const std::string model = (*json_body)["model"].asString(); const fs::path model_dir = (*json_body)["model_dir"].asString(); - if (model_process_map.find(model) != model_process_map.end()) { - auto [status, error] = CreateResponse( - "Model already loaded!", k409Conflict); - callback(std::move(status), std::move(error)); - return; + // TODO: check if model is still alive + { + std::shared_lock read_lock(mutex); + if (model_process_map.find(model) != model_process_map.end()) { + auto [status, error] = CreateResponse( + "Model already loaded!", k409Conflict); + callback(std::move(status), std::move(error)); + return; + } } pid_t pid; @@ -181,7 +185,6 @@ void PythonEngine::UnloadModel( // check if model has started { std::shared_lock read_lock(mutex); - if (model_process_map.find(model) == model_process_map.end()) { const std::string msg = "Model " + model + " has not been loaded yet."; auto [status, error] = CreateResponse(msg, k400BadRequest); @@ -226,7 +229,48 @@ void PythonEngine::GetModelStatus( std::shared_ptr json_body, std::function&& callback) { - assert(false && "Not implemented"); + if (!json_body->isMember("model")) { + auto [status, error] = CreateResponse("Missing required field: model", k400BadRequest); + callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + Json::Value res, status; + + // check if model has started + { + std::shared_lock read_lock(mutex); + if (model_process_map.find(model) == model_process_map.end()) { + const std::string msg = "Model " + model + " has not been loaded yet."; + auto [status, error] = CreateResponse(msg, k400BadRequest); + callback(std::move(status), std::move(error)); + return; + } + } + + // we know that model has started + { + std::unique_lock write_lock(mutex); + + // check if subprocess is still alive + if (!model_process_map[model].IsAlive()) { + const std::string msg = "Model " + model + " stopped running."; + auto [status, error] = CreateResponse(msg, k400BadRequest); + + // NOTE: do we need to do any other cleanup for subprocesses? 
+ model_process_map.erase(model); + + callback(std::move(status), std::move(error)); + return; + } + } + + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = k200OK; + callback(std::move(status), std::move(res)); } void PythonEngine::GetModels( @@ -237,6 +281,7 @@ void PythonEngine::GetModels( { std::shared_lock read_lock(mutex); for (const auto& [model_name, py_proc] : model_process_map) { + // TODO: check if py_proc is still alive Json::Value val; val["id"] = model_name; val["engine"] = kPythonEngine; From e2f0323988a34a43611411e6ed7544a7e0db919a Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 17:27:38 +0800 Subject: [PATCH 14/73] add router for python --- engine/controllers/server.cc | 50 +++++++++++++++++++ engine/controllers/server.h | 6 +++ engine/cortex-common/python_enginei.h | 7 +-- .../extensions/python-engine/python_engine.cc | 29 ++++++++--- .../extensions/python-engine/python_engine.h | 6 +-- engine/services/inference_service.cc | 21 ++------ engine/services/inference_service.h | 4 +- 7 files changed, 85 insertions(+), 38 deletions(-) diff --git a/engine/controllers/server.cc b/engine/controllers/server.cc index a8cff2166..374ca40b8 100644 --- a/engine/controllers/server.cc +++ b/engine/controllers/server.cc @@ -210,6 +210,56 @@ void server::RouteRequest( } } +void server::Python( + const HttpRequestPtr& req, + std::function&& callback, + const std::string& model) { + + const std::string& full_path = req->getPath(); + + const std::string prefix = "/v1/python/"; + if (full_path.substr(0, prefix.size()) != prefix) { + auto resp = cortex_utils::CreateCortexHttpJsonResponse( + Json::Value("Invalid path: must start with " + prefix)); + resp->setStatusCode(k400BadRequest); + callback(resp); + return; + } + + // convert /v1/python/{model}/remaining/path -> /remaning/path + const std::string path = full_path.substr(prefix.size() + model.size()); + + auto port_result = inference_svc_->GetPythonPort(model); + if (port_result.has_error()) { + auto resp = cortex_utils::CreateCortexHttpJsonResponse( + Json::Value(port_result.error())); + resp->setStatusCode(k400BadRequest); + callback(resp); + return; + } + + // route request. localhost might not work? 
+ const int port = port_result.value(); + const std::string host = "http://127.0.0.1:" + std::to_string(port); + auto client = HttpClient::newHttpClient(host); + + auto new_req = HttpRequest::newHttpRequest(); + new_req->setMethod(req->method()); + new_req->setPath(path); + new_req->setBody(std::string{req->body()}); + new_req->setContentTypeCode(req->getContentType()); + + for (const auto& [field, value] : req->headers()) { + new_req->addHeader(field, value); + } + + CTL_INF("Route request to " << host << path); + auto cb = [callback](ReqResult result, const HttpResponsePtr& response) { + callback(response); + }; + client->sendRequest(new_req, cb); +} + void server::LoadModel(const HttpRequestPtr& req, std::function&& callback) { auto ir = inference_svc_->LoadModel(req->getJsonObject()); diff --git a/engine/controllers/server.h b/engine/controllers/server.h index 42214a641..99b545d0b 100644 --- a/engine/controllers/server.h +++ b/engine/controllers/server.h @@ -49,6 +49,8 @@ class server : public drogon::HttpController, ADD_METHOD_TO(server::Inference, "/v1/inference", Options, Post); ADD_METHOD_TO(server::RouteRequest, "/v1/route/request", Options, Post); + ADD_METHOD_TO(server::Python, "/v1/python/{1}/.*", Options, Get, Post); + METHOD_LIST_END void ChatCompletion( @@ -76,6 +78,10 @@ class server : public drogon::HttpController, std::function&& callback); void RouteRequest(const HttpRequestPtr& req, std::function&& callback); + void Python( + const HttpRequestPtr& req, + std::function&& callback, + const std::string& model); private: void ProcessStreamRes(std::function cb, diff --git a/engine/cortex-common/python_enginei.h b/engine/cortex-common/python_enginei.h index 481b6b146..b0a02d8cc 100644 --- a/engine/cortex-common/python_enginei.h +++ b/engine/cortex-common/python_enginei.h @@ -4,6 +4,7 @@ #include #include "json/value.h" +#include "utils/result.hpp" class PythonEngineI { public: @@ -25,9 +26,5 @@ class PythonEngineI { std::shared_ptr jsonBody, std::function&& callback) = 0; - virtual void HandleRequest( - const std::string& model, - const std::vector& path_parts, - std::shared_ptr json_body, - std::function&& callback) = 0; + virtual cpp::result GetPort(const std::string& model) = 0; }; diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 113b68bf7..396483013 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -302,14 +302,29 @@ void PythonEngine::GetModels( callback(std::move(status), std::move(res)); } -void PythonEngine::HandleRequest( - const std::string& model, - const std::vector& path_parts, - std::shared_ptr json_body, - std::function&& callback) { +cpp::result PythonEngine::GetPort(const std::string& model) { + int port; + + // check if model has started + { + std::shared_lock read_lock(mutex); + if (model_process_map.find(model) == model_process_map.end()) { + return cpp::fail("Model " + model + " has not been loaded yet."); + } + port = model_process_map[model].port; + } + + // check if subprocess is still alive + { + std::unique_lock write_lock(mutex); + if (!model_process_map[model].IsAlive()) { + // NOTE: do we need to do any other cleanup for subprocesses? 
+ model_process_map.erase(model); + return cpp::fail("Model " + model + " stopped running."); + } + } - assert(false && "Not implemented"); - // get port + return port; } } // namespace python_engine diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 71fd170e7..a79b3cedc 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -71,10 +71,6 @@ class PythonEngine : public PythonEngineI { std::shared_ptr jsonBody, std::function&& callback) override; - void HandleRequest( - const std::string& model, - const std::vector& path_parts, - std::shared_ptr json_body, - std::function&& callback) override; + cpp::result GetPort(const std::string& model) override; }; } // namespace python_engine diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index b33374ad3..431df8941 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -199,28 +199,13 @@ cpp::result InferenceService::HandleRouteRequest( return {}; } -InferResult InferenceService::HandlePython( - const std::string& model, const std::vector& path_parts, - std::shared_ptr json_body) { - - Json::Value stt, res; - +cpp::result InferenceService::GetPythonPort(const std::string& model) { auto engine_result = engine_service_->GetLoadedEngine(kPythonEngine); if (engine_result.has_error()) { - res["message"] = "Python engine is not loaded yet"; - stt["status_code"] = drogon::k400BadRequest; - LOG_WARN << "Python engine is not loaded yet"; - return std::make_pair(stt, res); + return cpp::fail("Python engine is not loaded yet"); } - auto cb = [&stt, &res](Json::Value s, Json::Value r) { - stt = s; - res = r; - }; - std::get(engine_result.value()) - ->HandleRequest(model, path_parts, json_body, cb); - - return std::make_pair(stt, res); + return std::get(engine_result.value())->GetPort(model); } InferResult InferenceService::LoadModel( diff --git a/engine/services/inference_service.h b/engine/services/inference_service.h index 874ce8c85..e71fbc7e7 100644 --- a/engine/services/inference_service.h +++ b/engine/services/inference_service.h @@ -48,9 +48,7 @@ class InferenceService { cpp::result HandleRouteRequest( std::shared_ptr q, std::shared_ptr json_body); - InferResult HandlePython( - const std::string& model, const std::vector& path_parts, - std::shared_ptr json_body); + cpp::result GetPythonPort(const std::string& model); InferResult LoadModel(std::shared_ptr json_body); From 607d2cbb8d873a19e22a4eb8ed447ece6a90c69d Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 17:58:40 +0800 Subject: [PATCH 15/73] call PythonEngine destructor --- engine/extensions/python-engine/python_engine.cc | 8 ++++++-- engine/extensions/python-engine/python_engine.h | 4 ---- engine/services/engine_service.cc | 2 ++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 396483013..65b4d53c1 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -76,10 +76,14 @@ bool PythonEngine::PythonSubprocess::Kill() { return cortex::process::KillProcess(pid); } -PythonEngine::PythonEngine() : q_(4 /*n_parallel*/, "python_engine") {} +PythonEngine::PythonEngine() {} PythonEngine::~PythonEngine() { - curl_global_cleanup(); + // NOTE: what happens if we can't kill subprocess? 
+ std::unique_lock write_lock(mutex); + for (auto& [model_name, py_proc] : model_process_map) { + if (py_proc.IsAlive()) py_proc.Kill(); + } } static std::pair CreateResponse( diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index a79b3cedc..0da0c8412 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -39,9 +39,6 @@ bool IsUvInstalled(); class PythonEngine : public PythonEngineI { private: - // extensions::TemplateRenderer renderer_; - // std::unique_ptr async_file_logger_; - struct PythonSubprocess { pid_t pid; int port; @@ -52,7 +49,6 @@ class PythonEngine : public PythonEngineI { mutable std::shared_mutex mutex; std::unordered_map model_process_map; - trantor::ConcurrentTaskQueue q_; public: PythonEngine(); diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 247e195be..9666c93ad 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -884,6 +884,8 @@ cpp::result EngineService::UnloadEngine( auto unload_opts = EngineI::EngineUnloadOption{}; e->Unload(unload_opts); delete e; + } else if (std::holds_alternative(engines_[ne].engine)) { + delete std::get(engines_[ne].engine); } else { delete std::get(engines_[ne].engine); } From f58b77327e91d39b2b93186ec046b5fee7316c05 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 18:06:57 +0800 Subject: [PATCH 16/73] remove unused method --- engine/services/engine_service.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/engine/services/engine_service.h b/engine/services/engine_service.h index a8d5415a0..18631c279 100644 --- a/engine/services/engine_service.h +++ b/engine/services/engine_service.h @@ -168,9 +168,6 @@ class EngineService : public EngineServiceI { const std::string& version = "latest", const std::optional variant_name = std::nullopt); - cpp::result DownloadPythonUv( - const std::string& version = "latest"); - cpp::result DownloadCuda(const std::string& engine, bool async = false); From bf23c9f01070ce0dbe1c7ccecacf6b9a7be8db33 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Feb 2025 19:44:14 +0800 Subject: [PATCH 17/73] remove unnecessary headers --- .../extensions/python-engine/python_engine.cc | 8 +++---- .../extensions/python-engine/python_engine.h | 22 ------------------- 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 65b4d53c1..f61414bca 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -1,8 +1,9 @@ #include "python_engine.h" #include -#include -#include -#include + +#include "config/model_config.h" +#include "utils/file_manager_utils.h" +#include "utils/process/utils.h" namespace python_engine { namespace { @@ -10,7 +11,6 @@ constexpr const int k200OK = 200; constexpr const int k400BadRequest = 400; constexpr const int k409Conflict = 409; constexpr const int k500InternalServerError = 500; -constexpr const int kFileLoggerOption = 0; } // namespace cpp::result DownloadUv(std::shared_ptr& download_service) { diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 0da0c8412..988ccf9a1 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -1,36 +1,14 @@ #pragma once -#include #include -#include -#include 
#include #include #include -#include "config/model_config.h" -#include "trantor/utils/ConcurrentTaskQueue.h" #include "cortex-common/python_enginei.h" -#include "extensions/template_renderer.h" -#include "utils/file_logger.h" -#include "utils/file_manager_utils.h" -#include "utils/process_status_utils.h" -#include "utils/curl_utils.h" -#include "utils/process/utils.h" #include "services/download_service.h" -// Helper for CURL response namespace python_engine { -struct StreamContext { - std::shared_ptr> callback; - std::string buffer; -}; - -struct CurlResponse { - std::string body; - bool error{false}; - std::string error_message; -}; // UV-related functions cpp::result DownloadUv(std::shared_ptr& download_service); From 8ebee7cf9bbeeda224a0fdd50eb4946a38f252b0 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Feb 2025 08:31:08 +0800 Subject: [PATCH 18/73] remove unused stuff --- engine/cortex-common/python_enginei.h | 3 --- engine/services/inference_service.cc | 2 +- engine/services/model_service.cc | 18 +++++++----------- engine/utils/config_yaml_utils.h | 3 +-- engine/utils/engine_constants.h | 3 --- 5 files changed, 9 insertions(+), 20 deletions(-) diff --git a/engine/cortex-common/python_enginei.h b/engine/cortex-common/python_enginei.h index b0a02d8cc..ffde3d41b 100644 --- a/engine/cortex-common/python_enginei.h +++ b/engine/cortex-common/python_enginei.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include "json/value.h" #include "utils/result.hpp" @@ -10,8 +9,6 @@ class PythonEngineI { public: virtual ~PythonEngineI() {} - // virtual bool IsSupported(const std::string& f) = 0; - // model management virtual void LoadModel( std::shared_ptr json_body, diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index 431df8941..2b241692a 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -375,7 +375,7 @@ InferResult InferenceService::GetModels( InferResult InferenceService::FineTuning( std::shared_ptr json_body) { - std::string ne = kPythonRuntimeRepo; + std::string ne = kPythonEngine; Json::Value r; Json::Value stt; diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 9d02038b4..f0c6e760b 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -507,19 +507,15 @@ ModelService::DownloadModelFromCortexsoAsync( yaml_handler.ModelConfigFromFile(model_yml_item->localPath.string()); auto mc = yaml_handler.GetModelConfig(); - if (mc.engine == kPythonEngine) { // process for Python engine + mc.model = unique_model_id; - } else { - mc.model = unique_model_id; - - uint64_t model_size = 0; - for (const auto& item : finishedTask.items) { - model_size = model_size + item.bytes.value_or(0); - } - mc.size = model_size; - yaml_handler.UpdateModelConfig(mc); - yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); + uint64_t model_size = 0; + for (const auto& item : finishedTask.items) { + model_size = model_size + item.bytes.value_or(0); } + mc.size = model_size; + yaml_handler.UpdateModelConfig(mc); + yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); auto rel = file_manager_utils::ToRelativeCortexDataPath(model_yml_item->localPath); diff --git a/engine/utils/config_yaml_utils.h b/engine/utils/config_yaml_utils.h index 1749cd2d0..bb0f8f2d0 100644 --- a/engine/utils/config_yaml_utils.h +++ b/engine/utils/config_yaml_utils.h @@ -24,8 +24,7 @@ constexpr const auto kDefaultCorsEnabled = true; const std::vector kDefaultEnabledOrigins{ 
"http://localhost:39281", "http://127.0.0.1:39281", "http://0.0.0.0:39281"}; constexpr const auto kDefaultNoProxy = "example.com,::1,localhost,127.0.0.1"; -const std::vector kDefaultSupportedEngines{kLlamaEngine, - kPythonEngine}; +const std::vector kDefaultSupportedEngines{kLlamaEngine}; struct CortexConfig { std::string logFolderPath; diff --git a/engine/utils/engine_constants.h b/engine/utils/engine_constants.h index 3cad230bc..10d19b160 100644 --- a/engine/utils/engine_constants.h +++ b/engine/utils/engine_constants.h @@ -12,10 +12,7 @@ constexpr const auto kLocal = "local"; constexpr const auto kLlamaRepo = "cortex.llamacpp"; -constexpr const auto kPythonRuntimeRepo = "cortex.python"; - constexpr const auto kLlamaLibPath = "./engines/cortex.llamacpp"; -constexpr const auto kPythonRuntimeLibPath = "/engines/cortex.python"; // other constants constexpr auto static kHuggingFaceHost = "huggingface.co"; From 8f36adcddb6ccf18216ddd1b015861854a14cabc Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Feb 2025 09:16:05 +0800 Subject: [PATCH 19/73] download uv directly from github release --- .../extensions/python-engine/python_engine.cc | 71 ++++++++++--------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index f61414bca..5805931bf 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -4,6 +4,9 @@ #include "config/model_config.h" #include "utils/file_manager_utils.h" #include "utils/process/utils.h" +#include "utils/system_info_utils.h" +#include "utils/archive_utils.h" +#include "utils/set_permission_utils.h" namespace python_engine { namespace { @@ -14,46 +17,47 @@ constexpr const int k500InternalServerError = 500; } // namespace cpp::result DownloadUv(std::shared_ptr& download_service) { - const std::string py_bin_path = file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; + const auto py_bin_path = file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; std::filesystem::create_directories(py_bin_path); - const std::string uv_version = "0.5.31"; + // NOTE: do we need a mechanism to update uv, or just pin uv version with cortex release? 
+ const std::string uv_version = "0.6.2"; + + // build download url based on system info + std::stringstream fname_stream; + fname_stream << "uv-"; + + auto system_info = system_info_utils::GetSystemInfo(); + if (system_info->arch == "amd64") fname_stream << "x86_64"; + else if (system_info->arch == "arm64") fname_stream << "aarch64"; + + // NOTE: there is also a musl linux version + if (system_info->os == kMacOs) fname_stream << "-apple-darwin.tar.gz"; + else if (system_info->os == kWindowsOs) fname_stream << "-pc-windows-msvc.zip"; + else if (system_info->os == kLinuxOs) fname_stream << "-unknown-linux-gnu.tar.gz"; + + const std::string fname = fname_stream.str(); + const std::string base_url = "https://github.com/astral-sh/uv/releases/download/"; + const std::string url = (std::stringstream{} << base_url << uv_version << "/" << fname).str(); + CTL_INF("Download uv from " << url); - // NOTE: only works on MacOS and Linux auto on_finished = [py_bin_path, uv_version](const DownloadTask& finishedTask) { // try to unzip the downloaded file - const std::string installer_path = finishedTask.items[0].localPath.string(); - CTL_INF("UV install script path: " << installer_path); - CTL_INF("Version: " << uv_version); - - // https://docs.astral.sh/uv/configuration/installer/ - // TODO: move env var mod logic to SpawnProcess() - // using env to set env vars - // should we download from here instead? https://github.com/astral-sh/uv/releases - std::vector command{"env", - "UV_UNMANAGED_INSTALL=" + py_bin_path, - "sh", - installer_path, - "-q"}; - const auto pid = cortex::process::SpawnProcess(command); - if (pid == -1) { - CTL_ERR("Failed to install uv"); - } - // wait for subprocess to finish - // TODO: need to check return status if successful - waitpid(pid, NULL, 0); - std::filesystem::remove(installer_path); + const std::string download_path = finishedTask.items[0].localPath.string(); + + archive_utils::ExtractArchive(download_path, py_bin_path, true); + set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); + std::filesystem::remove(download_path); }; - const std::string url = "https://astral.sh/uv/" + uv_version + "/install.sh"; - auto downloadTask = - DownloadTask{.id = "uv", - .type = DownloadType::Engine, - .items = {DownloadItem{ - .id = "uv", - .downloadUrl = url, - .localPath = py_bin_path + "/install.sh", - }}}; + auto downloadTask = DownloadTask{.id = "uv", + .type = DownloadType::Engine, + .items = { + DownloadItem{ + .id = "uv", + .downloadUrl = url, + .localPath = py_bin_path / fname, + }}}; auto add_task_result = download_service->AddTask(downloadTask, on_finished); if (add_task_result.has_error()) { @@ -63,6 +67,7 @@ cpp::result DownloadUv(std::shared_ptr& down } std::string GetUvPath() { + // NOTE: do I need to add .exe for windows? 
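// On Windows the extracted binary is named uv.exe, so the path returned below needs the .exe suffix on that platform.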
return file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; } bool IsUvInstalled() { From 5ebfbb73e85fb41d10e98b4baa3557ba0a82ecf0 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Feb 2025 17:42:34 +0800 Subject: [PATCH 20/73] check for entrypoint --- engine/extensions/python-engine/python_engine.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 5805931bf..dc4b52ef8 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -145,6 +145,10 @@ void PythonEngine::LoadModel( config::PythonModelConfig py_cfg; py_cfg.ReadFromYaml(model_dir / "model.yml"); + if (py_cfg.entrypoint == "") { + throw std::runtime_error("Missing entrypoint in model.yml"); + } + // NOTE: model_dir / entrypoint assumes a Python script // TODO: figure out if we can support arbitrary CLI (but still launch by uv) std::vector command{GetUvPath(), "run", model_dir / py_cfg.entrypoint}; @@ -158,7 +162,8 @@ void PythonEngine::LoadModel( if (!std::filesystem::exists(stdout_path)) std::ofstream(stdout_path).flush(); if (!std::filesystem::exists(stderr_path)) std::ofstream(stderr_path).flush(); - // TODO: what happens if the process starts, but exits? + // NOTE: process may start, but exits/crashes later + // TODO: wait for a few seconds, then check if process is alive pid = cortex::process::SpawnProcess(command, stdout_path, stderr_path); if (pid == -1) { throw std::runtime_error("Fail to spawn process with pid -1"); From 5d310d121b7ecdfdf49debfd3d13255e2894db39 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Feb 2025 19:42:18 +0800 Subject: [PATCH 21/73] only record model size for llama.cpp --- engine/services/model_service.cc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index f0c6e760b..142933ff6 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -507,15 +507,17 @@ ModelService::DownloadModelFromCortexsoAsync( yaml_handler.ModelConfigFromFile(model_yml_item->localPath.string()); auto mc = yaml_handler.GetModelConfig(); - mc.model = unique_model_id; + if (mc.engine == kLlamaEngine) { + mc.model = unique_model_id; - uint64_t model_size = 0; - for (const auto& item : finishedTask.items) { - model_size = model_size + item.bytes.value_or(0); + uint64_t model_size = 0; + for (const auto& item : finishedTask.items) { + model_size = model_size + item.bytes.value_or(0); + } + mc.size = model_size; + yaml_handler.UpdateModelConfig(mc); + yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); } - mc.size = model_size; - yaml_handler.UpdateModelConfig(mc); - yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); auto rel = file_manager_utils::ToRelativeCortexDataPath(model_yml_item->localPath); From c4c622cb0f70c0bc86f86fbd938fcf2dabb7e4f0 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Feb 2025 20:17:08 +0800 Subject: [PATCH 22/73] don't include headers --- engine/controllers/server.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/engine/controllers/server.cc b/engine/controllers/server.cc index 374ca40b8..0b01d06e1 100644 --- a/engine/controllers/server.cc +++ b/engine/controllers/server.cc @@ -249,9 +249,10 @@ void server::Python( new_req->setBody(std::string{req->body()}); 
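One likely reason the header copy below was disabled: headers such as Host and Content-Length from the original request no longer match the proxied one, and strict upstream servers can reject it. A middle ground, sketched here and not part of the patch, is to forward only a small allow-list (this assumes <unordered_set> is available and lower-case header keys; adjust to how the framework actually stores them):

    // sketch only: forward an allow-list of headers instead of all or none
    static const std::unordered_set<std::string> kForwardHeaders{"authorization", "accept"};
    for (const auto& [field, value] : req->headers()) {
      if (kForwardHeaders.count(field) > 0)
        new_req->addHeader(field, value);
    }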
new_req->setContentTypeCode(req->getContentType()); - for (const auto& [field, value] : req->headers()) { - new_req->addHeader(field, value); - } + // including headers may make FastAPI reqject the request... + // for (const auto& [field, value] : req->headers()) { + // new_req->addHeader(field, value); + // } CTL_INF("Route request to " << host << path); auto cb = [callback](ReqResult result, const HttpResponsePtr& response) { From 6b59878399e6566741a8af107c02e931c4b2ec5c Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 09:23:36 +0800 Subject: [PATCH 23/73] don't use std::optional to support < c++17 --- engine/utils/process/utils.cc | 20 +++++++++----------- engine/utils/process/utils.h | 4 ++-- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 624b62262..da61661a6 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -40,8 +40,8 @@ std::vector ConvertToArgv(const std::vector& args) { } pid_t SpawnProcess(const std::vector& command, - const std::optional stdout_file, - const std::optional stderr_file) { + const std::string stdout_file, + const std::string stderr_file) { try { #if defined(_WIN32) // Windows process creation @@ -90,25 +90,23 @@ pid_t SpawnProcess(const std::vector& command, // caller should make sure the redirect files exist. posix_spawn_file_actions_t *action_ptr = NULL; - if (stdout_file.has_value() || stderr_file.has_value()) { + if (!stdout_file.empty() || !stderr_file.empty()) { posix_spawn_file_actions_t action; posix_spawn_file_actions_init(&action); action_ptr = &action; - if (stdout_file.has_value()) { - std::string stdout_file_val = stdout_file.value(); - if (std::filesystem::exists(stdout_file_val)) { + if (!stdout_file.empty()) { + if (std::filesystem::exists(stdout_file)) { posix_spawn_file_actions_addopen(&action, STDOUT_FILENO, - stdout_file_val.data(), + stdout_file.data(), O_WRONLY | O_APPEND, 0); } } - if (stderr_file.has_value()) { - std::string stderr_file_val = stderr_file.value(); - if (std::filesystem::exists(stderr_file_val)) { + if (!stderr_file.empty()) { + if (std::filesystem::exists(stderr_file)) { posix_spawn_file_actions_addopen(&action, STDERR_FILENO, - stderr_file_val.data(), + stderr_file.data(), O_WRONLY | O_APPEND, 0); } } diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index 813d53750..d59e50103 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -21,8 +21,8 @@ std::string ConstructWindowsCommandLine(const std::vector& args); std::vector ConvertToArgv(const std::vector& args); pid_t SpawnProcess(const std::vector& command, - const std::optional stdout_file = {}, - const std::optional stderr_file = {}); + const std::string stdout_file = "", + const std::string stderr_file = ""); bool IsProcessAlive(pid_t pid); bool KillProcess(pid_t pid); From 250a2ac8682080dce9e57bf425b99a6f8baadc5b Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 09:24:55 +0800 Subject: [PATCH 24/73] fix stringstream usage --- engine/extensions/python-engine/python_engine.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index dc4b52ef8..b58a64eff 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -38,7 +38,10 @@ cpp::result DownloadUv(std::shared_ptr& down const std::string fname = 
fname_stream.str(); const std::string base_url = "https://github.com/astral-sh/uv/releases/download/"; - const std::string url = (std::stringstream{} << base_url << uv_version << "/" << fname).str(); + + std::stringstream url_stream; + url_stream << base_url << uv_version << "/" << fname; + const std::string url = url_stream.str(); CTL_INF("Download uv from " << url); auto on_finished = [py_bin_path, uv_version](const DownloadTask& finishedTask) { From bb38a563c64491f480f8b09bc5fa812c4f1cb35f Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 09:43:39 +0800 Subject: [PATCH 25/73] define pid_t for windows --- engine/extensions/python-engine/python_engine.cc | 1 - engine/extensions/python-engine/python_engine.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index b58a64eff..c380a45be 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -3,7 +3,6 @@ #include "config/model_config.h" #include "utils/file_manager_utils.h" -#include "utils/process/utils.h" #include "utils/system_info_utils.h" #include "utils/archive_utils.h" #include "utils/set_permission_utils.h" diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 988ccf9a1..c1b5ec0dd 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -7,6 +7,7 @@ #include "cortex-common/python_enginei.h" #include "services/download_service.h" +#include "utils/process/utils.h" namespace python_engine { From 723c5db5646a23d2fc11d7f40e7a09ce41554ab7 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 10:09:04 +0800 Subject: [PATCH 26/73] explicit call .string() on filesystem::path to support windows --- engine/extensions/python-engine/python_engine.cc | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index c380a45be..a414f1cd5 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -47,7 +47,7 @@ cpp::result DownloadUv(std::shared_ptr& down // try to unzip the downloaded file const std::string download_path = finishedTask.items[0].localPath.string(); - archive_utils::ExtractArchive(download_path, py_bin_path, true); + archive_utils::ExtractArchive(download_path, py_bin_path.string(), true); set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); std::filesystem::remove(download_path); }; @@ -70,7 +70,9 @@ cpp::result DownloadUv(std::shared_ptr& down std::string GetUvPath() { // NOTE: do I need to add .exe for windows? 
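// Note on the .string() calls added below: on Windows std::filesystem::path stores wide characters, so it does not convert implicitly to std::string; the explicit conversions are what keep the Windows build working.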
- return file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; + const auto path = file_manager_utils::GetCortexDataPath() + / "python_engine" / "bin" / "uv"; + return path.string(); } bool IsUvInstalled() { return std::filesystem::exists(GetUvPath()); @@ -145,7 +147,7 @@ void PythonEngine::LoadModel( pid_t pid; try { config::PythonModelConfig py_cfg; - py_cfg.ReadFromYaml(model_dir / "model.yml"); + py_cfg.ReadFromYaml((model_dir / "model.yml").string()); if (py_cfg.entrypoint == "") { throw std::runtime_error("Missing entrypoint in model.yml"); @@ -153,12 +155,13 @@ void PythonEngine::LoadModel( // NOTE: model_dir / entrypoint assumes a Python script // TODO: figure out if we can support arbitrary CLI (but still launch by uv) - std::vector command{GetUvPath(), "run", model_dir / py_cfg.entrypoint}; + const std::string entrypoint = (model_dir / py_cfg.entrypoint).string(); + std::vector command{GetUvPath(), "run", entrypoint}; for (const auto& item : py_cfg.extra_args) command.push_back(item); - const std::string stdout_path = model_dir / "stdout.txt"; - const std::string stderr_path = model_dir / "stderr.txt"; + const std::string stdout_path = (model_dir / "stdout.txt").string(); + const std::string stderr_path = (model_dir / "stderr.txt").string(); // create empty stdout.txt and stderr.txt for redirection if (!std::filesystem::exists(stdout_path)) std::ofstream(stdout_path).flush(); From 26ec20a29182eaf482fe42df9216b3d3a7009270 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 10:45:40 +0800 Subject: [PATCH 27/73] include extra_args in entrypoint --- engine/config/model_config.h | 62 +++++++------------ .../extensions/python-engine/python_engine.cc | 13 ++-- 2 files changed, 31 insertions(+), 44 deletions(-) diff --git a/engine/config/model_config.h b/engine/config/model_config.h index 85335c37b..c7abf75f1 100644 --- a/engine/config/model_config.h +++ b/engine/config/model_config.h @@ -477,40 +477,33 @@ struct Endpoint { }; struct PythonModelConfig { - // General Metadata std::string name; int version; - - // Model Load Parameters std::string engine; - std::string entrypoint; + + std::vector entrypoint; int port; - std::vector extra_args; // Method to convert C++ struct to YAML void ToYaml(const std::string& filepath) const { YAML::Emitter out; out << YAML::BeginMap; - // General Metadata out << YAML::Key << "name" << YAML::Value << name; out << YAML::Key << "version" << YAML::Value << version; - - // Model Load Parameters out << YAML::Key << "engine" << YAML::Value << engine; - out << YAML::Key << "entrypoint" << YAML::Value << entrypoint; - out << YAML::Key << "port" << YAML::Value << port; - // Extra Arguments - if (!extra_args.empty()) { - out << YAML::Key << "extra_args" << YAML::Value << YAML::BeginSeq; - for (const auto& arg : extra_args) { + // entrypoint + if (!entrypoint.empty()) { + out << YAML::Key << "entrypoint" << YAML::Value << YAML::BeginSeq; + for (const auto& arg : entrypoint) { out << arg; } out << YAML::EndSeq; } out << YAML::EndMap; + out << YAML::Key << "port" << YAML::Value << port; // Write to file std::ofstream fout(filepath); @@ -525,22 +518,18 @@ struct PythonModelConfig { try { YAML::Node config = YAML::LoadFile(filePath); - // General Metadata if (config["name"]) name = config["name"].as(); if (config["version"]) version = config["version"].as(); - - // Model Load Parameters if (config["engine"]) engine = config["engine"].as(); - if (config["entrypoint"]) entrypoint = config["entrypoint"].as(); - if (config["port"]) port = 
config["port"].as(); - // Extra Arguments - if (config["extra_args"] && config["extra_args"].IsSequence()) { - extra_args.clear(); - for (const auto& arg : config["extra_args"]) { - extra_args.push_back(arg.as()); + // entrypoint + if (config["entrypoint"] && config["entrypoint"].IsSequence()) { + entrypoint.clear(); + for (const auto& arg : config["entrypoint"]) { + entrypoint.push_back(arg.as()); } } + if (config["port"]) port = config["port"].as(); } catch (const YAML::Exception& e) { throw std::runtime_error("Error parsing YAML file: " + std::string(e.what())); @@ -554,21 +543,19 @@ struct PythonModelConfig { Json::Value ToJson() const { Json::Value json; - // Add basic string fields json["name"] = name; json["version"] = version; json["engine"] = engine; - json["entrypoint"] = entrypoint; - json["port"] = port; - // Add extra_args array - if (!extra_args.empty()) { + // entrypoint + if (!entrypoint.empty()) { Json::Value args(Json::arrayValue); - for (const auto& arg : extra_args) { + for (const auto& arg : entrypoint) { args.append(arg); } - json["extra_args"] = args; + json["entrypoint"] = args; } + json["port"] = port; return json; } @@ -579,21 +566,20 @@ struct PythonModelConfig { throw std::runtime_error("Input JSON must be an object"); } try { - // Basic fields name = root.get("name", name).asString(); version = root.get("version", version).asInt(); engine = root.get("engine", engine).asString(); - entrypoint = root.get("entrypoint", entrypoint).asString(); - port = root.get("port", port).asInt(); - // Extra args array - extra_args.clear(); - const Json::Value& args = root["extra_args"]; + // entrypoint + entrypoint.clear(); + const Json::Value& args = root["entrypoint"]; if (args.isArray()) { for (const auto& arg : args) { - extra_args.push_back(arg.asString()); + entrypoint.push_back(arg.asString()); } } + port = root.get("port", port).asInt(); + } catch (const Json::Exception& e) { throw std::runtime_error("Error parsing JSON: " + std::string(e.what())); } catch (const std::exception& e) { diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index a414f1cd5..ffe873b71 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -149,15 +149,16 @@ void PythonEngine::LoadModel( config::PythonModelConfig py_cfg; py_cfg.ReadFromYaml((model_dir / "model.yml").string()); - if (py_cfg.entrypoint == "") { + if (py_cfg.entrypoint.empty()) { throw std::runtime_error("Missing entrypoint in model.yml"); } - // NOTE: model_dir / entrypoint assumes a Python script - // TODO: figure out if we can support arbitrary CLI (but still launch by uv) - const std::string entrypoint = (model_dir / py_cfg.entrypoint).string(); - std::vector command{GetUvPath(), "run", entrypoint}; - for (const auto& item : py_cfg.extra_args) + // https://docs.astral.sh/uv/reference/cli/#uv-run + std::vector command{GetUvPath(), + "run", + "--directory", + model_dir.string()}; + for (const auto& item : py_cfg.entrypoint) command.push_back(item); const std::string stdout_path = (model_dir / "stdout.txt").string(); From 376deeb760599150e415cc4026978b440419983e Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 11:47:35 +0800 Subject: [PATCH 28/73] add python engine install test --- engine/e2e-test/test_api_engine.py | 16 +++++++++++++--- .../e2e-test/test_api_engine_install_nightly.py | 4 ++++ engine/e2e-test/test_cli_engine_install.py | 11 +++++++++++ 
.../e2e-test/test_cli_engine_install_nightly.py | 11 +++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/engine/e2e-test/test_api_engine.py b/engine/e2e-test/test_api_engine.py index e652e4495..e94b85f51 100644 --- a/engine/e2e-test/test_api_engine.py +++ b/engine/e2e-test/test_api_engine.py @@ -20,12 +20,12 @@ def setup_and_teardown(self): # Teardown stop_server() - + # engines get def test_engines_get_llamacpp_should_be_successful(self): response = requests.get("http://localhost:3928/engines/llama-cpp") assert response.status_code == 200 - + # engines install def test_engines_install_llamacpp_specific_version_and_variant(self): data = {"version": "v0.1.40-b4354", "variant": "linux-amd64-avx-cuda-11-7"} @@ -40,7 +40,7 @@ def test_engines_install_llamacpp_specific_version_and_null_variant(self): "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) assert response.status_code == 200 - + # engines uninstall @pytest.mark.asyncio async def test_engines_install_uninstall_llamacpp_should_be_successful(self): @@ -52,6 +52,16 @@ async def test_engines_install_uninstall_llamacpp_should_be_successful(self): response = requests.delete("http://localhost:3928/v1/engines/llama-cpp/install") assert response.status_code == 200 + @pytest.mark.asyncio + async def test_engines_install_uninstall_python_should_be_successful(self): + response = requests.post("http://localhost:3928/v1/engines/python/install") + assert response.status_code == 200 + await wait_for_websocket_download_success_event(timeout=None) + time.sleep(30) + + response = requests.delete("http://localhost:3928/v1/engines/python/install") + assert response.status_code == 200 + @pytest.mark.asyncio async def test_engines_install_uninstall_llamacpp_with_only_version_should_be_failed(self): # install first diff --git a/engine/e2e-test/test_api_engine_install_nightly.py b/engine/e2e-test/test_api_engine_install_nightly.py index de4914c28..4f13d95c8 100644 --- a/engine/e2e-test/test_api_engine_install_nightly.py +++ b/engine/e2e-test/test_api_engine_install_nightly.py @@ -22,6 +22,10 @@ def test_engines_install_llamacpp_should_be_successful(self): response = requests.post("http://localhost:3928/v1/engines/llama-cpp/install") assert response.status_code == 200 + def test_engines_install_python_should_be_successful(self): + response = requests.post("http://localhost:3928/v1/engines/python/install") + assert response.status_code == 200 + def test_engines_install_llamacpp_specific_version_and_variant(self): data = {"version": latest_pre_release_tag, "variant": "linux-amd64-avx-cuda-11-7"} response = requests.post( diff --git a/engine/e2e-test/test_cli_engine_install.py b/engine/e2e-test/test_cli_engine_install.py index aeeabd64d..2a23c8866 100644 --- a/engine/e2e-test/test_cli_engine_install.py +++ b/engine/e2e-test/test_cli_engine_install.py @@ -31,6 +31,17 @@ def test_engines_install_llamacpp_should_be_successfully(self): assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" + def test_engines_install_python_should_be_successfully(self): + exit_code, output, error = run( + "Install Engine", + ["engines", "install", "python"], + timeout=None, + capture=False, + ) + response = requests.get("http://127.0.0.1:3928/v1/engines/python") + assert len(response.json()) > 0 + assert exit_code == 0, f"Install engine failed with error: {error}" + @pytest.mark.skipif(reason="Ignore onnx-runtime test") def test_engines_install_onnx_on_macos_should_be_failed(self): exit_code, output, 
error = run( diff --git a/engine/e2e-test/test_cli_engine_install_nightly.py b/engine/e2e-test/test_cli_engine_install_nightly.py index 80490ab55..09af3ab2f 100644 --- a/engine/e2e-test/test_cli_engine_install_nightly.py +++ b/engine/e2e-test/test_cli_engine_install_nightly.py @@ -31,6 +31,17 @@ def test_engines_install_llamacpp_should_be_successfully(self): assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" + def test_engines_install_python_should_be_successfully(self): + exit_code, output, error = run( + "Install Engine", + ["engines", "install", "python"], + timeout=None, + capture=False, + ) + response = requests.get("http://127.0.0.1:3928/v1/engines/python") + assert len(response.json()) > 0 + assert exit_code == 0, f"Install engine failed with error: {error}" + @pytest.mark.skipif(reason="Ignore onnx-runtime test") def test_engines_install_onnx_on_macos_should_be_failed(self): exit_code, output, error = run( From a9ed820cdf01dda8746767196374d61bfe991b9c Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 12:00:52 +0800 Subject: [PATCH 29/73] add start time --- engine/extensions/python-engine/python_engine.cc | 8 +++++++- engine/extensions/python-engine/python_engine.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index ffe873b71..eb09eec98 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -174,8 +174,10 @@ void PythonEngine::LoadModel( if (pid == -1) { throw std::runtime_error("Fail to spawn process with pid -1"); } + const uint64_t start_time = std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); std::unique_lock write_lock(mutex); - model_process_map[model] = {pid, py_cfg.port}; + model_process_map[model] = {pid, py_cfg.port, start_time}; } catch (const std::exception& e) { auto e_msg = e.what(); @@ -305,8 +307,12 @@ void PythonEngine::GetModels( Json::Value val; val["id"] = model_name; val["engine"] = kPythonEngine; + val["start_time"] = py_proc.start_time; val["port"] = py_proc.port; val["object"] = "model"; + // TODO + // val["ram"]; + // val["vram"]; model_list.append(val); } } diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index c1b5ec0dd..6189da05f 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -21,6 +21,7 @@ class PythonEngine : public PythonEngineI { struct PythonSubprocess { pid_t pid; int port; + uint64_t start_time; bool IsAlive(); bool Kill(); From db8213438ae04c2a79a87943ff3a09ef693ddbf8 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Feb 2025 16:26:24 +0800 Subject: [PATCH 30/73] add back python engine to default supported engine so that cortex engines install work --- engine/utils/config_yaml_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/utils/config_yaml_utils.h b/engine/utils/config_yaml_utils.h index bb0f8f2d0..502fdc166 100644 --- a/engine/utils/config_yaml_utils.h +++ b/engine/utils/config_yaml_utils.h @@ -24,7 +24,7 @@ constexpr const auto kDefaultCorsEnabled = true; const std::vector kDefaultEnabledOrigins{ "http://localhost:39281", "http://127.0.0.1:39281", "http://0.0.0.0:39281"}; constexpr const auto kDefaultNoProxy = "example.com,::1,localhost,127.0.0.1"; -const std::vector 
kDefaultSupportedEngines{kLlamaEngine}; +const std::vector kDefaultSupportedEngines{kLlamaEngine, kPythonEngine}; struct CortexConfig { std::string logFolderPath; From 79464a230e25c40bad726846ed50a16e56bf8f2b Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 25 Feb 2025 12:18:50 +0800 Subject: [PATCH 31/73] format --- engine/config/model_config.h | 37 ++++---- engine/controllers/server.cc | 11 +-- engine/controllers/server.h | 7 +- engine/cortex-common/python_enginei.h | 16 ++-- .../extensions/python-engine/python_engine.cc | 94 ++++++++++--------- .../extensions/python-engine/python_engine.h | 19 ++-- engine/services/engine_service.cc | 7 +- engine/services/engine_service.h | 1 - engine/services/inference_service.cc | 37 ++++---- engine/services/model_service.cc | 6 +- engine/utils/process/utils.cc | 6 +- engine/utils/process/utils.h | 4 +- 12 files changed, 128 insertions(+), 117 deletions(-) diff --git a/engine/config/model_config.h b/engine/config/model_config.h index c7abf75f1..80a5f5df8 100644 --- a/engine/config/model_config.h +++ b/engine/config/model_config.h @@ -518,9 +518,12 @@ struct PythonModelConfig { try { YAML::Node config = YAML::LoadFile(filePath); - if (config["name"]) name = config["name"].as(); - if (config["version"]) version = config["version"].as(); - if (config["engine"]) engine = config["engine"].as(); + if (config["name"]) + name = config["name"].as(); + if (config["version"]) + version = config["version"].as(); + if (config["engine"]) + engine = config["engine"].as(); // entrypoint if (config["entrypoint"] && config["entrypoint"].IsSequence()) { @@ -529,13 +532,14 @@ struct PythonModelConfig { entrypoint.push_back(arg.as()); } } - if (config["port"]) port = config["port"].as(); - } - catch (const YAML::Exception& e) { - throw std::runtime_error("Error parsing YAML file: " + std::string(e.what())); - } - catch (const std::exception& e) { - throw std::runtime_error("Error reading YAML file: " + std::string(e.what())); + if (config["port"]) + port = config["port"].as(); + } catch (const YAML::Exception& e) { + throw std::runtime_error("Error parsing YAML file: " + + std::string(e.what())); + } catch (const std::exception& e) { + throw std::runtime_error("Error reading YAML file: " + + std::string(e.what())); } } @@ -549,11 +553,11 @@ struct PythonModelConfig { // entrypoint if (!entrypoint.empty()) { - Json::Value args(Json::arrayValue); - for (const auto& arg : entrypoint) { - args.append(arg); - } - json["entrypoint"] = args; + Json::Value args(Json::arrayValue); + for (const auto& arg : entrypoint) { + args.append(arg); + } + json["entrypoint"] = args; } json["port"] = port; @@ -583,7 +587,8 @@ struct PythonModelConfig { } catch (const Json::Exception& e) { throw std::runtime_error("Error parsing JSON: " + std::string(e.what())); } catch (const std::exception& e) { - throw std::runtime_error("Error processing JSON data: " + std::string(e.what())); + throw std::runtime_error("Error processing JSON data: " + + std::string(e.what())); } } }; diff --git a/engine/controllers/server.cc b/engine/controllers/server.cc index 0b01d06e1..ebc8639de 100644 --- a/engine/controllers/server.cc +++ b/engine/controllers/server.cc @@ -210,17 +210,16 @@ void server::RouteRequest( } } -void server::Python( - const HttpRequestPtr& req, - std::function&& callback, - const std::string& model) { +void server::Python(const HttpRequestPtr& req, + std::function&& callback, + const std::string& model) { const std::string& full_path = req->getPath(); const std::string prefix = 
"/v1/python/"; if (full_path.substr(0, prefix.size()) != prefix) { auto resp = cortex_utils::CreateCortexHttpJsonResponse( - Json::Value("Invalid path: must start with " + prefix)); + Json::Value("Invalid path: must start with " + prefix)); resp->setStatusCode(k400BadRequest); callback(resp); return; @@ -232,7 +231,7 @@ void server::Python( auto port_result = inference_svc_->GetPythonPort(model); if (port_result.has_error()) { auto resp = cortex_utils::CreateCortexHttpJsonResponse( - Json::Value(port_result.error())); + Json::Value(port_result.error())); resp->setStatusCode(k400BadRequest); callback(resp); return; diff --git a/engine/controllers/server.h b/engine/controllers/server.h index 99b545d0b..e0e083213 100644 --- a/engine/controllers/server.h +++ b/engine/controllers/server.h @@ -78,10 +78,9 @@ class server : public drogon::HttpController, std::function&& callback); void RouteRequest(const HttpRequestPtr& req, std::function&& callback); - void Python( - const HttpRequestPtr& req, - std::function&& callback, - const std::string& model); + void Python(const HttpRequestPtr& req, + std::function&& callback, + const std::string& model); private: void ProcessStreamRes(std::function cb, diff --git a/engine/cortex-common/python_enginei.h b/engine/cortex-common/python_enginei.h index ffde3d41b..35470f008 100644 --- a/engine/cortex-common/python_enginei.h +++ b/engine/cortex-common/python_enginei.h @@ -11,17 +11,17 @@ class PythonEngineI { // model management virtual void LoadModel( - std::shared_ptr json_body, - std::function&& callback) = 0; + std::shared_ptr json_body, + std::function&& callback) = 0; virtual void UnloadModel( - std::shared_ptr json_body, - std::function&& callback) = 0; + std::shared_ptr json_body, + std::function&& callback) = 0; virtual void GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) = 0; + std::shared_ptr json_body, + std::function&& callback) = 0; virtual void GetModels( - std::shared_ptr jsonBody, - std::function&& callback) = 0; + std::shared_ptr jsonBody, + std::function&& callback) = 0; virtual cpp::result GetPort(const std::string& model) = 0; }; diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index eb09eec98..635e35a78 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -2,10 +2,10 @@ #include #include "config/model_config.h" -#include "utils/file_manager_utils.h" -#include "utils/system_info_utils.h" #include "utils/archive_utils.h" +#include "utils/file_manager_utils.h" #include "utils/set_permission_utils.h" +#include "utils/system_info_utils.h" namespace python_engine { namespace { @@ -15,8 +15,10 @@ constexpr const int k409Conflict = 409; constexpr const int k500InternalServerError = 500; } // namespace -cpp::result DownloadUv(std::shared_ptr& download_service) { - const auto py_bin_path = file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; +cpp::result DownloadUv( + std::shared_ptr& download_service) { + const auto py_bin_path = + file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; std::filesystem::create_directories(py_bin_path); // NOTE: do we need a mechanism to update uv, or just pin uv version with cortex release? 
@@ -27,23 +29,30 @@ cpp::result DownloadUv(std::shared_ptr& down fname_stream << "uv-"; auto system_info = system_info_utils::GetSystemInfo(); - if (system_info->arch == "amd64") fname_stream << "x86_64"; - else if (system_info->arch == "arm64") fname_stream << "aarch64"; + if (system_info->arch == "amd64") + fname_stream << "x86_64"; + else if (system_info->arch == "arm64") + fname_stream << "aarch64"; // NOTE: there is also a musl linux version - if (system_info->os == kMacOs) fname_stream << "-apple-darwin.tar.gz"; - else if (system_info->os == kWindowsOs) fname_stream << "-pc-windows-msvc.zip"; - else if (system_info->os == kLinuxOs) fname_stream << "-unknown-linux-gnu.tar.gz"; + if (system_info->os == kMacOs) + fname_stream << "-apple-darwin.tar.gz"; + else if (system_info->os == kWindowsOs) + fname_stream << "-pc-windows-msvc.zip"; + else if (system_info->os == kLinuxOs) + fname_stream << "-unknown-linux-gnu.tar.gz"; const std::string fname = fname_stream.str(); - const std::string base_url = "https://github.com/astral-sh/uv/releases/download/"; + const std::string base_url = + "https://github.com/astral-sh/uv/releases/download/"; std::stringstream url_stream; url_stream << base_url << uv_version << "/" << fname; const std::string url = url_stream.str(); CTL_INF("Download uv from " << url); - auto on_finished = [py_bin_path, uv_version](const DownloadTask& finishedTask) { + auto on_finished = [py_bin_path, + uv_version](const DownloadTask& finishedTask) { // try to unzip the downloaded file const std::string download_path = finishedTask.items[0].localPath.string(); @@ -54,12 +63,11 @@ cpp::result DownloadUv(std::shared_ptr& down auto downloadTask = DownloadTask{.id = "uv", .type = DownloadType::Engine, - .items = { - DownloadItem{ - .id = "uv", - .downloadUrl = url, - .localPath = py_bin_path / fname, - }}}; + .items = {DownloadItem{ + .id = "uv", + .downloadUrl = url, + .localPath = py_bin_path / fname, + }}}; auto add_task_result = download_service->AddTask(downloadTask, on_finished); if (add_task_result.has_error()) { @@ -70,8 +78,8 @@ cpp::result DownloadUv(std::shared_ptr& down std::string GetUvPath() { // NOTE: do I need to add .exe for windows? - const auto path = file_manager_utils::GetCortexDataPath() - / "python_engine" / "bin" / "uv"; + const auto path = + file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; return path.string(); } bool IsUvInstalled() { @@ -91,7 +99,8 @@ PythonEngine::~PythonEngine() { // NOTE: what happens if we can't kill subprocess? 
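For orientation, the load path further below consumes a model.yml shaped roughly like this; a hypothetical example with invented values, field names taken from PythonModelConfig:

    # model.yml for a python-engine model (illustrative only)
    name: my-python-model
    version: 1
    engine: python
    entrypoint: ["main.py", "--port", "8080"]
    port: 8080

Given such a file, the engine runs the entrypoint with uv run from the model directory and proxies requests for the model to the declared port.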
std::unique_lock write_lock(mutex); for (auto& [model_name, py_proc] : model_process_map) { - if (py_proc.IsAlive()) py_proc.Kill(); + if (py_proc.IsAlive()) + py_proc.Kill(); } } @@ -109,8 +118,7 @@ static std::pair CreateResponse( if (has_error) { CTL_ERR(msg); res["error"] = msg; - } - else { + } else { res["status"] = msg; } @@ -123,7 +131,7 @@ void PythonEngine::LoadModel( if (!json_body->isMember("model") || !json_body->isMember("model_dir")) { auto [status, error] = CreateResponse( - "Missing required fields: model or model_dir", k400BadRequest); + "Missing required fields: model or model_dir", k400BadRequest); callback(std::move(status), std::move(error)); return; } @@ -137,8 +145,8 @@ void PythonEngine::LoadModel( { std::shared_lock read_lock(mutex); if (model_process_map.find(model) != model_process_map.end()) { - auto [status, error] = CreateResponse( - "Model already loaded!", k409Conflict); + auto [status, error] = + CreateResponse("Model already loaded!", k409Conflict); callback(std::move(status), std::move(error)); return; } @@ -154,9 +162,7 @@ void PythonEngine::LoadModel( } // https://docs.astral.sh/uv/reference/cli/#uv-run - std::vector command{GetUvPath(), - "run", - "--directory", + std::vector command{GetUvPath(), "run", "--directory", model_dir.string()}; for (const auto& item : py_cfg.entrypoint) command.push_back(item); @@ -165,8 +171,10 @@ void PythonEngine::LoadModel( const std::string stderr_path = (model_dir / "stderr.txt").string(); // create empty stdout.txt and stderr.txt for redirection - if (!std::filesystem::exists(stdout_path)) std::ofstream(stdout_path).flush(); - if (!std::filesystem::exists(stderr_path)) std::ofstream(stderr_path).flush(); + if (!std::filesystem::exists(stdout_path)) + std::ofstream(stdout_path).flush(); + if (!std::filesystem::exists(stderr_path)) + std::ofstream(stderr_path).flush(); // NOTE: process may start, but exits/crashes later // TODO: wait for a few seconds, then check if process is alive @@ -174,8 +182,9 @@ void PythonEngine::LoadModel( if (pid == -1) { throw std::runtime_error("Fail to spawn process with pid -1"); } - const uint64_t start_time = std::chrono::system_clock::now().time_since_epoch() / - std::chrono::milliseconds(1); + const uint64_t start_time = + std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); std::unique_lock write_lock(mutex); model_process_map[model] = {pid, py_cfg.port, start_time}; @@ -187,17 +196,17 @@ void PythonEngine::LoadModel( } auto [status, res] = CreateResponse( - "Model loaded successfully with pid: " + std::to_string(pid), - k200OK); + "Model loaded successfully with pid: " + std::to_string(pid), k200OK); callback(std::move(status), std::move(res)); } void PythonEngine::UnloadModel( - std::shared_ptr json_body, - std::function&& callback) { + std::shared_ptr json_body, + std::function&& callback) { if (!json_body->isMember("model")) { - auto [status, error] = CreateResponse("Missing required field: model", k400BadRequest); + auto [status, error] = + CreateResponse("Missing required field: model", k400BadRequest); callback(std::move(status), std::move(error)); return; } @@ -248,11 +257,12 @@ void PythonEngine::UnloadModel( } void PythonEngine::GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) { + std::shared_ptr json_body, + std::function&& callback) { if (!json_body->isMember("model")) { - auto [status, error] = CreateResponse("Missing required field: model", k400BadRequest); + auto [status, error] = + CreateResponse("Missing 
required field: model", k400BadRequest); callback(std::move(status), std::move(error)); return; } @@ -296,8 +306,8 @@ void PythonEngine::GetModelStatus( } void PythonEngine::GetModels( - std::shared_ptr jsonBody, - std::function&& callback) { + std::shared_ptr jsonBody, + std::function&& callback) { Json::Value res, model_list(Json::arrayValue), status; { diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 6189da05f..904c9aa63 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -12,7 +12,8 @@ namespace python_engine { // UV-related functions -cpp::result DownloadUv(std::shared_ptr& download_service); +cpp::result DownloadUv( + std::shared_ptr& download_service); std::string GetUvPath(); bool IsUvInstalled(); @@ -35,17 +36,17 @@ class PythonEngine : public PythonEngineI { ~PythonEngine(); void LoadModel( - std::shared_ptr json_body, - std::function&& callback) override; + std::shared_ptr json_body, + std::function&& callback) override; void UnloadModel( - std::shared_ptr json_body, - std::function&& callback) override; + std::shared_ptr json_body, + std::function&& callback) override; void GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) override; + std::shared_ptr json_body, + std::function&& callback) override; void GetModels( - std::shared_ptr jsonBody, - std::function&& callback) override; + std::shared_ptr jsonBody, + std::function&& callback) override; cpp::result GetPort(const std::string& model) override; }; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 9666c93ad..bdb5ffebb 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -237,8 +237,7 @@ cpp::result EngineService::DownloadEngine( } cpp::result EngineService::DownloadLlamaCpp( - const std::string& version, - const std::optional variant_name) { + const std::string& version, const std::optional variant_name) { const std::string engine = kLlamaRepo; auto normalized_version = version == "latest" @@ -930,7 +929,9 @@ cpp::result EngineService::IsEngineReady( // Check for python engine if (engine == kPythonEngine) { if (!python_engine::IsUvInstalled()) { - return cpp::fail("Python engine is not ready. Please run `cortex engines install python`"); + return cpp::fail( + "Python engine is not ready. 
Please run `cortex engines install " + "python`"); } return true; } diff --git a/engine/services/engine_service.h b/engine/services/engine_service.h index 18631c279..b2cc1d7c4 100644 --- a/engine/services/engine_service.h +++ b/engine/services/engine_service.h @@ -132,7 +132,6 @@ class EngineService : public EngineServiceI { cpp::result UpdateEngine( const std::string& engine); - cpp::result, std::string> GetEngines(); cpp::result GetEngineById(int id); diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index a899eb2cf..82a162a9f 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -144,8 +144,8 @@ cpp::result InferenceService::HandleEmbedding( std::get(engine_result.value()) ->HandleEmbedding(json_body, std::move(cb)); } else if (std::holds_alternative(engine_result.value())) { - return cpp::fail(GetUnsupportedResponse( - "Python engine does not support Embedding")); + return cpp::fail( + GetUnsupportedResponse("Python engine does not support Embedding")); } else { std::get(engine_result.value()) ->HandleEmbedding(json_body, std::move(cb)); @@ -211,7 +211,8 @@ cpp::result InferenceService::HandleRouteRequest( return {}; } -cpp::result InferenceService::GetPythonPort(const std::string& model) { +cpp::result InferenceService::GetPythonPort( + const std::string& model) { auto engine_result = engine_service_->GetLoadedEngine(kPythonEngine); if (engine_result.has_error()) { return cpp::fail("Python engine is not loaded yet"); @@ -249,14 +250,11 @@ InferResult InferenceService::LoadModel( r = res; }; if (std::holds_alternative(engine)) { - std::get(engine) - ->LoadModel(json_body, std::move(cb)); + std::get(engine)->LoadModel(json_body, std::move(cb)); } else if (std::holds_alternative(engine)) { - std::get(engine) - ->LoadModel(json_body, std::move(cb)); + std::get(engine)->LoadModel(json_body, std::move(cb)); } else { - std::get(engine) - ->LoadModel(json_body, std::move(cb)); + std::get(engine)->LoadModel(json_body, std::move(cb)); } // Save model config to reload if needed auto model_id = json_body->get("model", "").asString(); @@ -289,14 +287,14 @@ InferResult InferenceService::UnloadModel(const std::string& engine_name, }; auto engine = engine_result.value(); if (std::holds_alternative(engine)) { - std::get(engine) - ->UnloadModel(std::make_shared(json_body), std::move(cb)); + std::get(engine)->UnloadModel( + std::make_shared(json_body), std::move(cb)); } else if (std::holds_alternative(engine)) { - std::get(engine) - ->UnloadModel(std::make_shared(json_body), std::move(cb)); + std::get(engine)->UnloadModel( + std::make_shared(json_body), std::move(cb)); } else { - std::get(engine) - ->UnloadModel(std::make_shared(json_body), std::move(cb)); + std::get(engine)->UnloadModel( + std::make_shared(json_body), std::move(cb)); } return std::make_pair(stt, r); @@ -331,14 +329,11 @@ InferResult InferenceService::GetModelStatus( }; auto engine = engine_result.value(); if (std::holds_alternative(engine)) { - std::get(engine) - ->GetModelStatus(json_body, std::move(cb)); + std::get(engine)->GetModelStatus(json_body, std::move(cb)); } else if (std::holds_alternative(engine)) { - std::get(engine) - ->GetModelStatus(json_body, std::move(cb)); + std::get(engine)->GetModelStatus(json_body, std::move(cb)); } else { - std::get(engine) - ->GetModelStatus(json_body, std::move(cb)); + std::get(engine)->GetModelStatus(json_body, std::move(cb)); } return std::make_pair(stt, r); diff --git a/engine/services/model_service.cc 
b/engine/services/model_service.cc index 2f4317079..93a48e72c 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -761,11 +761,13 @@ cpp::result ModelService::StartModel( // Check if Python model first if (mc.engine == kPythonEngine) { - const std::string model_yaml_path = model_entry.value().path_to_model_yaml; + const std::string model_yaml_path = + model_entry.value().path_to_model_yaml; json_data["model"] = model_handle; json_data["model_dir"] = fmu::ToAbsoluteCortexDataPath( - fs::path(model_yaml_path).parent_path()).string(); + fs::path(model_yaml_path).parent_path()) + .string(); json_data["engine"] = mc.engine; assert(!!inference_svc_); diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index fa48cdd7d..c0bd29458 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -1,11 +1,11 @@ #include "utils/process/utils.h" -#include "utils/logging_utils.h" #include +#include "utils/logging_utils.h" #if defined(_WIN32) #include #elif defined(__APPLE__) || defined(__linux__) -extern char **environ; // environment variables +extern char** environ; // environment variables #include #include #endif @@ -88,7 +88,7 @@ pid_t SpawnProcess(const std::vector& command, // redirect stdout and stderr // caller should make sure the redirect files exist. - posix_spawn_file_actions_t *action_ptr = NULL; + posix_spawn_file_actions_t* action_ptr = NULL; if (!stdout_file.empty() || !stderr_file.empty()) { posix_spawn_file_actions_t action; diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index d59e50103..f4220e4de 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -12,8 +12,8 @@ using pid_t = DWORD; #include #endif -#include #include +#include namespace cortex::process { std::string ConstructWindowsCommandLine(const std::vector& args); @@ -26,4 +26,4 @@ pid_t SpawnProcess(const std::vector& command, bool IsProcessAlive(pid_t pid); bool KillProcess(pid_t pid); -} +} // namespace cortex::process From 17688264e25f9ef5ef714018a482bf02ff9898d8 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 26 Feb 2025 09:53:09 +0800 Subject: [PATCH 32/73] run uv sync after model download --- .../extensions/python-engine/python_engine.cc | 63 ++++++++++++++++++- .../extensions/python-engine/python_engine.h | 6 ++ engine/services/model_service.cc | 10 +++ engine/utils/process/utils.cc | 11 +++- engine/utils/process/utils.h | 3 +- 5 files changed, 89 insertions(+), 4 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 635e35a78..a4cf78ba8 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -59,6 +59,22 @@ cpp::result DownloadUv( archive_utils::ExtractArchive(download_path, py_bin_path.string(), true); set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); std::filesystem::remove(download_path); + + // install Python3.10 from Astral. this will be preferred over system + // Python when possible. + // NOTE: currently this will install to a user-wide directory. we can + // install to a specific location using `--install-dir`, but later + // invocation of `uv run` needs to have `UV_PYTHON_INSTALL_DIR` set to use + // this Python installation. + // we can add this once we allow passing custom env var to SpawnProcess(). 
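The BuildUvCommand helper introduced in this commit keeps uv's cache inside the cortexcpp data directory, so the Python install spawned below is effectively the following command line (paths shortened; <data> is the cortexcpp data folder):

    // <data>/python_engine/bin/uv --cache-dir <data>/python_engine/cache/uv python install 3.10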
+ // https://docs.astral.sh/uv/reference/cli/#uv-python-install + std::vector command = BuildUvCommand("python"); + command.push_back("install"); + command.push_back("3.10"); + + const pid_t pid = cortex::process::SpawnProcess(command, "", "", true); + if (pid == -1) + return cpp::fail("Fail to spawn process"); }; auto downloadTask = DownloadTask{.id = "uv", @@ -82,10 +98,53 @@ std::string GetUvPath() { file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; return path.string(); } + +// use our own cache dir so that when users delete cortexcpp/, everything is deleted. +std::string GetUvCacheDir() { + const auto path = file_manager_utils::GetCortexDataPath() / "python_engine" / + "cache" / "uv"; + return path.string(); +} + +std::vector BuildUvCommand(const std::string& action, + const std::string& directory) { + std::vector command = {GetUvPath(), "--cache-dir", + GetUvCacheDir()}; + if (!directory.empty()) { + command.push_back("--directory"); + command.push_back(directory); + } + command.push_back(action); + return command; +} + bool IsUvInstalled() { return std::filesystem::exists(GetUvPath()); } +cpp::result UvDownloadDeps( + const std::filesystem::path& model_dir) { + if (!IsUvInstalled()) + return cpp::fail( + "uv is not installed. Please run `cortex engines install python`."); + + std::vector command = BuildUvCommand("sync", model_dir.string()); + + // script mode. 1st argument is path to .py script + if (!std::filesystem::exists(model_dir / "pyproject.toml")) { + config::PythonModelConfig py_cfg; + py_cfg.ReadFromYaml((model_dir / "model.yml").string()); + command.push_back("--script"); + command.push_back(py_cfg.entrypoint[0]); + } + + const pid_t pid = cortex::process::SpawnProcess(command, "", "", true); + if (pid == -1) + return cpp::fail("Fail to install dependencies"); + + return {}; +} + bool PythonEngine::PythonSubprocess::IsAlive() { return cortex::process::IsProcessAlive(pid); } @@ -162,8 +221,8 @@ void PythonEngine::LoadModel( } // https://docs.astral.sh/uv/reference/cli/#uv-run - std::vector command{GetUvPath(), "run", "--directory", - model_dir.string()}; + std::vector command = + BuildUvCommand("run", model_dir.string()); for (const auto& item : py_cfg.entrypoint) command.push_back(item); diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 904c9aa63..02743ac22 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -15,7 +16,12 @@ namespace python_engine { cpp::result DownloadUv( std::shared_ptr& download_service); std::string GetUvPath(); +std::string GetUvCacheDir(); +std::vector BuildUvCommand(const std::string& action, + const std::string& directory = ""); bool IsUvInstalled(); +cpp::result UvDownloadDeps( + const std::filesystem::path& yaml_path); class PythonEngine : public PythonEngineI { private: diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 93a48e72c..a6c6037d8 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -14,6 +14,7 @@ #include "services/inference_service.h" +#include "extensions/python-engine/python_engine.h" #include "utils/cli_selection_utils.h" #include "utils/engine_constants.h" #include "utils/file_manager_utils.h" @@ -507,6 +508,7 @@ ModelService::DownloadModelFromCortexsoAsync( yaml_handler.ModelConfigFromFile(model_yml_item->localPath.string()); auto 
mc = yaml_handler.GetModelConfig(); + // post-download hooks for different engines if (mc.engine == kLlamaEngine) { mc.model = unique_model_id; @@ -517,6 +519,14 @@ ModelService::DownloadModelFromCortexsoAsync( mc.size = model_size; yaml_handler.UpdateModelConfig(mc); yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); + + } else if (mc.engine == kPythonEngine) { + const auto model_dir = model_yml_item->localPath.parent_path(); + auto result = python_engine::UvDownloadDeps(model_dir); + if (result.has_error()) { + CTL_ERR(result.error()); + return; + } } auto rel = diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index c0bd29458..efc018ad5 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -41,7 +41,8 @@ std::vector ConvertToArgv(const std::vector& args) { pid_t SpawnProcess(const std::vector& command, const std::string stdout_file, - const std::string stderr_file) { + const std::string stderr_file, + bool wait) { try { #if defined(_WIN32) // Windows process creation @@ -73,6 +74,10 @@ pid_t SpawnProcess(const std::vector& command, // Store the process ID pid_t pid = pi.dwProcessId; + // wait for process to terminate + if (wait) + WaitForSingleObject(pi.hProcess, INFINITE); + // Close handles to avoid resource leaks CloseHandle(pi.hProcess); CloseHandle(pi.hThread); @@ -128,6 +133,10 @@ pid_t SpawnProcess(const std::vector& command, throw std::runtime_error("Failed to spawn process"); } + // wait for process to terminate + if (wait) + waitpid(pid, NULL, 0); + return pid; #else diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index f4220e4de..f8719aa68 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -22,7 +22,8 @@ std::vector ConvertToArgv(const std::vector& args); pid_t SpawnProcess(const std::vector& command, const std::string stdout_file = "", - const std::string stderr_file = ""); + const std::string stderr_file = "", + bool wait = false); bool IsProcessAlive(pid_t pid); bool KillProcess(pid_t pid); From 7627eac95612572cce15fd7ffe54d1a0de144c0f Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 26 Feb 2025 11:15:19 +0800 Subject: [PATCH 33/73] download CUDA for python engine --- engine/cli/commands/engine_install_cmd.cc | 17 +++++++++++------ .../extensions/python-engine/python_engine.cc | 4 ++-- engine/services/engine_service.cc | 4 ++-- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/engine/cli/commands/engine_install_cmd.cc b/engine/cli/commands/engine_install_cmd.cc index 85a5def5d..d3fdf8b9b 100644 --- a/engine/cli/commands/engine_install_cmd.cc +++ b/engine/cli/commands/engine_install_cmd.cc @@ -7,6 +7,13 @@ #include "utils/string_utils.h" namespace commands { + +// NOTE: should have a single source of truth between CLI and server +static bool NeedCudaDownload(const std::string& engine) { + return !system_info_utils::GetDriverAndCudaVersion().second.empty() && + engine != kPythonEngine; +} + bool EngineInstallCmd::Exec(const std::string& engine, const std::string& version, const std::string& src) { @@ -35,10 +42,9 @@ bool EngineInstallCmd::Exec(const std::string& engine, if (show_menu_) { DownloadProgress dp; dp.Connect(host_, port_); + bool need_cuda_download = NeedCudaDownload(engine); // engine can be small, so need to start ws first - auto dp_res = std::async(std::launch::deferred, [&dp] { - bool need_cuda_download = - !system_info_utils::GetDriverAndCudaVersion().second.empty(); + auto dp_res = std::async(std::launch::deferred, [&dp, 
need_cuda_download] { if (need_cuda_download) { return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit}); } else { @@ -148,10 +154,9 @@ bool EngineInstallCmd::Exec(const std::string& engine, // default DownloadProgress dp; dp.Connect(host_, port_); + bool need_cuda_download = NeedCudaDownload(engine); // engine can be small, so need to start ws first - auto dp_res = std::async(std::launch::deferred, [&dp] { - bool need_cuda_download = - !system_info_utils::GetDriverAndCudaVersion().second.empty(); + auto dp_res = std::async(std::launch::deferred, [&dp, need_cuda_download] { if (need_cuda_download) { return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit}); } else { diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index a4cf78ba8..201211abf 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -77,10 +77,10 @@ cpp::result DownloadUv( return cpp::fail("Fail to spawn process"); }; - auto downloadTask = DownloadTask{.id = "uv", + auto downloadTask = DownloadTask{.id = "python-uv", .type = DownloadType::Engine, .items = {DownloadItem{ - .id = "uv", + .id = "python-uv", .downloadUrl = url, .localPath = py_bin_path / fname, }}}; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index bdb5ffebb..ac0c9eae9 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -377,8 +377,8 @@ cpp::result EngineService::DownloadLlamaCpp( cpp::result EngineService::DownloadCuda( const std::string& engine, bool async) { - if (hw_inf_.sys_inf->os == "mac") { - // mac does not require cuda toolkit + if (hw_inf_.sys_inf->os == "mac" || engine == kPythonEngine) { + // mac and Python engine do not require cuda toolkit return true; } From 06503c00b1ba9162021e334486e1ecb89173c0bc Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 26 Feb 2025 13:39:54 +0800 Subject: [PATCH 34/73] add .exe for windows --- engine/extensions/python-engine/python_engine.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 201211abf..1a7af51a3 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -93,9 +93,10 @@ cpp::result DownloadUv( } std::string GetUvPath() { - // NOTE: do I need to add .exe for windows? + auto system_info = system_info_utils::GetSystemInfo(); + const auto bin_name = system_info->os == kWindowsOs ? "uv.exe" : "uv"; const auto path = - file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / "uv"; + file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / bin_name; return path.string(); } From 176f8784238b5debde69dd3fa1e69520a14cb0f8 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 26 Feb 2025 16:49:01 +0800 Subject: [PATCH 35/73] destroy file action in posix --- engine/utils/process/utils.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index efc018ad5..fe336a476 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -126,8 +126,9 @@ pid_t SpawnProcess(const std::vector& command, environ // environment (inherit) ); - // NOTE: only destroy this when process ends? 
- // posix_spawn_file_actions_destroy(action_pr); + // NOTE: it seems like it's ok to destroy this immediately before + // subprocess terminates. + posix_spawn_file_actions_destroy(action_ptr); if (spawn_result != 0) { throw std::runtime_error("Failed to spawn process"); From f7bddc22da231c9c6b3de51791f154978167c302 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 27 Feb 2025 13:27:10 +0800 Subject: [PATCH 36/73] revert name change to avoid conflict --- engine/utils/engine_constants.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/utils/engine_constants.h b/engine/utils/engine_constants.h index 10d19b160..8eeaa1946 100644 --- a/engine/utils/engine_constants.h +++ b/engine/utils/engine_constants.h @@ -1,7 +1,7 @@ #pragma once constexpr const auto kLlamaEngine = "llama-cpp"; -constexpr const auto kPythonEngine = "python"; +constexpr const auto kPythonEngine = "python-engine"; constexpr const auto kOpenAiEngine = "openai"; constexpr const auto kAnthropicEngine = "anthropic"; From 728e7eb02062920da4a49b07255ba2a34468a7e8 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 27 Feb 2025 15:12:40 +0800 Subject: [PATCH 37/73] check for NULL before destroy file action --- engine/utils/process/utils.cc | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index fe336a476..0ca5f8bfb 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -40,8 +40,7 @@ std::vector ConvertToArgv(const std::vector& args) { } pid_t SpawnProcess(const std::vector& command, - const std::string stdout_file, - const std::string stderr_file, + const std::string stdout_file, const std::string stderr_file, bool wait) { try { #if defined(_WIN32) @@ -102,17 +101,25 @@ pid_t SpawnProcess(const std::vector& command, if (!stdout_file.empty()) { if (std::filesystem::exists(stdout_file)) { - posix_spawn_file_actions_addopen(&action, STDOUT_FILENO, - stdout_file.data(), - O_WRONLY | O_APPEND, 0); + int rc = posix_spawn_file_actions_addopen(&action, STDOUT_FILENO, + stdout_file.data(), + O_WRONLY | O_APPEND, 0); + if (rc != 0) { + posix_spawn_file_actions_destroy(action_ptr); + throw std::runtime_error("Unable to add stdout to file action"); + } } } if (!stderr_file.empty()) { if (std::filesystem::exists(stderr_file)) { - posix_spawn_file_actions_addopen(&action, STDERR_FILENO, - stderr_file.data(), - O_WRONLY | O_APPEND, 0); + int rc = posix_spawn_file_actions_addopen(&action, STDERR_FILENO, + stderr_file.data(), + O_WRONLY | O_APPEND, 0); + if (rc != 0) { + posix_spawn_file_actions_destroy(action_ptr); + throw std::runtime_error("Unable to add stderr to file action"); + } } } } @@ -128,7 +135,9 @@ pid_t SpawnProcess(const std::vector& command, // NOTE: it seems like it's ok to destroy this immediately before // subprocess terminates. 
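 // (posix_spawn() applies the file actions while it creates the child, so the
 // parent can safely destroy the actions struct as soon as the call returns.)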
- posix_spawn_file_actions_destroy(action_ptr); + if (action_ptr != NULL) { + posix_spawn_file_actions_destroy(action_ptr); + } if (spawn_result != 0) { throw std::runtime_error("Failed to spawn process"); From 560b9fe025e6513c4a3134946a272e103c7fbd20 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 27 Feb 2025 16:51:04 +0800 Subject: [PATCH 38/73] fix windows --- engine/cli/commands/server_start_cmd.cc | 6 +- .../extensions/python-engine/python_engine.cc | 53 +++-- .../extensions/python-engine/python_engine.h | 2 +- engine/services/hardware_service.cc | 6 +- engine/utils/process/utils.cc | 190 ++++++++++++++---- engine/utils/process/utils.h | 26 ++- 6 files changed, 213 insertions(+), 70 deletions(-) diff --git a/engine/cli/commands/server_start_cmd.cc b/engine/cli/commands/server_start_cmd.cc index c2ef779f1..a4bcb1eb5 100644 --- a/engine/cli/commands/server_start_cmd.cc +++ b/engine/cli/commands/server_start_cmd.cc @@ -119,10 +119,10 @@ bool ServerStartCmd::Exec(const std::string& host, int port, commands.push_back(get_data_folder_path()); commands.push_back("--loglevel"); commands.push_back(log_level_); - auto pid = cortex::process::SpawnProcess(commands); - if (pid < 0) { + auto result = cortex::process::SpawnProcess(commands); + if (result.has_error()) { // Fork failed - std::cerr << "Could not start server: " << std::endl; + std::cerr << "Could not start server: " << result.error() << std::endl; return false; } else { // Parent process diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 1a7af51a3..7a40545a7 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -72,9 +72,17 @@ cpp::result DownloadUv( command.push_back("install"); command.push_back("3.10"); - const pid_t pid = cortex::process::SpawnProcess(command, "", "", true); - if (pid == -1) - return cpp::fail("Fail to spawn process"); + // NOTE: errors in download callback won't be propagated to caller + auto result = cortex::process::SpawnProcess(command); + if (result.has_error()) { + CTL_ERR(result.error()); + return; + } + + if (!cortex::process::WaitProcess(result.value())) { + CTL_ERR("Process spawned but fail to wait"); + return; + } }; auto downloadTask = DownloadTask{.id = "python-uv", @@ -95,8 +103,8 @@ cpp::result DownloadUv( std::string GetUvPath() { auto system_info = system_info_utils::GetSystemInfo(); const auto bin_name = system_info->os == kWindowsOs ? "uv.exe" : "uv"; - const auto path = - file_manager_utils::GetCortexDataPath() / "python_engine" / "bin" / bin_name; + const auto path = file_manager_utils::GetCortexDataPath() / "python_engine" / + "bin" / bin_name; return path.string(); } @@ -139,18 +147,22 @@ cpp::result UvDownloadDeps( command.push_back(py_cfg.entrypoint[0]); } - const pid_t pid = cortex::process::SpawnProcess(command, "", "", true); - if (pid == -1) - return cpp::fail("Fail to install dependencies"); + auto result = cortex::process::SpawnProcess(command); + if (result.has_error()) + return cpp::fail("Fail to install Python dependencies. 
" + result.error()); + + if (!cortex::process::WaitProcess(result.value())) { + return cpp::fail("Fail to install Python dependencies."); + } return {}; } bool PythonEngine::PythonSubprocess::IsAlive() { - return cortex::process::IsProcessAlive(pid); + return cortex::process::IsProcessAlive(proc_info); } bool PythonEngine::PythonSubprocess::Kill() { - return cortex::process::KillProcess(pid); + return cortex::process::KillProcess(proc_info); } PythonEngine::PythonEngine() {} @@ -238,15 +250,22 @@ void PythonEngine::LoadModel( // NOTE: process may start, but exits/crashes later // TODO: wait for a few seconds, then check if process is alive - pid = cortex::process::SpawnProcess(command, stdout_path, stderr_path); - if (pid == -1) { - throw std::runtime_error("Fail to spawn process with pid -1"); + auto result = + cortex::process::SpawnProcess(command, stdout_path, stderr_path); + if (result.has_error()) { + throw std::runtime_error(result.error()); } - const uint64_t start_time = - std::chrono::system_clock::now().time_since_epoch() / - std::chrono::milliseconds(1); + + PythonSubprocess py_proc; + py_proc.proc_info = result.value(); + py_proc.port = py_cfg.port; + py_proc.start_time = std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + + pid = py_proc.proc_info.pid; + std::unique_lock write_lock(mutex); - model_process_map[model] = {pid, py_cfg.port, start_time}; + model_process_map[model] = py_proc; } catch (const std::exception& e) { auto e_msg = e.what(); diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 02743ac22..ec7e38d72 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -26,7 +26,7 @@ cpp::result UvDownloadDeps( class PythonEngine : public PythonEngineI { private: struct PythonSubprocess { - pid_t pid; + cortex::process::ProcessInfo proc_info; int port; uint64_t start_time; diff --git a/engine/services/hardware_service.cc b/engine/services/hardware_service.cc index 972647b51..e6bcc89ef 100644 --- a/engine/services/hardware_service.cc +++ b/engine/services/hardware_service.cc @@ -197,10 +197,10 @@ bool HardwareService::Restart(const std::string& host, int port) { commands.push_back(get_data_folder_path()); commands.push_back("--loglevel"); commands.push_back(luh::LogLevelStr(luh::global_log_level)); - auto pid = cortex::process::SpawnProcess(commands); - if (pid < 0) { + auto result = cortex::process::SpawnProcess(commands); + if (result.has_error()) { // Fork failed - std::cerr << "Could not start server: " << std::endl; + std::cerr << "Could not start server: " << result.error() << std::endl; return false; } else { // Parent process diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 0ca5f8bfb..1cc97e2c2 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -39,9 +39,9 @@ std::vector ConvertToArgv(const std::vector& args) { return argv; } -pid_t SpawnProcess(const std::vector& command, - const std::string stdout_file, const std::string stderr_file, - bool wait) { +cpp::result SpawnProcess( + const std::vector& command, const std::string& stdout_file, + const std::string& stderr_file) { try { #if defined(_WIN32) // Windows process creation @@ -49,6 +49,48 @@ pid_t SpawnProcess(const std::vector& command, PROCESS_INFORMATION pi = {0}; si.cb = sizeof(si); + HANDLE hJob = NULL, hStdOut = NULL, hStdErr = NULL; + + // redirect stdout and stderr + if 
(!stdout_file.empty() || !stderr_file.empty()) { + si.dwFlags |= STARTF_USESTDHANDLES; + + // when STARTF_USESTDHANDLES is set, we have to explicitly inherit + // parent's handles, otherwise subprocess may successfuly spawn but + // exit immediately. + si.hStdOutput = GetStdHandle(STD_OUTPUT_HANDLE); + si.hStdError = GetStdHandle(STD_ERROR_HANDLE); + si.hStdInput = GetStdHandle(STD_INPUT_HANDLE); + + SECURITY_ATTRIBUTES sa; + sa.nLength = sizeof(sa); + sa.lpSecurityDescriptor = NULL; + sa.bInheritHandle = TRUE; + + if (!stdout_file.empty()) { + hStdOut = CreateFileA(stdout_file.c_str(), FILE_APPEND_DATA, + FILE_SHARE_READ | FILE_SHARE_WRITE, &sa, + OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); + if (hStdOut == INVALID_HANDLE_VALUE) + throw std::runtime_error("Unable to create " + stdout_file + + " to redirect stdout"); + + si.hStdOutput = hStdOut; + } + if (!stderr_file.empty()) { + hStdErr = CreateFileA(stderr_file.c_str(), FILE_APPEND_DATA, + FILE_SHARE_WRITE | FILE_SHARE_READ, &sa, + OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); + if (hStdErr == INVALID_HANDLE_VALUE) { + if (hStdOut != NULL) + CloseHandle(hStdout) throw std::runtime_error( + "Unable to create " + stderr_file + " to redirect stderr"); + } + + si.hStdError = hStdErr; + } + } + // Construct command line std::string cmd_line = ConstructWindowsCommandLine(command); @@ -56,32 +98,60 @@ pid_t SpawnProcess(const std::vector& command, char command_buffer[4096]; strncpy_s(command_buffer, cmd_line.c_str(), sizeof(command_buffer)); - if (!CreateProcessA(NULL, // lpApplicationName - command_buffer, // lpCommandLine - NULL, // lpProcessAttributes - NULL, // lpThreadAttributes - FALSE, // bInheritHandles - 0, // dwCreationFlags - NULL, // lpEnvironment - NULL, // lpCurrentDirectory - &si, // lpStartupInfo - &pi // lpProcessInformation + // create a suspended process. we will resume it later after adding it to + // a job (see below) + if (!CreateProcessA(NULL, // lpApplicationName + command_buffer, // lpCommandLine + NULL, // lpProcessAttributes + NULL, // lpThreadAttributes + FALSE, // bInheritHandles + CREATE_SUSPENDED, // dwCreationFlags + NULL, // lpEnvironment + NULL, // lpCurrentDirectory + &si, // lpStartupInfo + &pi // lpProcessInformation )) { + if (hStdOut != NULL) + CloseHandle(hStdOut); + if (hStdErr != NULL) + CloseHandle(hStdErr); throw std::runtime_error("Failed to create process on Windows"); } - // Store the process ID - pid_t pid = pi.dwProcessId; + // https://devblogs.microsoft.com/oldnewthing/20131209-00/?p=2433 + // resume thread after job object assignment to make sure child processes + // will be spawned in the same job object. 
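+ // keep the job handle in ProcessInfo.hJob so that KillProcess() can later
+ // terminate the whole process tree via TerminateJobObject().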
+ HANDLE hJob = CreateJobObjectA(NULL, NULL); + std::string err_msg; + bool success = false; + if (!AssignProcessToJobObject(hJob, pi.hProcess)) { + err_msg = "Unable to assign process to job object"; + } else if (ResumeThread(pi.hThread) == (DWORD)(-1)) { + err_msg = "Unable to resume thread"; + } else { + success = true; + } - // wait for process to terminate - if (wait) - WaitForSingleObject(pi.hProcess, INFINITE); + // clean up if not successful + if (!success) { + TerminateProcess(pi.hProcess, 0); + CloseHandle(pi.hProcess); + CloseHandle(pi.hThread); + CloseHandle(hJob); + throw std::runtime_error(err_msg); + } // Close handles to avoid resource leaks CloseHandle(pi.hProcess); CloseHandle(pi.hThread); - return pid; + ProcessInfo proc_info; + proc_info.pid = pi.dwProcessId; + proc_info.hJob = hJob; + proc_info.hStdOut = hStdOut; + proc_info.hStdErr = hStdErr; + + return proc_info; #elif defined(__APPLE__) || defined(__linux__) // POSIX process creation @@ -143,22 +213,25 @@ pid_t SpawnProcess(const std::vector& command, throw std::runtime_error("Failed to spawn process"); } - // wait for process to terminate - if (wait) - waitpid(pid, NULL, 0); + ProcessInfo proc_info; + proc_info.pid = pid; - return pid; + return proc_info; #else #error Unsupported platform #endif } catch (const std::exception& e) { LOG_ERROR << "Process spawning error: " << e.what(); - return -1; + return cpp::fail(e.what()); } } -bool IsProcessAlive(pid_t pid) { +bool IsProcessAlive(const ProcessInfo& proc_info) { + if (proc_info.pid == PID_TERMINATED) { + return false; + } + #ifdef _WIN32 // Windows implementation HANDLE snapshot = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0); @@ -171,7 +244,7 @@ bool IsProcessAlive(pid_t pid) { if (Process32First(snapshot, &processEntry)) { do { - if (processEntry.th32ProcessID == pid) { + if (processEntry.th32ProcessID == proc_info.pid) { CloseHandle(snapshot); return true; } @@ -183,13 +256,10 @@ bool IsProcessAlive(pid_t pid) { #elif defined(__APPLE__) || defined(__linux__) // Unix-like systems (Linux and macOS) implementation - if (pid <= 0) { - return false; - } // Try to send signal 0 to the process // This doesn't actually send a signal but checks if we can send signals to the process - int result = kill(pid, 0); + int result = kill(proc_info.pid, 0); if (result == 0) { return true; // Process exists and we have permission to send it signals @@ -201,20 +271,60 @@ bool IsProcessAlive(pid_t pid) { #endif } -bool KillProcess(pid_t pid) { +bool WaitProcess(ProcessInfo& proc_info) { + if (proc_info.pid == PID_TERMINATED) + return true; + #if defined(_WIN32) - HANDLE hProcess = OpenProcess(PROCESS_TERMINATE, FALSE, pid); - if (hProcess == NULL) { - LOG_ERROR << "Failed to open process"; - return false; + HANDLE hProcess = OpenProcess(PROCESS_TERMINATE, FALSE, proc_info.pid); + bool success = WaitForSingleObject(hProcess, INFINITE) == WAIT_OBJECT_0; + CloseHandle(hProcess); + if (success) { + proc_info.pid = PID_TERMINATED; + CloseHandle(proc_info.hJob); + proc_info.hJob = NULL; + } + return success; +#elif defined(__APPLE__) || defined(__linux__) + bool success = waitpid(proc_info.pid, NULL, 0) == proc_info.pid; + if (success) { + proc_info.pid = PID_TERMINATED; } + return success; +#else +#error "Unsupported platform" +#endif +} - bool is_success = TerminateProcess(hProcess, 0) == TRUE; - CloseHandle(hProcess); - return is_success; +bool KillProcess(ProcessInfo& proc_info) { + if (proc_info.pid == PID_TERMINATED) + return true; + +#if defined(_WIN32) + bool success = 
TerminateJobObject(proc_info.hJob, 0) == 0; + // clean up resources + if (success) { + proc_info.pid = PID_TERMINATED; + CloseHandle(proc_info.hJob); + proc_info.hJob = NULL; + if (proc_info.hStdOut != NULL) { + CloseHandle(proc_info.hStdOut); + proc_info.hStdOut = NULL; + } + if (proc_info.hStdErr != NULL) { + CloseHandle(proc_info.hStdErr); + proc_info.hStdErr = NULL; + } + } + return success; #elif defined(__APPLE__) || defined(__linux__) - // NOTE: should we use SIGKILL here to be consistent with Windows? - return kill(pid, SIGTERM) == 0; + // we send SIGTERM to subprocess. we trust that this subprocess will + // propagate SIGTERM correctly to its children processes. + bool success = kill(proc_info.pid, SIGTERM) == 0; + if (success) { + proc_info.pid = PID_TERMINATED; + } + return success; #else #error "Unsupported platform" #endif diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index f8719aa68..fcc93be90 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -14,17 +14,31 @@ using pid_t = DWORD; #include #include +#include "utils/result.hpp" namespace cortex::process { + +// set pid to this value to signal that this pid should not be used. +constexpr pid_t PID_TERMINATED = 0; + +struct ProcessInfo { + pid_t pid; +#ifdef _WIN32 + // hJob is used to terminate process and its children. + // hStdOut and hStdErr must be manually closed upon process termination. + HANDLE hJob, hStdOut, hStdErr; +#endif +}; + std::string ConstructWindowsCommandLine(const std::vector& args); std::vector ConvertToArgv(const std::vector& args); -pid_t SpawnProcess(const std::vector& command, - const std::string stdout_file = "", - const std::string stderr_file = "", - bool wait = false); -bool IsProcessAlive(pid_t pid); -bool KillProcess(pid_t pid); +cpp::result SpawnProcess( + const std::vector& command, + const std::string& stdout_file = "", const std::string& stderr_file = ""); +bool IsProcessAlive(const ProcessInfo& proc_info); +bool WaitProcess(ProcessInfo& proc_info); +bool KillProcess(ProcessInfo& proc_info); } // namespace cortex::process From f481c2fc79cc7fbe931aa5ef8e6e5864fdfbad2e Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 27 Feb 2025 18:00:53 +0800 Subject: [PATCH 39/73] fix windows --- engine/utils/process/utils.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 1cc97e2c2..3b3538b50 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -83,8 +83,10 @@ cpp::result SpawnProcess( OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); if (hStdErr == INVALID_HANDLE_VALUE) { if (hStdOut != NULL) - CloseHandle(hStdout) throw std::runtime_error( - "Unable to create " + stderr_file + " to redirect stderr"); + CloseHandle(hStdOut); + + throw std::runtime_error("Unable to create " + stderr_file + + " to redirect stderr"); } si.hStdError = hStdErr; @@ -121,7 +123,7 @@ cpp::result SpawnProcess( // https://devblogs.microsoft.com/oldnewthing/20131209-00/?p=2433 // resume thread after job object assignment to make sure child processes // will be spawned in the same job object. 
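 // hJob was already declared (as NULL) near the top of SpawnProcess(), so assign
 // to that variable here rather than shadowing it with a new local handle.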
- HANDLE hJob = CreateJobObjectA(NULL, NULL); + hJob = CreateJobObjectA(NULL, NULL); std::string err_msg; bool success = false; if (!AssignProcessToJobObject(hJob, pi.hProcess)) { @@ -138,6 +140,10 @@ cpp::result SpawnProcess( CloseHandle(pi.hProcess); CloseHandle(pi.hThread); CloseHandle(hJob); + if (hStdOut != NULL) + CloseHandle(hStdOut); + if (hStdErr != NULL) + CloseHandle(hStdErr); throw std::runtime_error(err_msg); } From 48e2015b5867c9cc5bc9d4c0d1a229eb693f8998 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 27 Feb 2025 18:57:01 +0800 Subject: [PATCH 40/73] fix windows subprocess --- engine/utils/process/utils.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 3b3538b50..1bb27d4c6 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -106,7 +106,7 @@ cpp::result SpawnProcess( command_buffer, // lpCommandLine NULL, // lpProcessAttributes NULL, // lpThreadAttributes - FALSE, // bInheritHandles + TRUE, // bInheritHandles CREATE_SUSPENDED, // dwCreationFlags NULL, // lpEnvironment NULL, // lpCurrentDirectory @@ -282,7 +282,7 @@ bool WaitProcess(ProcessInfo& proc_info) { return true; #if defined(_WIN32) - HANDLE hProcess = OpenProcess(PROCESS_TERMINATE, FALSE, proc_info.pid); + HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, proc_info.pid); bool success = WaitForSingleObject(hProcess, INFINITE) == WAIT_OBJECT_0; CloseHandle(hProcess); if (success) { From f02fc93e17a1edd5b6248e3e6280d5d537fd1674 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 27 Feb 2025 22:12:59 +0800 Subject: [PATCH 41/73] update test --- engine/e2e-test/api/engines/test_api_engine.py | 4 ++-- .../e2e-test/api/engines/test_api_engine_install_nightly.py | 2 +- engine/e2e-test/cli/engines/test_cli_engine_install.py | 4 ++-- .../e2e-test/cli/engines/test_cli_engine_install_nightly.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/engine/e2e-test/api/engines/test_api_engine.py b/engine/e2e-test/api/engines/test_api_engine.py index 61c0f72e5..f563be84d 100644 --- a/engine/e2e-test/api/engines/test_api_engine.py +++ b/engine/e2e-test/api/engines/test_api_engine.py @@ -54,12 +54,12 @@ async def test_engines_install_uninstall_llamacpp_should_be_successful(self): @pytest.mark.asyncio async def test_engines_install_uninstall_python_should_be_successful(self): - response = requests.post("http://localhost:3928/v1/engines/python/install") + response = requests.post("http://localhost:3928/v1/engines/python-engine/install") assert response.status_code == 200 await wait_for_websocket_download_success_event(timeout=None) time.sleep(30) - response = requests.delete("http://localhost:3928/v1/engines/python/install") + response = requests.delete("http://localhost:3928/v1/engines/python-engine/install") assert response.status_code == 200 @pytest.mark.asyncio diff --git a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py index 22aa669ee..ca7aa0870 100644 --- a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py +++ b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py @@ -23,7 +23,7 @@ def test_engines_install_llamacpp_should_be_successful(self): assert response.status_code == 200 def test_engines_install_python_should_be_successful(self): - response = requests.post("http://localhost:3928/v1/engines/python/install") + response = 
requests.post("http://localhost:3928/v1/engines/python-engine/install") assert response.status_code == 200 def test_engines_install_llamacpp_specific_version_and_variant(self): diff --git a/engine/e2e-test/cli/engines/test_cli_engine_install.py b/engine/e2e-test/cli/engines/test_cli_engine_install.py index ed2359248..ca298c828 100644 --- a/engine/e2e-test/cli/engines/test_cli_engine_install.py +++ b/engine/e2e-test/cli/engines/test_cli_engine_install.py @@ -34,11 +34,11 @@ def test_engines_install_llamacpp_should_be_successfully(self): def test_engines_install_python_should_be_successfully(self): exit_code, output, error = run( "Install Engine", - ["engines", "install", "python"], + ["engines", "install", "python-engine"], timeout=None, capture=False, ) - response = requests.get("http://127.0.0.1:3928/v1/engines/python") + response = requests.get("http://127.0.0.1:3928/v1/engines/python-engine") assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" diff --git a/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py b/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py index cd09c1542..b3fa6ee26 100644 --- a/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py +++ b/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py @@ -34,11 +34,11 @@ def test_engines_install_llamacpp_should_be_successfully(self): def test_engines_install_python_should_be_successfully(self): exit_code, output, error = run( "Install Engine", - ["engines", "install", "python"], + ["engines", "install", "python-engine"], timeout=None, capture=False, ) - response = requests.get("http://127.0.0.1:3928/v1/engines/python") + response = requests.get("http://127.0.0.1:3928/v1/engines/python-engine") assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" From 9918672fb4abc11fa4b36340275212f68627c995 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 28 Feb 2025 09:26:53 +0800 Subject: [PATCH 42/73] more robust checks and cleanup --- .../extensions/python-engine/python_engine.cc | 47 ++++---- engine/utils/process/utils.cc | 100 +++++++++++------- engine/utils/process/utils.h | 2 +- 3 files changed, 86 insertions(+), 63 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 7a40545a7..5565da1a1 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -213,14 +213,20 @@ void PythonEngine::LoadModel( const std::string model = (*json_body)["model"].asString(); const fs::path model_dir = (*json_body)["model_dir"].asString(); - // TODO: check if model is still alive { - std::shared_lock read_lock(mutex); + std::unique_lock write_lock(mutex); if (model_process_map.find(model) != model_process_map.end()) { - auto [status, error] = - CreateResponse("Model already loaded!", k409Conflict); - callback(std::move(status), std::move(error)); - return; + // check if model is still alive + if (model_process_map[model].IsAlive()) { + auto [status, error] = + CreateResponse("Model already loaded!", k409Conflict); + callback(std::move(status), std::move(error)); + return; + } else { + // if model has exited, try to load model again + CTL_WRN("Model " << model << " has exited unexpectedly"); + model_process_map.erase(model); + } } } @@ -248,8 +254,6 @@ void PythonEngine::LoadModel( if (!std::filesystem::exists(stderr_path)) std::ofstream(stderr_path).flush(); - // NOTE: process may 
start, but exits/crashes later - // TODO: wait for a few seconds, then check if process is alive auto result = cortex::process::SpawnProcess(command, stdout_path, stderr_path); if (result.has_error()) { @@ -308,13 +312,12 @@ void PythonEngine::UnloadModel( std::unique_lock write_lock(mutex); // check if subprocess is still alive + // NOTE: is this step necessary? the subprocess could have terminated + // after .IsAlive() and before .Kill() later. if (!model_process_map[model].IsAlive()) { + model_process_map.erase(model); const std::string msg = "Model " + model + " stopped running."; auto [status, error] = CreateResponse(msg, k400BadRequest); - - // NOTE: do we need to do any other cleanup for subprocesses? - model_process_map.erase(model); - callback(std::move(status), std::move(error)); return; } @@ -327,7 +330,6 @@ void PythonEngine::UnloadModel( return; } - // NOTE: do we need to do any other cleanup for subprocesses? model_process_map.erase(model); } @@ -366,12 +368,10 @@ void PythonEngine::GetModelStatus( // check if subprocess is still alive if (!model_process_map[model].IsAlive()) { + CTL_WRN("Model " << model << " has exited unexpectedly."); + model_process_map.erase(model); const std::string msg = "Model " + model + " stopped running."; auto [status, error] = CreateResponse(msg, k400BadRequest); - - // NOTE: do we need to do any other cleanup for subprocesses? - model_process_map.erase(model); - callback(std::move(status), std::move(error)); return; } @@ -390,9 +390,14 @@ void PythonEngine::GetModels( Json::Value res, model_list(Json::arrayValue), status; { - std::shared_lock read_lock(mutex); - for (const auto& [model_name, py_proc] : model_process_map) { - // TODO: check if py_proc is still alive + std::unique_lock write_lock(mutex); + for (auto& [model_name, py_proc] : model_process_map) { + if (!py_proc.IsAlive()) { + CTL_WRN("Model " << model_name << " has exited unexpectedly."); + model_process_map.erase(model_name); + continue; + } + Json::Value val; val["id"] = model_name; val["engine"] = kPythonEngine; @@ -433,7 +438,7 @@ cpp::result PythonEngine::GetPort(const std::string& model) { { std::unique_lock write_lock(mutex); if (!model_process_map[model].IsAlive()) { - // NOTE: do we need to do any other cleanup for subprocesses? 
+ CTL_WRN("Model " << model << " has exited unexpectedly."); model_process_map.erase(model); return cpp::fail("Model " + model + " stopped running."); } diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 1bb27d4c6..19b942d82 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -233,10 +233,30 @@ cpp::result SpawnProcess( } } -bool IsProcessAlive(const ProcessInfo& proc_info) { - if (proc_info.pid == PID_TERMINATED) { - return false; +static void SetProcessTerminated(ProcessInfo& proc_info) { + if (proc_info.pid == PID_TERMINATED) + return; + + proc_info.pid = PID_TERMINATED; + + // close handles on Windows +#if defined(_WIN32) + CloseHandle(proc_info.hJob); + proc_info.hJob = NULL; + if (proc_info.hStdOut != NULL) { + CloseHandle(proc_info.hStdOut); + proc_info.hStdOut = NULL; + } + if (proc_info.hStdErr != NULL) { + CloseHandle(proc_info.hStdErr); + proc_info.hStdErr = NULL; } +#endif +} + +bool IsProcessAlive(ProcessInfo& proc_info) { + if (proc_info.pid == PID_TERMINATED) + return false; #ifdef _WIN32 // Windows implementation @@ -257,21 +277,33 @@ bool IsProcessAlive(const ProcessInfo& proc_info) { } while (Process32Next(snapshot, &processEntry)); } + // pid not found in snapshot -> process has terminated. CloseHandle(snapshot); + SetProcessTerminated(proc_info); return false; #elif defined(__APPLE__) || defined(__linux__) // Unix-like systems (Linux and macOS) implementation + // NOTE: this approach only works if the process has been reaped. + // if the process has terminated but not reaped (exit status is still + // stored in the process table), kill(pid, 0) still returns 0. + // Try to send signal 0 to the process // This doesn't actually send a signal but checks if we can send signals to the process - int result = kill(proc_info.pid, 0); + // Process exists and we have permission to send it signals + // if (kill(proc_info.pid, 0) == 0) { + // return true; + // } - if (result == 0) { - return true; // Process exists and we have permission to send it signals - } + // // process exists but we don't have permission to send signal + // if (errno == EPERM) + // return true; - return errno != ESRCH; // ESRCH means "no such process" + if (waitpid(proc_info.pid, NULL, WNOHANG) == 0) + return true; + SetProcessTerminated(proc_info); + return false; #else #error "Unsupported platform" #endif @@ -281,59 +313,45 @@ bool WaitProcess(ProcessInfo& proc_info) { if (proc_info.pid == PID_TERMINATED) return true; + bool success; + #if defined(_WIN32) + // NOTE: OpenProcess() may fail if the process has terminated. HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, proc_info.pid); - bool success = WaitForSingleObject(hProcess, INFINITE) == WAIT_OBJECT_0; + success = WaitForSingleObject(hProcess, INFINITE) == WAIT_OBJECT_0; CloseHandle(hProcess); - if (success) { - proc_info.pid = PID_TERMINATED; - CloseHandle(proc_info.hJob); - proc_info.hJob = NULL; - } - return success; #elif defined(__APPLE__) || defined(__linux__) - bool success = waitpid(proc_info.pid, NULL, 0) == proc_info.pid; - if (success) { - proc_info.pid = PID_TERMINATED; - } - return success; + // NOTE: waitpid() may fail if the process has terminated and the OS + // has reaped it (i.e. clear its exit status). 
+ success = waitpid(proc_info.pid, NULL, 0) == proc_info.pid; #else #error "Unsupported platform" #endif + + if (success) + SetProcessTerminated(proc_info); + return success; } bool KillProcess(ProcessInfo& proc_info) { if (proc_info.pid == PID_TERMINATED) return true; + bool success; + #if defined(_WIN32) - bool success = TerminateJobObject(proc_info.hJob, 0) == 0; - // clean up resources - if (success) { - proc_info.pid = PID_TERMINATED; - CloseHandle(proc_info.hJob); - proc_info.hJob = NULL; - if (proc_info.hStdOut != NULL) { - CloseHandle(proc_info.hStdOut); - proc_info.hStdOut = NULL; - } - if (proc_info.hStdErr != NULL) { - CloseHandle(proc_info.hStdErr); - proc_info.hStdErr = NULL; - } - } - return success; + success = TerminateJobObject(proc_info.hJob, 0) == 0; #elif defined(__APPLE__) || defined(__linux__) // we send SIGTERM to subprocess. we trust that this subprocess will // propagate SIGTERM correctly to its children processes. - bool success = kill(proc_info.pid, SIGTERM) == 0; - if (success) { - proc_info.pid = PID_TERMINATED; - } - return success; + success = kill(proc_info.pid, SIGTERM) == 0; #else #error "Unsupported platform" #endif + + if (success) + SetProcessTerminated(proc_info); + return success; } } // namespace cortex::process diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index fcc93be90..19b821cef 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -37,7 +37,7 @@ std::vector ConvertToArgv(const std::vector& args); cpp::result SpawnProcess( const std::vector& command, const std::string& stdout_file = "", const std::string& stderr_file = ""); -bool IsProcessAlive(const ProcessInfo& proc_info); +bool IsProcessAlive(ProcessInfo& proc_info); bool WaitProcess(ProcessInfo& proc_info); bool KillProcess(ProcessInfo& proc_info); From 99a0035f0d8d171938d22a3faa474d3d870c1c08 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 3 Mar 2025 14:41:12 +0800 Subject: [PATCH 43/73] support engines uninstall --- .../extensions/python-engine/python_engine.cc | 23 +++++++------- .../extensions/python-engine/python_engine.h | 5 ++-- engine/services/engine_service.cc | 30 +++++++++++-------- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 5565da1a1..4f395c821 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -15,6 +15,10 @@ constexpr const int k409Conflict = 409; constexpr const int k500InternalServerError = 500; } // namespace +std::filesystem::path GetPythonEnginePath() { + return file_manager_utils::GetCortexDataPath() / "python_engine"; +} + cpp::result DownloadUv( std::shared_ptr& download_service) { const auto py_bin_path = @@ -100,25 +104,18 @@ cpp::result DownloadUv( return {}; } -std::string GetUvPath() { +std::filesystem::path GetUvPath() { auto system_info = system_info_utils::GetSystemInfo(); const auto bin_name = system_info->os == kWindowsOs ? "uv.exe" : "uv"; - const auto path = file_manager_utils::GetCortexDataPath() / "python_engine" / - "bin" / bin_name; - return path.string(); -} - -// use our own cache dir so that when users delete cortexcpp/, everything is deleted. 
-std::string GetUvCacheDir() { - const auto path = file_manager_utils::GetCortexDataPath() / "python_engine" / - "cache" / "uv"; - return path.string(); + return GetPythonEnginePath() / "bin" / bin_name; } std::vector BuildUvCommand(const std::string& action, const std::string& directory) { - std::vector command = {GetUvPath(), "--cache-dir", - GetUvCacheDir()}; + // use our own cache dir so that when users delete cortexcpp/, everything is deleted. + const auto cache_dir = GetPythonEnginePath() / "cache" / "uv"; + std::vector command = {GetUvPath().string(), "--cache-dir", + cache_dir.string()}; if (!directory.empty()) { command.push_back("--directory"); command.push_back(directory); diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index ec7e38d72..b7d207921 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -12,11 +12,12 @@ namespace python_engine { +std::filesystem::path GetPythonEnginePath(); + // UV-related functions cpp::result DownloadUv( std::shared_ptr& download_service); -std::string GetUvPath(); -std::string GetUvCacheDir(); +std::filesystem::path GetUvPath(); std::vector BuildUvCommand(const std::string& action, const std::string& directory = ""); bool IsUvInstalled(); diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index ac0c9eae9..70b031ccb 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -191,19 +191,25 @@ cpp::result EngineService::UninstallEngineVariant( } std::optional path_to_remove = std::nullopt; - if (version == std::nullopt && variant == std::nullopt) { - // if no version and variant provided, remove all engines variant of that engine - path_to_remove = file_manager_utils::GetEnginesContainerPath() / ne; - } else if (version != std::nullopt && variant != std::nullopt) { - // if both version and variant are provided, we only remove that variant - path_to_remove = file_manager_utils::GetEnginesContainerPath() / ne / - variant.value() / version.value(); - } else if (version == std::nullopt) { - // if only have variant, we remove all of that variant - path_to_remove = - file_manager_utils::GetEnginesContainerPath() / ne / variant.value(); + + // Python engine is stored in a separate folder + if (ne == kPythonEngine) { + path_to_remove = python_engine::GetPythonEnginePath(); } else { - return cpp::fail("No variant provided"); + if (version == std::nullopt && variant == std::nullopt) { + // if no version and variant provided, remove all engines variant of that engine + path_to_remove = file_manager_utils::GetEnginesContainerPath() / ne; + } else if (version != std::nullopt && variant != std::nullopt) { + // if both version and variant are provided, we only remove that variant + path_to_remove = file_manager_utils::GetEnginesContainerPath() / ne / + variant.value() / version.value(); + } else if (version == std::nullopt) { + // if only have variant, we remove all of that variant + path_to_remove = + file_manager_utils::GetEnginesContainerPath() / ne / variant.value(); + } else { + return cpp::fail("No variant provided"); + } } if (path_to_remove == std::nullopt) { From b96fd6957f26a86c9895f8678b403552130ec8d2 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 3 Mar 2025 15:06:25 +0800 Subject: [PATCH 44/73] follow reverse proxy example --- engine/controllers/server.cc | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git 
a/engine/controllers/server.cc b/engine/controllers/server.cc index ebc8639de..ea88f5882 100644 --- a/engine/controllers/server.cc +++ b/engine/controllers/server.cc @@ -240,24 +240,27 @@ void server::Python(const HttpRequestPtr& req, // route request. localhost might not work? const int port = port_result.value(); const std::string host = "http://127.0.0.1:" + std::to_string(port); - auto client = HttpClient::newHttpClient(host); + CTL_INF("Route request to " << host << path); - auto new_req = HttpRequest::newHttpRequest(); - new_req->setMethod(req->method()); - new_req->setPath(path); - new_req->setBody(std::string{req->body()}); - new_req->setContentTypeCode(req->getContentType()); + // https://github.com/drogonframework/drogon/blob/v1.9.10/examples/simple_reverse_proxy/plugins/SimpleReverseProxy.cc + auto client = HttpClient::newHttpClient( + host, trantor::EventLoop::getEventLoopOfCurrentThread()); - // including headers may make FastAPI reqject the request... - // for (const auto& [field, value] : req->headers()) { - // new_req->addHeader(field, value); - // } + // NOTE: modify request object inplace + req->setPassThrough(true); + req->setPath(path); - CTL_INF("Route request to " << host << path); - auto cb = [callback](ReqResult result, const HttpResponsePtr& response) { - callback(response); - }; - client->sendRequest(new_req, cb); + client->sendRequest(req, [callback = std::move(callback)]( + ReqResult result, const HttpResponsePtr& resp) { + if (result == ReqResult::Ok) { + resp->setPassThrough(true); + callback(resp); + } else { + auto errResp = HttpResponse::newHttpResponse(); + errResp->setStatusCode(k500InternalServerError); + callback(errResp); + } + }); } void server::LoadModel(const HttpRequestPtr& req, @@ -277,7 +280,7 @@ void server::ProcessStreamRes(std::function cb, auto err_or_done = std::make_shared(false); auto chunked_content_provider = [this, q, err_or_done, engine_type, model_id]( char* buf, - std::size_t buf_size) -> std::size_t { + std::size_t buf_size) -> std::size_t { if (buf == nullptr) { LOG_TRACE << "Buf is null"; if (!(*err_or_done)) { From e2e2cccf658041ceded90b8c5d0908863c98db5e Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 3 Mar 2025 21:25:54 +0800 Subject: [PATCH 45/73] update uv to 0.6.3 --- engine/extensions/python-engine/python_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index 4f395c821..d9b3ae485 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -26,7 +26,7 @@ cpp::result DownloadUv( std::filesystem::create_directories(py_bin_path); // NOTE: do we need a mechanism to update uv, or just pin uv version with cortex release? 
- const std::string uv_version = "0.6.2"; + const std::string uv_version = "0.6.3"; // build download url based on system info std::stringstream fname_stream; From 57c30d381a06dc1f17c36892e8d63af8c9bd458d Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 3 Mar 2025 21:26:10 +0800 Subject: [PATCH 46/73] support engines list --- engine/services/engine_service.cc | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 70b031ccb..076fd02c1 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -629,6 +629,23 @@ EngineService::GetInstalledEngineVariants(const std::string& engine) const { auto ne = NormalizeEngine(engine); auto os = hw_inf_.sys_inf->os; + if (ne == kPythonEngine) { + if (!python_engine::IsUvInstalled()) { + return {}; + } else { + // Python engine only means uv is installed. + // variant name and version don't quite make sense in this context. + // hence, they are left blank. + std::vector variants; + variants.push_back(EngineVariantResponse{ + .name = "", + .version = "", + .engine = kPythonEngine, + }); + return variants; + } + } + auto engines_variants_dir = file_manager_utils::GetEnginesContainerPath() / ne; From 49df6af1afc88df8398ab684795c0521507c32ce Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 17 Mar 2025 12:00:41 +0800 Subject: [PATCH 47/73] remove checks against supportedEngines --- engine/cli/command_line_parser.cc | 94 +++++++++---------------------- engine/cli/command_line_parser.h | 6 +- 2 files changed, 29 insertions(+), 71 deletions(-) diff --git a/engine/cli/command_line_parser.cc b/engine/cli/command_line_parser.cc index b423a6896..c2348caee 100644 --- a/engine/cli/command_line_parser.cc +++ b/engine/cli/command_line_parser.cc @@ -482,36 +482,49 @@ void CommandLineParser::SetupEngineCommands() { install_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines install [engine_name] [options]"); install_cmd->group(kSubcommands); + install_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. llama-cpp") + ->required(); + install_cmd->add_option("-v, --version", cml_data_.engine_version, + "Engine version to download"); + install_cmd->add_option("-s, --source", cml_data_.engine_src, + "Install engine by local path"); + install_cmd->add_flag("-m, --menu", cml_data_.show_menu, + "Display menu for engine variant selection"); + install_cmd->callback([this, install_cmd] { if (std::exchange(executed_, true)) return; - if (install_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(install_cmd->help()); + try { + commands::EngineInstallCmd( + engine_service_, cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.show_menu) + .Exec(cml_data_.engine_name, cml_data_.engine_version, + cml_data_.engine_src); + } catch (const std::exception& e) { + CTL_ERR(e.what()); } }); - for (const auto& engine : supported_engines_) { - EngineInstall(install_cmd, engine, cml_data_.engine_version, - cml_data_.engine_src); - } - auto uninstall_cmd = engines_cmd->add_subcommand("uninstall", "Uninstall engine"); uninstall_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines uninstall [engine_name] [options]"); + uninstall_cmd->group(kSubcommands); + uninstall_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. 
llama-cpp") + ->required(); uninstall_cmd->callback([this, uninstall_cmd] { if (std::exchange(executed_, true)) return; - if (uninstall_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(uninstall_cmd->help()); + try { + commands::EngineUninstallCmd().Exec( + cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.engine_name); + } catch (const std::exception& e) { + CTL_ERR(e.what()); } }); - uninstall_cmd->group(kSubcommands); - for (const auto& engine : supported_engines_) { - EngineUninstall(uninstall_cmd, engine); - } auto engine_upd_cmd = engines_cmd->add_subcommand("update", "Update engine"); engine_upd_cmd->usage("Usage:\n" + commands::GetCortexBinary() + @@ -726,57 +739,6 @@ void CommandLineParser::SetupSystemCommands() { }); } -void CommandLineParser::EngineInstall(CLI::App* parent, - const std::string& engine_name, - std::string& version, std::string& src) { - auto install_engine_cmd = parent->add_subcommand(engine_name, ""); - install_engine_cmd->usage("Usage:\n" + commands::GetCortexBinary() + - " engines install " + engine_name + " [options]"); - install_engine_cmd->group(kEngineGroup); - - install_engine_cmd->add_option("-v, --version", version, - "Engine version to download"); - - install_engine_cmd->add_option("-s, --source", src, - "Install engine by local path"); - - install_engine_cmd->add_flag("-m, --menu", cml_data_.show_menu, - "Display menu for engine variant selection"); - - install_engine_cmd->callback([this, engine_name, &version, &src] { - if (std::exchange(executed_, true)) - return; - try { - commands::EngineInstallCmd( - engine_service_, cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), cml_data_.show_menu) - .Exec(engine_name, version, src); - } catch (const std::exception& e) { - CTL_ERR(e.what()); - } - }); -} - -void CommandLineParser::EngineUninstall(CLI::App* parent, - const std::string& engine_name) { - auto uninstall_engine_cmd = parent->add_subcommand(engine_name, ""); - uninstall_engine_cmd->usage("Usage:\n" + commands::GetCortexBinary() + - " engines install " + engine_name + " [options]"); - uninstall_engine_cmd->group(kEngineGroup); - - uninstall_engine_cmd->callback([this, engine_name] { - if (std::exchange(executed_, true)) - return; - try { - commands::EngineUninstallCmd().Exec( - cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), engine_name); - } catch (const std::exception& e) { - CTL_ERR(e.what()); - } - }); -} - void CommandLineParser::EngineUpdate(CLI::App* parent, const std::string& engine_name) { auto engine_update_cmd = parent->add_subcommand(engine_name, ""); diff --git a/engine/cli/command_line_parser.h b/engine/cli/command_line_parser.h index 5b64f7f4d..0fce8cc9b 100644 --- a/engine/cli/command_line_parser.h +++ b/engine/cli/command_line_parser.h @@ -25,11 +25,6 @@ class CommandLineParser { void SetupConfigsCommands(); - void EngineInstall(CLI::App* parent, const std::string& engine_name, - std::string& version, std::string& src); - - void EngineUninstall(CLI::App* parent, const std::string& engine_name); - void EngineUpdate(CLI::App* parent, const std::string& engine_name); void EngineGet(CLI::App* parent); @@ -54,6 +49,7 @@ class CommandLineParser { std::string msg; std::string model_alias; std::string model_path; + std::string engine_name; std::string engine_version = "latest"; std::string engine_src; std::string cortex_version; From f1dcdde8c7764b7d2cb9e15640df7a0729c2adc6 Mon Sep 17 00:00:00 2001 From: Thien 
Tran Date: Mon, 17 Mar 2025 12:12:10 +0800 Subject: [PATCH 48/73] remove supportedEngines check for more commands --- engine/cli/command_line_parser.cc | 196 ++++++++---------------------- engine/cli/command_line_parser.h | 11 -- 2 files changed, 54 insertions(+), 153 deletions(-) diff --git a/engine/cli/command_line_parser.cc b/engine/cli/command_line_parser.cc index c2348caee..4afb48360 100644 --- a/engine/cli/command_line_parser.cc +++ b/engine/cli/command_line_parser.cc @@ -51,9 +51,7 @@ CommandLineParser::CommandLineParser() dylib_path_manager_{std::make_shared()}, db_service_{std::make_shared()}, engine_service_{std::make_shared( - download_service_, dylib_path_manager_, db_service_)} { - supported_engines_ = engine_service_->GetSupportedEngineNames().value(); -} + download_service_, dylib_path_manager_, db_service_)} {} bool CommandLineParser::SetupCommand(int argc, char** argv) { app_.usage("Usage:\n" + commands::GetCortexBinary() + @@ -529,70 +527,94 @@ void CommandLineParser::SetupEngineCommands() { auto engine_upd_cmd = engines_cmd->add_subcommand("update", "Update engine"); engine_upd_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines update [engine_name]"); + engine_upd_cmd->group(kSubcommands); + engine_upd_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. llama-cpp") + ->required(); engine_upd_cmd->callback([this, engine_upd_cmd] { if (std::exchange(executed_, true)) return; - if (engine_upd_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(engine_upd_cmd->help()); + try { + commands::EngineUpdateCmd().Exec( + cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.engine_name); + } catch (const std::exception& e) { + CTL_ERR(e.what()); } }); - engine_upd_cmd->group(kSubcommands); - for (const auto& engine : supported_engines_) { - EngineUpdate(engine_upd_cmd, engine); - } auto engine_use_cmd = engines_cmd->add_subcommand("use", "Set engine as default"); engine_use_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines use [engine_name]"); + engine_use_cmd->group(kSubcommands); + engine_use_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. llama-cpp") + ->required(); engine_use_cmd->callback([this, engine_use_cmd] { if (std::exchange(executed_, true)) return; - if (engine_use_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(engine_use_cmd->help()); + auto result = commands::EngineUseCmd().Exec( + cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.engine_name); + if (result.has_error()) { + CTL_ERR(result.error()); + } else { + CTL_INF("Engine " << cml_data_.engine_name << " is set as default"); } }); - engine_use_cmd->group(kSubcommands); - for (const auto& engine : supported_engines_) { - EngineUse(engine_use_cmd, engine); - } auto engine_load_cmd = engines_cmd->add_subcommand("load", "Load engine"); engine_load_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines load [engine_name]"); + engine_load_cmd->group(kSubcommands); + engine_load_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. 
llama-cpp") + ->required(); engine_load_cmd->callback([this, engine_load_cmd] { if (std::exchange(executed_, true)) return; - if (engine_load_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(engine_load_cmd->help()); + auto result = commands::EngineLoadCmd().Exec( + cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.engine_name); + if (result.has_error()) { + CTL_ERR(result.error()); } }); - engine_load_cmd->group(kSubcommands); - for (const auto& engine : supported_engines_) { - EngineLoad(engine_load_cmd, engine); - } auto engine_unload_cmd = engines_cmd->add_subcommand("unload", "Unload engine"); engine_unload_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines unload [engine_name]"); + engine_unload_cmd->group(kSubcommands); + engine_unload_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. llama-cpp") + ->required(); engine_unload_cmd->callback([this, engine_unload_cmd] { if (std::exchange(executed_, true)) return; - if (engine_unload_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(engine_unload_cmd->help()); + auto result = commands::EngineUnloadCmd().Exec( + cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.engine_name); + if (result.has_error()) { + CTL_ERR(result.error()); } }); - engine_unload_cmd->group(kSubcommands); - for (const auto& engine : supported_engines_) { - EngineUnload(engine_unload_cmd, engine); - } - EngineGet(engines_cmd); + auto engine_get_cmd = engines_cmd->add_subcommand("get", "Get engine info"); + engine_get_cmd->usage("Usage:\n" + commands::GetCortexBinary() + + " engines get [engine_name] [options]"); + engine_get_cmd->group(kSubcommands); + engine_get_cmd + ->add_option("name", cml_data_.engine_name, "Engine name e.g. 
llama-cpp") + ->required(); + engine_get_cmd->callback([this, engine_get_cmd] { + if (std::exchange(executed_, true)) + return; + commands::EngineGetCmd().Exec(cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), + cml_data_.engine_name); + }); } void CommandLineParser::SetupHardwareCommands() { @@ -739,116 +761,6 @@ void CommandLineParser::SetupSystemCommands() { }); } -void CommandLineParser::EngineUpdate(CLI::App* parent, - const std::string& engine_name) { - auto engine_update_cmd = parent->add_subcommand(engine_name, ""); - engine_update_cmd->usage("Usage:\n" + commands::GetCortexBinary() + - " engines update " + engine_name); - engine_update_cmd->group(kEngineGroup); - - engine_update_cmd->callback([this, engine_name] { - if (std::exchange(executed_, true)) - return; - try { - commands::EngineUpdateCmd().Exec( - cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), engine_name); - } catch (const std::exception& e) { - CTL_ERR(e.what()); - } - }); -} - -void CommandLineParser::EngineUnload(CLI::App* parent, - const std::string& engine_name) { - auto sub_cmd = parent->add_subcommand(engine_name, ""); - sub_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines unload " + - engine_name); - sub_cmd->group(kEngineGroup); - - sub_cmd->callback([this, engine_name] { - if (std::exchange(executed_, true)) - return; - auto result = commands::EngineUnloadCmd().Exec( - cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), engine_name); - if (result.has_error()) { - CTL_ERR(result.error()); - } - }); -} - -void CommandLineParser::EngineLoad(CLI::App* parent, - const std::string& engine_name) { - auto sub_cmd = parent->add_subcommand(engine_name, ""); - sub_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " engines load " + - engine_name); - sub_cmd->group(kEngineGroup); - - sub_cmd->callback([this, engine_name] { - if (std::exchange(executed_, true)) - return; - auto result = commands::EngineLoadCmd().Exec( - cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), engine_name); - if (result.has_error()) { - CTL_ERR(result.error()); - } - }); -} - -void CommandLineParser::EngineUse(CLI::App* parent, - const std::string& engine_name) { - auto engine_use_cmd = parent->add_subcommand(engine_name, ""); - engine_use_cmd->usage("Usage:\n" + commands::GetCortexBinary() + - " engines use " + engine_name); - engine_use_cmd->group(kEngineGroup); - - engine_use_cmd->callback([this, engine_name] { - if (std::exchange(executed_, true)) - return; - auto result = commands::EngineUseCmd().Exec( - cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), engine_name); - if (result.has_error()) { - CTL_ERR(result.error()); - } else { - CTL_INF("Engine " << engine_name << " is set as default"); - } - }); -} - -void CommandLineParser::EngineGet(CLI::App* parent) { - auto get_cmd = parent->add_subcommand("get", "Get engine info"); - get_cmd->usage("Usage:\n" + commands::GetCortexBinary() + - " engines get [engine_name] [options]"); - get_cmd->group(kSubcommands); - get_cmd->callback([this, get_cmd] { - if (std::exchange(executed_, true)) - return; - if (get_cmd->get_subcommands().empty()) { - CLI_LOG("[engine_name] is required\n"); - CLI_LOG(get_cmd->help()); - } - }); - - for (const auto& engine : supported_engines_) { - std::string desc = "Get " + engine + " status"; - - auto engine_get_cmd = get_cmd->add_subcommand(engine, desc); - engine_get_cmd->usage("Usage:\n" + commands::GetCortexBinary() 
+ - " engines get " + engine + " [options]"); - engine_get_cmd->group(kEngineGroup); - engine_get_cmd->callback([this, engine] { - if (std::exchange(executed_, true)) - return; - commands::EngineGetCmd().Exec(cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), - engine); - }); - } -} - void CommandLineParser::ModelUpdate(CLI::App* parent) { auto model_update_cmd = parent->add_subcommand("update", "Update model configurations"); diff --git a/engine/cli/command_line_parser.h b/engine/cli/command_line_parser.h index 0fce8cc9b..7a10db757 100644 --- a/engine/cli/command_line_parser.h +++ b/engine/cli/command_line_parser.h @@ -25,16 +25,6 @@ class CommandLineParser { void SetupConfigsCommands(); - void EngineUpdate(CLI::App* parent, const std::string& engine_name); - - void EngineGet(CLI::App* parent); - - void EngineUse(CLI::App* parent, const std::string& engine_name); - - void EngineLoad(CLI::App* parent, const std::string& engine_name); - - void EngineUnload(CLI::App* parent, const std::string& engine_name); - void ModelUpdate(CLI::App* parent); CLI::App app_; @@ -42,7 +32,6 @@ class CommandLineParser { std::shared_ptr dylib_path_manager_; std::shared_ptr db_service_; std::shared_ptr engine_service_; - std::vector supported_engines_; struct CmlData { std::string model_id; From 13652ca8ca77403074ff4c55488b548e6c4b55b5 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 17 Mar 2025 15:02:43 +0800 Subject: [PATCH 49/73] init vllm engine --- engine/CMakeLists.txt | 3 +- engine/cli/CMakeLists.txt | 5 +- engine/cli/commands/engine_install_cmd.cc | 2 +- .../extensions/python-engine/python_engine.cc | 447 ------------------ .../extensions/python-engine/python_engine.h | 60 --- .../extensions/python-engines/python_utils.cc | 151 ++++++ .../extensions/python-engines/python_utils.h | 33 ++ .../extensions/python-engines/vllm_engine.cc | 132 ++++++ .../extensions/python-engines/vllm_engine.h | 61 +++ engine/services/engine_service.cc | 73 ++- engine/services/model_service.cc | 8 - engine/utils/engine_constants.h | 1 + 12 files changed, 420 insertions(+), 556 deletions(-) delete mode 100644 engine/extensions/python-engine/python_engine.cc delete mode 100644 engine/extensions/python-engine/python_engine.h create mode 100644 engine/extensions/python-engines/python_utils.cc create mode 100644 engine/extensions/python-engines/python_utils.h create mode 100644 engine/extensions/python-engines/vllm_engine.cc create mode 100644 engine/extensions/python-engines/vllm_engine.h diff --git a/engine/CMakeLists.txt b/engine/CMakeLists.txt index 3f08f83e0..9694db8f3 100644 --- a/engine/CMakeLists.txt +++ b/engine/CMakeLists.txt @@ -172,7 +172,8 @@ add_executable(${TARGET_NAME} main.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/file_logger.cc ${CMAKE_CURRENT_SOURCE_DIR}/extensions/template_renderer.cc - ${CMAKE_CURRENT_SOURCE_DIR}/extensions/python-engine/python_engine.cc + ${CMAKE_CURRENT_SOURCE_DIR}/extensions/python-engines/python_utils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/extensions/python-engines/vllm_engine.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/dylib_path_manager.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/process/utils.cc diff --git a/engine/cli/CMakeLists.txt b/engine/cli/CMakeLists.txt index 0162c1f56..9dc2b4980 100644 --- a/engine/cli/CMakeLists.txt +++ b/engine/cli/CMakeLists.txt @@ -86,8 +86,9 @@ add_executable(${TARGET_NAME} main.cc ${CMAKE_CURRENT_SOURCE_DIR}/../services/hardware_service.cc ${CMAKE_CURRENT_SOURCE_DIR}/../services/database_service.cc 
${CMAKE_CURRENT_SOURCE_DIR}/../extensions/remote-engine/remote_engine.cc - - ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/python-engine/python_engine.cc + + ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/python-engines/python_utils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/python-engines/vllm_engine.cc ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/template_renderer.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/easywsclient.cc diff --git a/engine/cli/commands/engine_install_cmd.cc b/engine/cli/commands/engine_install_cmd.cc index d3fdf8b9b..3f72a1980 100644 --- a/engine/cli/commands/engine_install_cmd.cc +++ b/engine/cli/commands/engine_install_cmd.cc @@ -11,7 +11,7 @@ namespace commands { // NOTE: should have a single source of truth between CLI and server static bool NeedCudaDownload(const std::string& engine) { return !system_info_utils::GetDriverAndCudaVersion().second.empty() && - engine != kPythonEngine; + engine == kLlamaRepo; } bool EngineInstallCmd::Exec(const std::string& engine, diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc deleted file mode 100644 index d9b3ae485..000000000 --- a/engine/extensions/python-engine/python_engine.cc +++ /dev/null @@ -1,447 +0,0 @@ -#include "python_engine.h" -#include - -#include "config/model_config.h" -#include "utils/archive_utils.h" -#include "utils/file_manager_utils.h" -#include "utils/set_permission_utils.h" -#include "utils/system_info_utils.h" - -namespace python_engine { -namespace { -constexpr const int k200OK = 200; -constexpr const int k400BadRequest = 400; -constexpr const int k409Conflict = 409; -constexpr const int k500InternalServerError = 500; -} // namespace - -std::filesystem::path GetPythonEnginePath() { - return file_manager_utils::GetCortexDataPath() / "python_engine"; -} - -cpp::result DownloadUv( - std::shared_ptr& download_service) { - const auto py_bin_path = - file_manager_utils::GetCortexDataPath() / "python_engine" / "bin"; - std::filesystem::create_directories(py_bin_path); - - // NOTE: do we need a mechanism to update uv, or just pin uv version with cortex release? - const std::string uv_version = "0.6.3"; - - // build download url based on system info - std::stringstream fname_stream; - fname_stream << "uv-"; - - auto system_info = system_info_utils::GetSystemInfo(); - if (system_info->arch == "amd64") - fname_stream << "x86_64"; - else if (system_info->arch == "arm64") - fname_stream << "aarch64"; - - // NOTE: there is also a musl linux version - if (system_info->os == kMacOs) - fname_stream << "-apple-darwin.tar.gz"; - else if (system_info->os == kWindowsOs) - fname_stream << "-pc-windows-msvc.zip"; - else if (system_info->os == kLinuxOs) - fname_stream << "-unknown-linux-gnu.tar.gz"; - - const std::string fname = fname_stream.str(); - const std::string base_url = - "https://github.com/astral-sh/uv/releases/download/"; - - std::stringstream url_stream; - url_stream << base_url << uv_version << "/" << fname; - const std::string url = url_stream.str(); - CTL_INF("Download uv from " << url); - - auto on_finished = [py_bin_path, - uv_version](const DownloadTask& finishedTask) { - // try to unzip the downloaded file - const std::string download_path = finishedTask.items[0].localPath.string(); - - archive_utils::ExtractArchive(download_path, py_bin_path.string(), true); - set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); - std::filesystem::remove(download_path); - - // install Python3.10 from Astral. 
this will be preferred over system - // Python when possible. - // NOTE: currently this will install to a user-wide directory. we can - // install to a specific location using `--install-dir`, but later - // invocation of `uv run` needs to have `UV_PYTHON_INSTALL_DIR` set to use - // this Python installation. - // we can add this once we allow passing custom env var to SpawnProcess(). - // https://docs.astral.sh/uv/reference/cli/#uv-python-install - std::vector command = BuildUvCommand("python"); - command.push_back("install"); - command.push_back("3.10"); - - // NOTE: errors in download callback won't be propagated to caller - auto result = cortex::process::SpawnProcess(command); - if (result.has_error()) { - CTL_ERR(result.error()); - return; - } - - if (!cortex::process::WaitProcess(result.value())) { - CTL_ERR("Process spawned but fail to wait"); - return; - } - }; - - auto downloadTask = DownloadTask{.id = "python-uv", - .type = DownloadType::Engine, - .items = {DownloadItem{ - .id = "python-uv", - .downloadUrl = url, - .localPath = py_bin_path / fname, - }}}; - - auto add_task_result = download_service->AddTask(downloadTask, on_finished); - if (add_task_result.has_error()) { - return cpp::fail(add_task_result.error()); - } - return {}; -} - -std::filesystem::path GetUvPath() { - auto system_info = system_info_utils::GetSystemInfo(); - const auto bin_name = system_info->os == kWindowsOs ? "uv.exe" : "uv"; - return GetPythonEnginePath() / "bin" / bin_name; -} - -std::vector BuildUvCommand(const std::string& action, - const std::string& directory) { - // use our own cache dir so that when users delete cortexcpp/, everything is deleted. - const auto cache_dir = GetPythonEnginePath() / "cache" / "uv"; - std::vector command = {GetUvPath().string(), "--cache-dir", - cache_dir.string()}; - if (!directory.empty()) { - command.push_back("--directory"); - command.push_back(directory); - } - command.push_back(action); - return command; -} - -bool IsUvInstalled() { - return std::filesystem::exists(GetUvPath()); -} - -cpp::result UvDownloadDeps( - const std::filesystem::path& model_dir) { - if (!IsUvInstalled()) - return cpp::fail( - "uv is not installed. Please run `cortex engines install python`."); - - std::vector command = BuildUvCommand("sync", model_dir.string()); - - // script mode. 1st argument is path to .py script - if (!std::filesystem::exists(model_dir / "pyproject.toml")) { - config::PythonModelConfig py_cfg; - py_cfg.ReadFromYaml((model_dir / "model.yml").string()); - command.push_back("--script"); - command.push_back(py_cfg.entrypoint[0]); - } - - auto result = cortex::process::SpawnProcess(command); - if (result.has_error()) - return cpp::fail("Fail to install Python dependencies. " + result.error()); - - if (!cortex::process::WaitProcess(result.value())) { - return cpp::fail("Fail to install Python dependencies."); - } - - return {}; -} - -bool PythonEngine::PythonSubprocess::IsAlive() { - return cortex::process::IsProcessAlive(proc_info); -} -bool PythonEngine::PythonSubprocess::Kill() { - return cortex::process::KillProcess(proc_info); -} - -PythonEngine::PythonEngine() {} - -PythonEngine::~PythonEngine() { - // NOTE: what happens if we can't kill subprocess? 
- std::unique_lock write_lock(mutex); - for (auto& [model_name, py_proc] : model_process_map) { - if (py_proc.IsAlive()) - py_proc.Kill(); - } -} - -static std::pair CreateResponse( - const std::string& msg, int code) { - - Json::Value status, res; - const bool has_error = code != k200OK; - - status["is_done"] = true; - status["has_error"] = has_error; - status["is_stream"] = false; - status["status_code"] = code; - - if (has_error) { - CTL_ERR(msg); - res["error"] = msg; - } else { - res["status"] = msg; - } - - return {status, res}; -} - -void PythonEngine::LoadModel( - std::shared_ptr json_body, - std::function&& callback) { - - if (!json_body->isMember("model") || !json_body->isMember("model_dir")) { - auto [status, error] = CreateResponse( - "Missing required fields: model or model_dir", k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - - namespace fs = std::filesystem; - - const std::string model = (*json_body)["model"].asString(); - const fs::path model_dir = (*json_body)["model_dir"].asString(); - - { - std::unique_lock write_lock(mutex); - if (model_process_map.find(model) != model_process_map.end()) { - // check if model is still alive - if (model_process_map[model].IsAlive()) { - auto [status, error] = - CreateResponse("Model already loaded!", k409Conflict); - callback(std::move(status), std::move(error)); - return; - } else { - // if model has exited, try to load model again - CTL_WRN("Model " << model << " has exited unexpectedly"); - model_process_map.erase(model); - } - } - } - - pid_t pid; - try { - config::PythonModelConfig py_cfg; - py_cfg.ReadFromYaml((model_dir / "model.yml").string()); - - if (py_cfg.entrypoint.empty()) { - throw std::runtime_error("Missing entrypoint in model.yml"); - } - - // https://docs.astral.sh/uv/reference/cli/#uv-run - std::vector command = - BuildUvCommand("run", model_dir.string()); - for (const auto& item : py_cfg.entrypoint) - command.push_back(item); - - const std::string stdout_path = (model_dir / "stdout.txt").string(); - const std::string stderr_path = (model_dir / "stderr.txt").string(); - - // create empty stdout.txt and stderr.txt for redirection - if (!std::filesystem::exists(stdout_path)) - std::ofstream(stdout_path).flush(); - if (!std::filesystem::exists(stderr_path)) - std::ofstream(stderr_path).flush(); - - auto result = - cortex::process::SpawnProcess(command, stdout_path, stderr_path); - if (result.has_error()) { - throw std::runtime_error(result.error()); - } - - PythonSubprocess py_proc; - py_proc.proc_info = result.value(); - py_proc.port = py_cfg.port; - py_proc.start_time = std::chrono::system_clock::now().time_since_epoch() / - std::chrono::milliseconds(1); - - pid = py_proc.proc_info.pid; - - std::unique_lock write_lock(mutex); - model_process_map[model] = py_proc; - - } catch (const std::exception& e) { - auto e_msg = e.what(); - auto [status, error] = CreateResponse(e_msg, k500InternalServerError); - callback(std::move(status), std::move(error)); - return; - } - - auto [status, res] = CreateResponse( - "Model loaded successfully with pid: " + std::to_string(pid), k200OK); - callback(std::move(status), std::move(res)); -} - -void PythonEngine::UnloadModel( - std::shared_ptr json_body, - std::function&& callback) { - - if (!json_body->isMember("model")) { - auto [status, error] = - CreateResponse("Missing required field: model", k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - - const std::string model = (*json_body)["model"].asString(); - - // check if 
model has started - { - std::shared_lock read_lock(mutex); - if (model_process_map.find(model) == model_process_map.end()) { - const std::string msg = "Model " + model + " has not been loaded yet."; - auto [status, error] = CreateResponse(msg, k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - } - - // we know that model has started - { - std::unique_lock write_lock(mutex); - - // check if subprocess is still alive - // NOTE: is this step necessary? the subprocess could have terminated - // after .IsAlive() and before .Kill() later. - if (!model_process_map[model].IsAlive()) { - model_process_map.erase(model); - const std::string msg = "Model " + model + " stopped running."; - auto [status, error] = CreateResponse(msg, k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - - // subprocess is alive. we kill it here. - if (!model_process_map[model].Kill()) { - const std::string msg = "Unable to kill process of model " + model; - auto [status, error] = CreateResponse(msg, k500InternalServerError); - callback(std::move(status), std::move(error)); - return; - } - - model_process_map.erase(model); - } - - auto [status, res] = CreateResponse("Unload model successfully", k200OK); - callback(std::move(status), std::move(res)); -} - -void PythonEngine::GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) { - - if (!json_body->isMember("model")) { - auto [status, error] = - CreateResponse("Missing required field: model", k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - - const std::string model = (*json_body)["model"].asString(); - Json::Value res, status; - - // check if model has started - { - std::shared_lock read_lock(mutex); - if (model_process_map.find(model) == model_process_map.end()) { - const std::string msg = "Model " + model + " has not been loaded yet."; - auto [status, error] = CreateResponse(msg, k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - } - - // we know that model has started - { - std::unique_lock write_lock(mutex); - - // check if subprocess is still alive - if (!model_process_map[model].IsAlive()) { - CTL_WRN("Model " << model << " has exited unexpectedly."); - model_process_map.erase(model); - const std::string msg = "Model " + model + " stopped running."; - auto [status, error] = CreateResponse(msg, k400BadRequest); - callback(std::move(status), std::move(error)); - return; - } - } - - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - callback(std::move(status), std::move(res)); -} - -void PythonEngine::GetModels( - std::shared_ptr jsonBody, - std::function&& callback) { - - Json::Value res, model_list(Json::arrayValue), status; - { - std::unique_lock write_lock(mutex); - for (auto& [model_name, py_proc] : model_process_map) { - if (!py_proc.IsAlive()) { - CTL_WRN("Model " << model_name << " has exited unexpectedly."); - model_process_map.erase(model_name); - continue; - } - - Json::Value val; - val["id"] = model_name; - val["engine"] = kPythonEngine; - val["start_time"] = py_proc.start_time; - val["port"] = py_proc.port; - val["object"] = "model"; - // TODO - // val["ram"]; - // val["vram"]; - model_list.append(val); - } - } - - res["object"] = "list"; - res["data"] = model_list; - - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - - callback(std::move(status), std::move(res)); -} - 
-cpp::result PythonEngine::GetPort(const std::string& model) { - int port; - - // check if model has started - { - std::shared_lock read_lock(mutex); - if (model_process_map.find(model) == model_process_map.end()) { - return cpp::fail("Model " + model + " has not been loaded yet."); - } - port = model_process_map[model].port; - } - - // check if subprocess is still alive - { - std::unique_lock write_lock(mutex); - if (!model_process_map[model].IsAlive()) { - CTL_WRN("Model " << model << " has exited unexpectedly."); - model_process_map.erase(model); - return cpp::fail("Model " + model + " stopped running."); - } - } - - return port; -} - -} // namespace python_engine diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h deleted file mode 100644 index b7d207921..000000000 --- a/engine/extensions/python-engine/python_engine.h +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -#include "cortex-common/python_enginei.h" -#include "services/download_service.h" -#include "utils/process/utils.h" - -namespace python_engine { - -std::filesystem::path GetPythonEnginePath(); - -// UV-related functions -cpp::result DownloadUv( - std::shared_ptr& download_service); -std::filesystem::path GetUvPath(); -std::vector BuildUvCommand(const std::string& action, - const std::string& directory = ""); -bool IsUvInstalled(); -cpp::result UvDownloadDeps( - const std::filesystem::path& yaml_path); - -class PythonEngine : public PythonEngineI { - private: - struct PythonSubprocess { - cortex::process::ProcessInfo proc_info; - int port; - uint64_t start_time; - - bool IsAlive(); - bool Kill(); - }; - - mutable std::shared_mutex mutex; - std::unordered_map model_process_map; - - public: - PythonEngine(); - ~PythonEngine(); - - void LoadModel( - std::shared_ptr json_body, - std::function&& callback) override; - void UnloadModel( - std::shared_ptr json_body, - std::function&& callback) override; - void GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) override; - void GetModels( - std::shared_ptr jsonBody, - std::function&& callback) override; - - cpp::result GetPort(const std::string& model) override; -}; -} // namespace python_engine diff --git a/engine/extensions/python-engines/python_utils.cc b/engine/extensions/python-engines/python_utils.cc new file mode 100644 index 000000000..5255fcd0e --- /dev/null +++ b/engine/extensions/python-engines/python_utils.cc @@ -0,0 +1,151 @@ +#include "python_utils.h" +#include + +#include "utils/archive_utils.h" +#include "utils/file_manager_utils.h" +#include "utils/set_permission_utils.h" +#include "utils/system_info_utils.h" + +namespace python_utils { + +std::filesystem::path GetPythonEnginesPath() { + return file_manager_utils::GetCortexDataPath() / "python_engines"; +} +std::filesystem::path GetEnvsPath() { + return GetPythonEnginesPath() / "envs"; +} +std::filesystem::path GetUvPath() { + auto system_info = system_info_utils::GetSystemInfo(); + const auto bin_name = system_info->os == kWindowsOs ? "uv.exe" : "uv"; + return GetPythonEnginesPath() / "bin" / bin_name; +} + +bool IsUvInstalled() { + return std::filesystem::exists(GetUvPath()); +} +cpp::result InstallUv( + std::shared_ptr& download_service) { + const auto py_bin_path = GetPythonEnginesPath() / "bin"; + std::filesystem::create_directories(py_bin_path); + + // NOTE: do we need a mechanism to update uv, or just pin uv version with cortex release? 
+ const std::string uv_version = "0.6.3"; + + // build download url based on system info + std::stringstream fname_stream; + fname_stream << "uv-"; + + auto system_info = system_info_utils::GetSystemInfo(); + if (system_info->arch == "amd64") + fname_stream << "x86_64"; + else if (system_info->arch == "arm64") + fname_stream << "aarch64"; + + // NOTE: there is also a musl linux version + if (system_info->os == kMacOs) + fname_stream << "-apple-darwin.tar.gz"; + else if (system_info->os == kWindowsOs) + fname_stream << "-pc-windows-msvc.zip"; + else if (system_info->os == kLinuxOs) + fname_stream << "-unknown-linux-gnu.tar.gz"; + + const std::string fname = fname_stream.str(); + const std::string base_url = + "https://github.com/astral-sh/uv/releases/download/"; + + std::stringstream url_stream; + url_stream << base_url << uv_version << "/" << fname; + const std::string url = url_stream.str(); + CTL_INF("Download uv from " << url); + + auto on_finished = [py_bin_path, + uv_version](const DownloadTask& finishedTask) { + // try to unzip the downloaded file + const std::string download_path = finishedTask.items[0].localPath.string(); + + archive_utils::ExtractArchive(download_path, py_bin_path.string(), true); + set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); + std::filesystem::remove(download_path); + + // install Python3.10 from Astral. this will be preferred over system + // Python when possible. + // NOTE: currently this will install to a user-wide directory. we can + // install to a specific location using `--install-dir`, but later + // invocation of `uv run` needs to have `UV_PYTHON_INSTALL_DIR` set to use + // this Python installation. + // we can add this once we allow passing custom env var to SpawnProcess(). + // https://docs.astral.sh/uv/reference/cli/#uv-python-install + std::vector command = BuildUvCommand("python"); + command.push_back("install"); + command.push_back("3.10"); + + // NOTE: errors in download callback won't be propagated to caller + auto result = cortex::process::SpawnProcess(command); + if (result.has_error()) { + CTL_ERR(result.error()); + return; + } + + if (!cortex::process::WaitProcess(result.value())) { + CTL_ERR("Process spawned but fail to wait"); + return; + } + }; + + auto downloadTask = DownloadTask{.id = "python-uv", + .type = DownloadType::Engine, + .items = {DownloadItem{ + .id = "python-uv", + .downloadUrl = url, + .localPath = py_bin_path / fname, + }}}; + + auto add_task_result = download_service->AddTask(downloadTask, on_finished); + if (add_task_result.has_error()) { + return cpp::fail(add_task_result.error()); + } + return {}; +} + +std::vector BuildUvCommand(const std::string& action, + const std::string& directory) { + // use our own cache dir so that when users delete cortexcpp/, everything is deleted. + const auto cache_dir = GetPythonEnginesPath() / "cache" / "uv"; + std::vector command = {GetUvPath().string(), "--cache-dir", + cache_dir.string()}; + if (!directory.empty()) { + command.push_back("--directory"); + command.push_back(directory); + } + command.push_back(action); + return command; +} + +// cpp::result UvDownloadDeps( +// const std::filesystem::path& model_dir) { +// if (!IsUvInstalled()) +// return cpp::fail( +// "uv is not installed. Please run `cortex engines install python`."); + +// std::vector command = BuildUvCommand("sync", model_dir.string()); + +// // script mode. 
1st argument is path to .py script +// if (!std::filesystem::exists(model_dir / "pyproject.toml")) { +// config::PythonModelConfig py_cfg; +// py_cfg.ReadFromYaml((model_dir / "model.yml").string()); +// command.push_back("--script"); +// command.push_back(py_cfg.entrypoint[0]); +// } + +// auto result = cortex::process::SpawnProcess(command); +// if (result.has_error()) +// return cpp::fail("Fail to install Python dependencies. " + result.error()); + +// if (!cortex::process::WaitProcess(result.value())) { +// return cpp::fail("Fail to install Python dependencies."); +// } + +// return {}; +// } + +} // namespace python_utils diff --git a/engine/extensions/python-engines/python_utils.h b/engine/extensions/python-engines/python_utils.h new file mode 100644 index 000000000..31b0ca0ad --- /dev/null +++ b/engine/extensions/python-engines/python_utils.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include + +#include "services/download_service.h" +#include "utils/process/utils.h" + +namespace python_utils { + +// paths +std::filesystem::path GetPythonEnginesPath(); +std::filesystem::path GetEnvsPath(); +std::filesystem::path GetUvPath(); + +// UV-related functions +bool IsUvInstalled(); +cpp::result InstallUv( + std::shared_ptr& download_service); +std::vector BuildUvCommand(const std::string& action, + const std::string& directory = ""); +// cpp::result UvDownloadDeps( +// const std::filesystem::path& yaml_path); + +struct PythonSubprocess { + cortex::process::ProcessInfo proc_info; + int port; + uint64_t start_time; + + bool IsAlive() { return cortex::process::IsProcessAlive(proc_info); } + bool Kill() { return cortex::process::KillProcess(proc_info); } +}; +} // namespace python_utils diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc new file mode 100644 index 000000000..f2c1d6c26 --- /dev/null +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -0,0 +1,132 @@ +#include "vllm_engine.h" +#include "utils/curl_utils.h" +#include "utils/logging_utils.h" + +namespace { +cpp::result GetLatestVllmVersion() { + auto result = curl_utils::SimpleGetJson("https://pypi.org/pypi/vllm/json"); + if (result.has_error()) + return result.error(); + + auto version_value = result.value()["info"]["version"]; + if (version_value.isNull()) + return cpp::fail("Can't find version in the response"); + + return version_value.asString(); +} +} // namespace + +VllmEngine::~VllmEngine() { + // NOTE: what happens if we can't kill subprocess? 
+ std::unique_lock write_lock(mutex); + for (auto& [model_name, py_proc] : model_process_map) { + if (py_proc.IsAlive()) + py_proc.Kill(); + } +} + +cpp::result VllmEngine::Download( + std::shared_ptr& download_service, + const std::string& version, const std::optional variant_name) { + if (variant_name.has_value()) { + return cpp::fail("variant_name must be empty"); + } + + if (!python_utils::IsUvInstalled()) { + auto result = python_utils::InstallUv(download_service); + if (result.has_error()) + return result; + } + + std::string concrete_version = version; + if (version == "latest") { + auto result = GetLatestVllmVersion(); + if (result.has_error()) + return cpp::fail(result.error()); + + concrete_version = result.value(); + } + CTL_INF("Download vLLM " << concrete_version); + + const auto vllm_path = + python_utils::GetEnvsPath() / "vllm" / concrete_version; + std::filesystem::create_directories(vllm_path); + const auto vllm_path_str = vllm_path.string(); + + { + // initialize venv + std::vector cmd = + python_utils::BuildUvCommand("venv", vllm_path_str); + auto result = cortex::process::SpawnProcess(cmd); + if (result.has_error()) + return cpp::fail(result.error()); + + // TODO: check return code + // NOTE: these are not async + cortex::process::WaitProcess(result.value()); + } + { + // install vLLM + std::vector cmd = + python_utils::BuildUvCommand("pip", vllm_path_str); + cmd.push_back("install"); + cmd.push_back("vllm==" + concrete_version); + auto result = cortex::process::SpawnProcess(cmd); + if (result.has_error()) + return cpp::fail(result.error()); + + // TODO: check return code + // NOTE: these are not async + cortex::process::WaitProcess(result.value()); + } + + return {}; +} + +void VllmEngine::Load(EngineLoadOption opts) {}; +void VllmEngine::Unload(EngineUnloadOption opts) {}; + +// cortex.llamacpp interface +void VllmEngine::HandleChatCompletion( + std::shared_ptr json_body, + std::function&& callback) {}; +void VllmEngine::HandleEmbedding( + std::shared_ptr json_body, + std::function&& callback) {}; +void VllmEngine::LoadModel( + std::shared_ptr json_body, + std::function&& callback) {}; +void VllmEngine::UnloadModel( + std::shared_ptr json_body, + std::function&& callback) {}; +void VllmEngine::GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) {}; + +// For backward compatible checking +bool VllmEngine::IsSupported(const std::string& f) { + return true; +}; + +// Get list of running models +void VllmEngine::GetModels( + std::shared_ptr jsonBody, + std::function&& callback) {}; + +bool VllmEngine::SetFileLogger(int max_log_lines, const std::string& log_path) { + return true; +}; +void VllmEngine::SetLogLevel(trantor::Logger::LogLevel logLevel) {}; + +// Stop inflight chat completion in stream mode +void VllmEngine::StopInferencing(const std::string& model_id) {}; + +Json::Value VllmEngine::GetRemoteModels() { + return Json::Value{}; +}; +void VllmEngine::HandleRouteRequest( + std::shared_ptr json_body, + std::function&& callback) {}; +void VllmEngine::HandleInference( + std::shared_ptr json_body, + std::function&& callback) {}; diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h new file mode 100644 index 000000000..261da1025 --- /dev/null +++ b/engine/extensions/python-engines/vllm_engine.h @@ -0,0 +1,61 @@ +#include "cortex-common/EngineI.h" +#include "python_utils.h" + +class VllmEngine : public EngineI { + private: + mutable std::shared_mutex mutex; + std::unordered_map + 
model_process_map; + + public: + VllmEngine() {}; + ~VllmEngine(); + + static cpp::result Download( + std::shared_ptr& download_service, + const std::string& version, + const std::optional variant_name); + + virtual void Load(EngineLoadOption opts) override; + virtual void Unload(EngineUnloadOption opts) override; + + // cortex.llamacpp interface + virtual void HandleChatCompletion( + std::shared_ptr json_body, + std::function&& callback) override; + virtual void HandleEmbedding( + std::shared_ptr json_body, + std::function&& callback) override; + virtual void LoadModel( + std::shared_ptr json_body, + std::function&& callback) override; + virtual void UnloadModel( + std::shared_ptr json_body, + std::function&& callback) override; + virtual void GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) override; + + // For backward compatible checking + virtual bool IsSupported(const std::string& f) override; + + // Get list of running models + virtual void GetModels( + std::shared_ptr jsonBody, + std::function&& callback) override; + + virtual bool SetFileLogger(int max_log_lines, + const std::string& log_path) override; + virtual void SetLogLevel(trantor::Logger::LogLevel logLevel) override; + + // Stop inflight chat completion in stream mode + virtual void StopInferencing(const std::string& model_id) override; + + virtual Json::Value GetRemoteModels() override; + virtual void HandleRouteRequest( + std::shared_ptr json_body, + std::function&& callback) override; + virtual void HandleInference( + std::shared_ptr json_body, + std::function&& callback) override; +}; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index db12ea623..9056b6d5a 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -9,7 +9,7 @@ #include "config/model_config.h" #include "database/engines.h" #include "database/models.h" -#include "extensions/python-engine/python_engine.h" +#include "extensions/python-engines/vllm_engine.h" #include "extensions/remote-engine/remote_engine.h" #include "utils/archive_utils.h" @@ -18,11 +18,11 @@ #include "utils/file_manager_utils.h" #include "utils/github_release_utils.h" #include "utils/logging_utils.h" +#include "utils/normalize_engine.h" #include "utils/result.hpp" #include "utils/semantic_version_utils.h" #include "utils/system_info_utils.h" #include "utils/url_parser.h" -#include "utils/normalize_engine.h" namespace { std::string GetSuitableCudaVersion(const std::string& engine, @@ -187,7 +187,7 @@ cpp::result EngineService::UninstallEngineVariant( // Python engine is stored in a separate folder if (ne == kPythonEngine) { - path_to_remove = python_engine::GetPythonEnginePath(); + return cpp::fail("Not implemented"); } else { if (version == std::nullopt && variant == std::nullopt) { // if no version and variant provided, remove all engines variant of that engine @@ -228,9 +228,8 @@ cpp::result EngineService::DownloadEngine( if (engine == kLlamaRepo) { return DownloadLlamaCpp(version, variant_name); - } else if (engine == kPythonEngine) { - // ignore version and variant_name - return python_engine::DownloadUv(download_service_); + } else if (engine == kVllmEngine) { + return VllmEngine::Download(download_service_, version, variant_name); } return cpp::fail("Unknown engine " + engine); } @@ -376,8 +375,8 @@ cpp::result EngineService::DownloadLlamaCpp( cpp::result EngineService::DownloadCuda( const std::string& engine, bool async) { - if (hw_inf_.sys_inf->os == "mac" || engine == kPythonEngine) { - 
// mac and Python engine do not require cuda toolkit + if (hw_inf_.sys_inf->os == "mac" || engine != kLlamaRepo) { + // mac and non-llama.cpp engine do not require cuda toolkit return true; } @@ -622,22 +621,22 @@ EngineService::GetInstalledEngineVariants(const std::string& engine) const { auto ne = cortex::engine::NormalizeEngine(engine); auto os = hw_inf_.sys_inf->os; - if (ne == kPythonEngine) { - if (!python_engine::IsUvInstalled()) { - return {}; - } else { - // Python engine only means uv is installed. - // variant name and version don't quite make sense in this context. - // hence, they are left blank. - std::vector variants; - variants.push_back(EngineVariantResponse{ - .name = "", - .version = "", - .engine = kPythonEngine, - }); - return variants; - } - } + // if (ne == kPythonEngine) { + // if (!python_engine::IsUvInstalled()) { + // return {}; + // } else { + // // Python engine only means uv is installed. + // // variant name and version don't quite make sense in this context. + // // hence, they are left blank. + // std::vector variants; + // variants.push_back(EngineVariantResponse{ + // .name = "", + // .version = "", + // .engine = kPythonEngine, + // }); + // return variants; + // } + // } auto engines_variants_dir = file_manager_utils::GetEnginesContainerPath() / ne; @@ -705,11 +704,11 @@ cpp::result EngineService::LoadEngine( // Check for python engine - if (engine_name == kPythonEngine) { - engines_[engine_name].engine = new python_engine::PythonEngine(); - CTL_INF("Loaded engine: " << engine_name); - return {}; - } + // if (engine_name == kPythonEngine) { + // engines_[engine_name].engine = new python_engine::PythonEngine(); + // CTL_INF("Loaded engine: " << engine_name); + // return {}; + // } // Check for remote engine if (IsRemoteEngine(engine_name)) { @@ -943,14 +942,14 @@ cpp::result EngineService::IsEngineReady( } // Check for python engine - if (engine == kPythonEngine) { - if (!python_engine::IsUvInstalled()) { - return cpp::fail( - "Python engine is not ready. Please run `cortex engines install " - "python`"); - } - return true; - } + // if (engine == kPythonEngine) { + // if (!python_engine::IsUvInstalled()) { + // return cpp::fail( + // "Python engine is not ready. 
Please run `cortex engines install " + // "python`"); + // } + // return true; + // } auto os = hw_inf_.sys_inf->os; diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 42a08f3b8..0b84f2b0c 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -14,7 +14,6 @@ #include "services/inference_service.h" -#include "extensions/python-engine/python_engine.h" #include "utils/cli_selection_utils.h" #include "utils/engine_constants.h" #include "utils/file_manager_utils.h" @@ -544,13 +543,6 @@ ModelService::DownloadModelFromCortexsoAsync( yaml_handler.UpdateModelConfig(mc); yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); - } else if (mc.engine == kPythonEngine) { - const auto model_dir = model_yml_item->localPath.parent_path(); - auto result = python_engine::UvDownloadDeps(model_dir); - if (result.has_error()) { - CTL_ERR(result.error()); - return; - } } auto rel = diff --git a/engine/utils/engine_constants.h b/engine/utils/engine_constants.h index 8eeaa1946..3e1686e2f 100644 --- a/engine/utils/engine_constants.h +++ b/engine/utils/engine_constants.h @@ -2,6 +2,7 @@ constexpr const auto kLlamaEngine = "llama-cpp"; constexpr const auto kPythonEngine = "python-engine"; +constexpr const auto kVllmEngine = "vllm"; constexpr const auto kOpenAiEngine = "openai"; constexpr const auto kAnthropicEngine = "anthropic"; From 4d13014eb0921cedc9e4f8c53abc01c757157b49 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Mon, 17 Mar 2025 15:26:45 +0800 Subject: [PATCH 50/73] fix issues with progress streaming --- engine/cli/commands/engine_install_cmd.cc | 28 ++++++++++++------- .../extensions/python-engines/vllm_engine.cc | 13 +++++++-- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/engine/cli/commands/engine_install_cmd.cc b/engine/cli/commands/engine_install_cmd.cc index 3f72a1980..bb1c7cec7 100644 --- a/engine/cli/commands/engine_install_cmd.cc +++ b/engine/cli/commands/engine_install_cmd.cc @@ -44,12 +44,16 @@ bool EngineInstallCmd::Exec(const std::string& engine, dp.Connect(host_, port_); bool need_cuda_download = NeedCudaDownload(engine); // engine can be small, so need to start ws first - auto dp_res = std::async(std::launch::deferred, [&dp, need_cuda_download] { - if (need_cuda_download) { + auto dp_res = std::async(std::launch::deferred, [&dp, need_cuda_download, engine] { + // if (need_cuda_download) { + // return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit}); + // } else { + // return dp.Handle({DownloadType::Engine}); + // } + if (engine == kLlamaRepo) return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit}); - } else { - return dp.Handle({DownloadType::Engine}); - } + else + return dp.Handle({}); }); auto releases_url = url_parser::Url{ @@ -156,12 +160,16 @@ bool EngineInstallCmd::Exec(const std::string& engine, dp.Connect(host_, port_); bool need_cuda_download = NeedCudaDownload(engine); // engine can be small, so need to start ws first - auto dp_res = std::async(std::launch::deferred, [&dp, need_cuda_download] { - if (need_cuda_download) { + auto dp_res = std::async(std::launch::deferred, [&dp, need_cuda_download, engine] { + // if (need_cuda_download) { + // return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit}); + // } else { + // return dp.Handle({DownloadType::Engine}); + // } + if (engine == kLlamaRepo) return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit}); - } else { - return dp.Handle({DownloadType::Engine}); - } + else + return dp.Handle({}); }); 
auto install_url = url_parser::Url{ diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index f2c1d6c26..b549f6c78 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -53,8 +53,8 @@ cpp::result VllmEngine::Download( std::filesystem::create_directories(vllm_path); const auto vllm_path_str = vllm_path.string(); - { - // initialize venv + // initialize venv + if (!std::filesystem::exists(vllm_path / ".venv")) { std::vector cmd = python_utils::BuildUvCommand("venv", vllm_path_str); auto result = cortex::process::SpawnProcess(cmd); @@ -65,8 +65,9 @@ cpp::result VllmEngine::Download( // NOTE: these are not async cortex::process::WaitProcess(result.value()); } + + // install vLLM { - // install vLLM std::vector cmd = python_utils::BuildUvCommand("pip", vllm_path_str); cmd.push_back("install"); @@ -90,15 +91,19 @@ void VllmEngine::Unload(EngineUnloadOption opts) {}; void VllmEngine::HandleChatCompletion( std::shared_ptr json_body, std::function&& callback) {}; + void VllmEngine::HandleEmbedding( std::shared_ptr json_body, std::function&& callback) {}; + void VllmEngine::LoadModel( std::shared_ptr json_body, std::function&& callback) {}; + void VllmEngine::UnloadModel( std::shared_ptr json_body, std::function&& callback) {}; + void VllmEngine::GetModelStatus( std::shared_ptr json_body, std::function&& callback) {}; @@ -124,9 +129,11 @@ void VllmEngine::StopInferencing(const std::string& model_id) {}; Json::Value VllmEngine::GetRemoteModels() { return Json::Value{}; }; + void VllmEngine::HandleRouteRequest( std::shared_ptr json_body, std::function&& callback) {}; + void VllmEngine::HandleInference( std::shared_ptr json_body, std::function&& callback) {}; From 591d4611a0884c2e5ba1226da941f475d2d82995 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 18 Mar 2025 13:06:41 +0800 Subject: [PATCH 51/73] support download HF model --- engine/cli/commands/model_pull_cmd.cc | 9 +- engine/controllers/models.cc | 39 +----- engine/services/model_service.cc | 178 ++++++++++++++++++++------ engine/services/model_service.h | 25 ++-- engine/utils/huggingface_utils.h | 6 + 5 files changed, 173 insertions(+), 84 deletions(-) diff --git a/engine/cli/commands/model_pull_cmd.cc b/engine/cli/commands/model_pull_cmd.cc index 75c0ce1a0..edd11b399 100644 --- a/engine/cli/commands/model_pull_cmd.cc +++ b/engine/cli/commands/model_pull_cmd.cc @@ -65,9 +65,14 @@ std::optional ModelPullCmd::Exec(const std::string& host, int port, } auto download_url = res.value()["downloadUrl"].asString(); + // TODO: when will these 2 be empty? 
if (downloaded.empty() && avails.empty()) { - model_id = id; - model = download_url; + if (res.value()["modelSource"].asString() == "huggingface") { + model = "hf:" + id; + } else { + model_id = id; + model = download_url; + } } else { if (is_cortexso) { auto selection = cli_selection_utils::PrintModelSelection( diff --git a/engine/controllers/models.cc b/engine/controllers/models.cc index a4a218143..0c1041abc 100644 --- a/engine/controllers/models.cc +++ b/engine/controllers/models.cc @@ -27,7 +27,7 @@ void Models::PullModel(const HttpRequestPtr& req, return; } - auto model_handle = (*(req->getJsonObject())).get("model", "").asString(); + auto model_handle = req->getJsonObject()->get("model", "").asString(); if (model_handle.empty()) { Json::Value ret; ret["result"] = "Bad Request"; @@ -38,48 +38,19 @@ void Models::PullModel(const HttpRequestPtr& req, } std::optional desired_model_id = std::nullopt; - auto id = (*(req->getJsonObject())).get("id", "").asString(); + auto id = req->getJsonObject()->get("id", "").asString(); if (!id.empty()) { desired_model_id = id; } std::optional desired_model_name = std::nullopt; - auto name_value = (*(req->getJsonObject())).get("name", "").asString(); - + auto name_value = req->getJsonObject()->get("name", "").asString(); if (!name_value.empty()) { desired_model_name = name_value; } - auto handle_model_input = - [&, model_handle]() -> cpp::result { - CTL_INF("Handle model input, model handle: " + model_handle); - if (string_utils::StartsWith(model_handle, "https")) { - return model_service_->HandleDownloadUrlAsync( - model_handle, desired_model_id, desired_model_name); - } else if (model_handle.find(":") != std::string::npos) { - auto model_and_branch = string_utils::SplitBy(model_handle, ":"); - if (model_and_branch.size() == 3) { - auto mh = url_parser::Url{ - .protocol = "https", - .host = kHuggingFaceHost, - .pathParams = { - model_and_branch[0], - model_and_branch[1], - "resolve", - "main", - model_and_branch[2], - }}.ToFullPath(); - return model_service_->HandleDownloadUrlAsync(mh, desired_model_id, - desired_model_name); - } - return model_service_->DownloadModelFromCortexsoAsync( - model_and_branch[0], model_and_branch[1], desired_model_id); - } - - return cpp::fail("Invalid model handle or not supported!"); - }; - - auto result = handle_model_input(); + auto result = model_service_->PullModel(model_handle, desired_model_id, + desired_model_name); if (result.has_error()) { Json::Value ret; ret["message"] = result.error(); diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index e4fd44352..66f59ed7b 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -97,12 +97,14 @@ void ParseGguf(DatabaseService& db_service, } } -cpp::result GetDownloadTask( - const std::string& modelId, const std::string& branch = "main") { +cpp::result GetCloneRepoDownloadTask( + const std::string& author_id, const std::string& modelId, + const std::string& branch, const std::string& save_dir, + const std::string& task_id) { url_parser::Url url = { .protocol = "https", .host = kHuggingFaceHost, - .pathParams = {"api", "models", "cortexso", modelId, "tree", branch}, + .pathParams = {"api", "models", author_id, modelId, "tree", branch}, }; auto result = curl_utils::SimpleGetJsonRecursive(url.ToFullPath()); @@ -112,7 +114,7 @@ cpp::result GetDownloadTask( std::vector download_items{}; auto model_container_path = file_manager_utils::GetModelsContainerPath() / - "cortex.so" / modelId / branch; + save_dir / modelId / 
branch; file_manager_utils::CreateDirectoryRecursively(model_container_path.string()); for (const auto& value : result.value()) { @@ -125,7 +127,7 @@ cpp::result GetDownloadTask( url_parser::Url download_url = { .protocol = "https", .host = kHuggingFaceHost, - .pathParams = {"cortexso", modelId, "resolve", branch, path}}; + .pathParams = {author_id, modelId, "resolve", branch, path}}; auto local_path = model_container_path / path; if (!std::filesystem::exists(local_path.parent_path())) { @@ -137,9 +139,8 @@ cpp::result GetDownloadTask( .localPath = local_path}); } - return DownloadTask{.id = branch == "main" ? modelId : modelId + "-" + branch, - .type = DownloadType::Model, - .items = download_items}; + return DownloadTask{ + .id = task_id, .type = DownloadType::Model, .items = download_items}; } } // namespace @@ -298,6 +299,55 @@ cpp::result ModelService::HandleDownloadUrlAsync( return download_service_->AddTask(downloadTask, on_finished); } +cpp::result ModelService::DownloadHfModelAsync( + const std::string& author_id, const std::string& model_id, + std::optional temp_model_id) { + + const std::string unique_model_id = + temp_model_id.value_or(author_id + ":" + model_id); + auto model_entry = db_service_->GetModelInfo(unique_model_id); + if (model_entry.has_value() && + model_entry->status == cortex::db::ModelStatus::Downloaded) + return cpp::fail("Please delete the model before downloading again"); + + const std::string branch = "main"; + auto download_task = GetCloneRepoDownloadTask(author_id, model_id, branch, + author_id, unique_model_id); + if (download_task.has_error()) + return download_task; + + auto on_finished = [&, this](const DownloadTask& finishedTask) { + if (!db_service_->HasModel(unique_model_id)) { + cortex::db::ModelEntry model_entry{ + .model = unique_model_id, + .author_repo_id = author_id, + .branch_name = branch, + .path_to_model_yaml = "", + .model_alias = unique_model_id, + .status = cortex::db::ModelStatus::Downloaded, + .engine = kVllmEngine}; + + auto result = db_service_->AddModelEntry(model_entry); + if (result.has_error()) { + CTL_ERR("Error adding model to modellist: " + result.error()); + } + } else { + if (auto m = db_service_->GetModelInfo(unique_model_id); m.has_value()) { + auto upd_m = m.value(); + upd_m.status = cortex::db::ModelStatus::Downloaded; + if (auto r = db_service_->UpdateModelEntry(unique_model_id, upd_m); + r.has_error()) { + CTL_ERR(r.error()); + } + } else { + CTL_WRN("Could not get model entry with model id: " << unique_model_id); + } + } + }; + + return download_service_->AddTask(download_task.value(), on_finished); +} + std::optional ModelService::GetEstimation( const std::string& model_handle) { std::lock_guard l(es_mtx_); @@ -363,24 +413,19 @@ ModelService::DownloadModelFromCortexsoAsync( const std::string& name, const std::string& branch, std::optional temp_model_id) { - auto download_task = GetDownloadTask(name, branch); - if (download_task.has_error()) { - return cpp::fail(download_task.error()); - } - - std::string unique_model_id = ""; - if (temp_model_id.has_value()) { - unique_model_id = temp_model_id.value(); - } else { - unique_model_id = name + ":" + branch; - } - + std::string unique_model_id = temp_model_id.value_or(name + ":" + branch); auto model_entry = db_service_->GetModelInfo(unique_model_id); if (model_entry.has_value() && model_entry->status == cortex::db::ModelStatus::Downloaded) { return cpp::fail("Please delete the model before downloading again"); } + auto download_task = 
GetCloneRepoDownloadTask("cortexso", name, branch, + "cortex.so", unique_model_id); + if (download_task.has_error()) { + return cpp::fail(download_task.error()); + } + auto on_finished = [this, unique_model_id, branch](const DownloadTask& finishedTask) { const DownloadItem* model_yml_item = nullptr; @@ -415,7 +460,6 @@ ModelService::DownloadModelFromCortexsoAsync( mc.size = model_size; yaml_handler.UpdateModelConfig(mc); yaml_handler.WriteYamlFile(model_yml_item->localPath.string()); - } auto rel = @@ -451,9 +495,7 @@ ModelService::DownloadModelFromCortexsoAsync( } }; - auto task = download_task.value(); - task.id = unique_model_id; - return download_service_->AddTask(task, on_finished); + return download_service_->AddTask(download_task.value(), on_finished); } cpp::result ModelService::DeleteModel( @@ -862,28 +904,38 @@ cpp::result ModelService::GetModelPullInfo( huggingface_utils::GetHuggingFaceModelRepoInfo(author, model_name); if (!repo_info.has_value()) { - return cpp::fail("Model not found"); + return cpp::fail("Model not found on " + std::string{kHuggingFaceHost}); } - if (!repo_info->gguf.has_value()) { - return cpp::fail( - "Not a GGUF model. Currently, only GGUF single file is " - "supported."); + // repo containing GGUF files + if (repo_info->gguf.has_value()) { + std::vector options{}; + for (const auto& sibling : repo_info->siblings) { + if (string_utils::EndsWith(sibling.rfilename, ".gguf")) { + options.push_back(sibling.rfilename); + } + } + + return ModelPullInfo{ + .id = author + ":" + model_name, + .downloaded_models = {}, + .available_models = options, + .download_url = + huggingface_utils::GetDownloadableUrl(author, model_name, "")}; } - std::vector options{}; - for (const auto& sibling : repo_info->siblings) { - if (string_utils::EndsWith(sibling.rfilename, ".gguf")) { - options.push_back(sibling.rfilename); - } + // repo that is supported by HF transformers + // we will download the whole repo + if (repo_info->library_name.value_or("") == "transformers") { + return ModelPullInfo{ + .id = author + ":" + model_name, + .model_source = "huggingface", + }; } - return ModelPullInfo{ - .id = author + ":" + model_name, - .downloaded_models = {}, - .available_models = options, - .download_url = - huggingface_utils::GetDownloadableUrl(author, model_name, "")}; + return cpp::fail( + "Unsupported model. 
Currently, only GGUF models and HF models are " + "supported."); } } auto branches = @@ -929,6 +981,52 @@ cpp::result ModelService::GetModelPullInfo( .model_source = "cortexso"}; } +cpp::result ModelService::PullModel( + const std::string& model_handle, + const std::optional& desired_model_id, + const std::optional& desired_model_name) { + CTL_INF("Handle model input, model handle: " + model_handle); + + if (string_utils::StartsWith(model_handle, "https")) + return HandleDownloadUrlAsync(model_handle, desired_model_id, + desired_model_name); + + if (model_handle.find(":") == std::string::npos) + return cpp::fail("Invalid model handle or not supported!"); + + auto model_and_branch = string_utils::SplitBy(model_handle, ":"); + + // cortexso format - model:branch + if (model_and_branch.size() == 2) + return DownloadModelFromCortexsoAsync( + model_and_branch[0], model_and_branch[1], desired_model_id); + + if (model_and_branch.size() == 3) { + // HF model + // hf:author_id:model_name + // NOTE: this may confuse with the format below, where author_id = "hf" + // https://huggingface.co/hf + if (model_and_branch[0] == "hf") + return DownloadHfModelAsync(model_and_branch[1], model_and_branch[2]); + + // single GGUF file + // author_id:model_name:filename + auto mh = url_parser::Url{ + .protocol = "https", + .host = kHuggingFaceHost, + .pathParams = { + model_and_branch[0], + model_and_branch[1], + "resolve", + "main", + model_and_branch[2], + }}.ToFullPath(); + return HandleDownloadUrlAsync(mh, desired_model_id, desired_model_name); + } + + return cpp::fail("Invalid model handle or not supported!"); +} + cpp::result ModelService::AbortDownloadModel( const std::string& task_id) { return download_service_->StopTask(task_id); diff --git a/engine/services/model_service.h b/engine/services/model_service.h index beba91f8c..3a8e32963 100644 --- a/engine/services/model_service.h +++ b/engine/services/model_service.h @@ -39,13 +39,14 @@ class ModelService { std::shared_ptr engine_svc, cortex::TaskQueue& task_queue); + cpp::result PullModel( + const std::string& model_handle, + const std::optional& desired_model_id, + const std::optional& desired_model_name); + cpp::result AbortDownloadModel( const std::string& task_id); - cpp::result DownloadModelFromCortexsoAsync( - const std::string& name, const std::string& branch = "main", - std::optional temp_model_id = std::nullopt); - std::optional GetDownloadedModel( const std::string& modelId) const; @@ -67,10 +68,6 @@ class ModelService { cpp::result GetModelPullInfo( const std::string& model_handle); - cpp::result HandleDownloadUrlAsync( - const std::string& url, std::optional temp_model_id, - std::optional temp_name); - bool HasModel(const std::string& id) const; std::optional GetEstimation( @@ -89,6 +86,18 @@ class ModelService { std::string GetEngineByModelId(const std::string& model_id) const; private: + cpp::result HandleDownloadUrlAsync( + const std::string& url, std::optional temp_model_id, + std::optional temp_name); + + cpp::result DownloadModelFromCortexsoAsync( + const std::string& name, const std::string& branch = "main", + std::optional temp_model_id = std::nullopt); + + cpp::result DownloadHfModelAsync( + const std::string& author_id, const std::string& model_id, + std::optional temp_model_id = std::nullopt); + cpp::result, std::string> MayFallbackToCpu( const std::string& model_path, int ngl, int ctx_len, int n_batch = 2048, int n_ubatch = 2048, const std::string& kv_cache_type = "f16"); diff --git a/engine/utils/huggingface_utils.h 
b/engine/utils/huggingface_utils.h index 14c19084a..f98891b71 100644 --- a/engine/utils/huggingface_utils.h +++ b/engine/utils/huggingface_utils.h @@ -165,6 +165,7 @@ struct HuggingFaceModelRepoInfo { int downloads; int likes; + std::optional library_name; std::optional gguf; std::vector siblings; std::vector spaces; @@ -173,6 +174,10 @@ struct HuggingFaceModelRepoInfo { static cpp::result FromJson( const Json::Value& body) { + std::optional library_name = std::nullopt; + if (body["library_name"]) + library_name = body["library_name"].asString(); + std::optional gguf = std::nullopt; auto gguf_result = HuggingFaceGgufInfo::FromJson(body["gguf"]); if (gguf_result.has_value()) { @@ -202,6 +207,7 @@ struct HuggingFaceModelRepoInfo { .downloads = body["downloads"].asInt(), .likes = body["likes"].asInt(), + .library_name = library_name, .gguf = gguf, .siblings = siblings, .spaces = From c3d41bf893946b17ccb24319bb21e4e818f5e8f5 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 18 Mar 2025 18:11:04 +0800 Subject: [PATCH 52/73] use / for HF model --- engine/cli/commands/model_pull_cmd.cc | 3 +- engine/services/model_service.cc | 46 +++++++++++++++------------ engine/services/model_service.h | 3 +- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/engine/cli/commands/model_pull_cmd.cc b/engine/cli/commands/model_pull_cmd.cc index edd11b399..5e7ce045b 100644 --- a/engine/cli/commands/model_pull_cmd.cc +++ b/engine/cli/commands/model_pull_cmd.cc @@ -65,10 +65,9 @@ std::optional ModelPullCmd::Exec(const std::string& host, int port, } auto download_url = res.value()["downloadUrl"].asString(); - // TODO: when will these 2 be empty? if (downloaded.empty() && avails.empty()) { if (res.value()["modelSource"].asString() == "huggingface") { - model = "hf:" + id; + model = id; } else { model_id = id; model = download_url; diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 66f59ed7b..1751e52b2 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -99,7 +99,7 @@ void ParseGguf(DatabaseService& db_service, cpp::result GetCloneRepoDownloadTask( const std::string& author_id, const std::string& modelId, - const std::string& branch, const std::string& save_dir, + const std::string& branch, const std::vector& save_dir, const std::string& task_id) { url_parser::Url url = { .protocol = "https", @@ -113,8 +113,9 @@ cpp::result GetCloneRepoDownloadTask( } std::vector download_items{}; - auto model_container_path = file_manager_utils::GetModelsContainerPath() / - save_dir / modelId / branch; + auto model_container_path = file_manager_utils::GetModelsContainerPath(); + for (auto subdir : save_dir) + model_container_path /= subdir; file_manager_utils::CreateDirectoryRecursively(model_container_path.string()); for (const auto& value : result.value()) { @@ -300,19 +301,18 @@ cpp::result ModelService::HandleDownloadUrlAsync( } cpp::result ModelService::DownloadHfModelAsync( - const std::string& author_id, const std::string& model_id, - std::optional temp_model_id) { + const std::string& author_id, const std::string& model_id) { - const std::string unique_model_id = - temp_model_id.value_or(author_id + ":" + model_id); + const std::string unique_model_id = author_id + "/" + model_id; auto model_entry = db_service_->GetModelInfo(unique_model_id); if (model_entry.has_value() && model_entry->status == cortex::db::ModelStatus::Downloaded) return cpp::fail("Please delete the model before downloading again"); const std::string branch = "main"; - 
auto download_task = GetCloneRepoDownloadTask(author_id, model_id, branch, - author_id, unique_model_id); + auto download_task = GetCloneRepoDownloadTask( + author_id, model_id, branch, {"huggingface.co", author_id, model_id}, + unique_model_id); if (download_task.has_error()) return download_task; @@ -410,18 +410,20 @@ bool ModelService::HasModel(const std::string& id) const { cpp::result ModelService::DownloadModelFromCortexsoAsync( - const std::string& name, const std::string& branch, + const std::string& model_name, const std::string& branch, std::optional temp_model_id) { - std::string unique_model_id = temp_model_id.value_or(name + ":" + branch); + std::string unique_model_id = + temp_model_id.value_or(model_name + ":" + branch); auto model_entry = db_service_->GetModelInfo(unique_model_id); if (model_entry.has_value() && model_entry->status == cortex::db::ModelStatus::Downloaded) { return cpp::fail("Please delete the model before downloading again"); } - auto download_task = GetCloneRepoDownloadTask("cortexso", name, branch, - "cortex.so", unique_model_id); + auto download_task = GetCloneRepoDownloadTask( + "cortexso", model_name, branch, {"cortex.so", model_name, branch}, + unique_model_id); if (download_task.has_error()) { return cpp::fail(download_task.error()); } @@ -928,7 +930,7 @@ cpp::result ModelService::GetModelPullInfo( // we will download the whole repo if (repo_info->library_name.value_or("") == "transformers") { return ModelPullInfo{ - .id = author + ":" + model_name, + .id = author + "/" + model_name, .model_source = "huggingface", }; } @@ -991,6 +993,15 @@ cpp::result ModelService::PullModel( return HandleDownloadUrlAsync(model_handle, desired_model_id, desired_model_name); + // HF model handle + if (model_handle.find("/") != std::string::npos) { + const auto author_model = string_utils::SplitBy(model_handle, "/"); + if (author_model.size() != 2) + return cpp::fail("Invalid model handle"); + + return DownloadHfModelAsync(author_model[0], author_model[1]); + } + if (model_handle.find(":") == std::string::npos) return cpp::fail("Invalid model handle or not supported!"); @@ -1002,13 +1013,6 @@ cpp::result ModelService::PullModel( model_and_branch[0], model_and_branch[1], desired_model_id); if (model_and_branch.size() == 3) { - // HF model - // hf:author_id:model_name - // NOTE: this may confuse with the format below, where author_id = "hf" - // https://huggingface.co/hf - if (model_and_branch[0] == "hf") - return DownloadHfModelAsync(model_and_branch[1], model_and_branch[2]); - // single GGUF file // author_id:model_name:filename auto mh = url_parser::Url{ diff --git a/engine/services/model_service.h b/engine/services/model_service.h index 3a8e32963..e61d17171 100644 --- a/engine/services/model_service.h +++ b/engine/services/model_service.h @@ -95,8 +95,7 @@ class ModelService { std::optional temp_model_id = std::nullopt); cpp::result DownloadHfModelAsync( - const std::string& author_id, const std::string& model_id, - std::optional temp_model_id = std::nullopt); + const std::string& author_id, const std::string& model_id); cpp::result, std::string> MayFallbackToCpu( const std::string& model_path, int ngl, int ctx_len, int n_batch = 2048, From dc42dddd5ad402a0c0a67e6ac8aba0c7ddb5ccba Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 18 Mar 2025 21:31:54 +0800 Subject: [PATCH 53/73] fix thread-unsafe --- engine/services/model_service.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/engine/services/model_service.cc 
b/engine/services/model_service.cc index 1751e52b2..fb188f4ae 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -309,19 +309,21 @@ cpp::result ModelService::DownloadHfModelAsync( model_entry->status == cortex::db::ModelStatus::Downloaded) return cpp::fail("Please delete the model before downloading again"); - const std::string branch = "main"; auto download_task = GetCloneRepoDownloadTask( - author_id, model_id, branch, {"huggingface.co", author_id, model_id}, + author_id, model_id, "main", {"huggingface.co", author_id, model_id}, unique_model_id); if (download_task.has_error()) return download_task; - auto on_finished = [&, this](const DownloadTask& finishedTask) { + // TODO: validate that this is a vllm-compatible model + auto on_finished = [this, author_id, + unique_model_id](const DownloadTask& finishedTask) { if (!db_service_->HasModel(unique_model_id)) { + CTL_INF("Before creating model entry"); cortex::db::ModelEntry model_entry{ .model = unique_model_id, .author_repo_id = author_id, - .branch_name = branch, + .branch_name = "main", .path_to_model_yaml = "", .model_alias = unique_model_id, .status = cortex::db::ModelStatus::Downloaded, From 73fe3e5ed6eb635933a60c2e758fb01ea6d64624 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 10:32:34 +0800 Subject: [PATCH 54/73] remove methods --- engine/extensions/python-engines/vllm_engine.cc | 12 ------------ engine/extensions/python-engines/vllm_engine.h | 8 -------- 2 files changed, 20 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index b549f6c78..db291a4eb 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -125,15 +125,3 @@ void VllmEngine::SetLogLevel(trantor::Logger::LogLevel logLevel) {}; // Stop inflight chat completion in stream mode void VllmEngine::StopInferencing(const std::string& model_id) {}; - -Json::Value VllmEngine::GetRemoteModels() { - return Json::Value{}; -}; - -void VllmEngine::HandleRouteRequest( - std::shared_ptr json_body, - std::function&& callback) {}; - -void VllmEngine::HandleInference( - std::shared_ptr json_body, - std::function&& callback) {}; diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index 261da1025..2fcfa0d74 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -50,12 +50,4 @@ class VllmEngine : public EngineI { // Stop inflight chat completion in stream mode virtual void StopInferencing(const std::string& model_id) override; - - virtual Json::Value GetRemoteModels() override; - virtual void HandleRouteRequest( - std::shared_ptr json_body, - std::function&& callback) override; - virtual void HandleInference( - std::shared_ptr json_body, - std::function&& callback) override; }; From 7bf287df5c3949429d8400aa3725ffdcc8d743dc Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 10:36:54 +0800 Subject: [PATCH 55/73] remove old remnants --- engine/controllers/server.h | 2 -- engine/cortex-common/python_enginei.h | 27 --------------------------- engine/services/engine_service.cc | 9 +++------ engine/services/inference_service.cc | 16 ---------------- 4 files changed, 3 insertions(+), 51 deletions(-) delete mode 100644 engine/cortex-common/python_enginei.h diff --git a/engine/controllers/server.h b/engine/controllers/server.h index 0dd2d0913..7c8d759b4 100644 --- 
a/engine/controllers/server.h +++ b/engine/controllers/server.h @@ -43,8 +43,6 @@ class server : public drogon::HttpController, ADD_METHOD_TO(server::ChatCompletion, "/v1/chat/completions", Options, Post); ADD_METHOD_TO(server::Embedding, "/v1/embeddings", Options, Post); - ADD_METHOD_TO(server::Python, "/v1/python/{1}/.*", Options, Get, Post); - METHOD_LIST_END void ChatCompletion( diff --git a/engine/cortex-common/python_enginei.h b/engine/cortex-common/python_enginei.h deleted file mode 100644 index 35470f008..000000000 --- a/engine/cortex-common/python_enginei.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include - -#include "json/value.h" -#include "utils/result.hpp" - -class PythonEngineI { - public: - virtual ~PythonEngineI() {} - - // model management - virtual void LoadModel( - std::shared_ptr json_body, - std::function&& callback) = 0; - virtual void UnloadModel( - std::shared_ptr json_body, - std::function&& callback) = 0; - virtual void GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) = 0; - virtual void GetModels( - std::shared_ptr jsonBody, - std::function&& callback) = 0; - - virtual cpp::result GetPort(const std::string& model) = 0; -}; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 25fe1c7a3..e9b256cc3 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -185,10 +185,7 @@ cpp::result EngineService::UninstallEngineVariant( std::optional path_to_remove = std::nullopt; - // Python engine is stored in a separate folder - if (ne == kPythonEngine) { - return cpp::fail("Not implemented"); - } else { + if (ne == kLlamaRepo) { if (version == std::nullopt && variant == std::nullopt) { // if no version and variant provided, remove all engines variant of that engine path_to_remove = file_manager_utils::GetEnginesContainerPath() / ne; @@ -203,6 +200,8 @@ cpp::result EngineService::UninstallEngineVariant( } else { return cpp::fail("No variant provided"); } + } else { + return cpp::fail("Not implemented for engine " + ne); } if (path_to_remove == std::nullopt) { @@ -890,8 +889,6 @@ cpp::result EngineService::UnloadEngine( auto unload_opts = EngineI::EngineUnloadOption{}; e->Unload(unload_opts); delete e; - } else if (std::holds_alternative(engines_[ne].engine)) { - delete std::get(engines_[ne].engine); } else { delete std::get(engines_[ne].engine); } diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index aaff6e65f..4404ac5ea 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -107,9 +107,6 @@ cpp::result InferenceService::HandleChatCompletion( if (std::holds_alternative(engine_result.value())) { std::get(engine_result.value()) ->HandleChatCompletion(json_body, std::move(cb)); - } else if (std::holds_alternative(engine_result.value())) { - return cpp::fail(GetUnsupportedResponse( - "Python engine does not support Chat completion")); } else { std::get(engine_result.value()) ->HandleChatCompletion(json_body, std::move(cb)); @@ -143,9 +140,6 @@ cpp::result InferenceService::HandleEmbedding( if (std::holds_alternative(engine_result.value())) { std::get(engine_result.value()) ->HandleEmbedding(json_body, std::move(cb)); - } else if (std::holds_alternative(engine_result.value())) { - return cpp::fail( - GetUnsupportedResponse("Python engine does not support Embedding")); } else { std::get(engine_result.value()) ->HandleEmbedding(json_body, std::move(cb)); @@ -183,8 +177,6 @@ InferResult 
InferenceService::LoadModel( }; if (std::holds_alternative(engine)) { std::get(engine)->LoadModel(json_body, std::move(cb)); - } else if (std::holds_alternative(engine)) { - std::get(engine)->LoadModel(json_body, std::move(cb)); } else { std::get(engine)->LoadModel(json_body, std::move(cb)); } @@ -221,9 +213,6 @@ InferResult InferenceService::UnloadModel(const std::string& engine_name, if (std::holds_alternative(engine)) { std::get(engine)->UnloadModel( std::make_shared(json_body), std::move(cb)); - } else if (std::holds_alternative(engine)) { - std::get(engine)->UnloadModel( - std::make_shared(json_body), std::move(cb)); } else { std::get(engine)->UnloadModel( std::make_shared(json_body), std::move(cb)); @@ -262,8 +251,6 @@ InferResult InferenceService::GetModelStatus( auto engine = engine_result.value(); if (std::holds_alternative(engine)) { std::get(engine)->GetModelStatus(json_body, std::move(cb)); - } else if (std::holds_alternative(engine)) { - std::get(engine)->GetModelStatus(json_body, std::move(cb)); } else { std::get(engine)->GetModelStatus(json_body, std::move(cb)); } @@ -296,9 +283,6 @@ InferResult InferenceService::GetModels( if (e->IsSupported("GetModels")) { e->GetModels(json_body, std::move(cb)); } - } else if (std::holds_alternative(loaded_engine)) { - std::get(loaded_engine) - ->GetModels(json_body, std::move(cb)); } else { std::get(loaded_engine) ->GetModels(json_body, std::move(cb)); From 2a2b607cee4f3c21868353ae652b3bd17c1c0e70 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 12:09:51 +0800 Subject: [PATCH 56/73] support models list. add --relocatable for venv --- engine/controllers/models.cc | 11 +++++ .../extensions/python-engines/vllm_engine.cc | 1 + engine/services/model_service.cc | 46 +++++++++++-------- 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/engine/controllers/models.cc b/engine/controllers/models.cc index 1c937764a..392e8b5dd 100644 --- a/engine/controllers/models.cc +++ b/engine/controllers/models.cc @@ -178,6 +178,17 @@ void Models::ListModel( data.append(std::move(obj)); continue; } + + if (model_entry.engine == kVllmEngine) { + Json::Value obj; + obj["id"] = model_entry.model; + obj["model"] = model_entry.model; + obj["engine"] = model_entry.engine; + obj["status"] = "downloaded"; + data.append(std::move(obj)); + continue; + } + yaml_handler.ModelConfigFromFile( fmu::ToAbsoluteCortexDataPath( fs::path(model_entry.path_to_model_yaml)) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index db291a4eb..725cb3bfd 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -57,6 +57,7 @@ cpp::result VllmEngine::Download( if (!std::filesystem::exists(vllm_path / ".venv")) { std::vector cmd = python_utils::BuildUvCommand("venv", vllm_path_str); + cmd.push_back("--relocatable"); auto result = cortex::process::SpawnProcess(cmd); if (result.has_error()) return cpp::fail(result.error()); diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 592b3928d..36b2e013c 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -179,7 +179,12 @@ void ModelService::ForceIndexingModelList() { if (model_entry.status != cortex::db::ModelStatus::Downloaded) { continue; } + if (model_entry.engine == kVllmEngine) { + // TODO: check if folder still exists? 
+ continue; + } try { + // check if path_to_model_yaml still exists CTL_DBG(fmu::ToAbsoluteCortexDataPath( fs::path(model_entry.path_to_model_yaml)) .string()); @@ -590,14 +595,20 @@ cpp::result ModelService::StartModel( Json::Value json_data; // Currently we don't support download vision models, so we need to bypass check if (!bypass_model_check) { - auto model_entry = db_service_->GetModelInfo(model_handle); - if (model_entry.has_error()) { - CTL_WRN("Error: " + model_entry.error()); - return cpp::fail(model_entry.error()); + auto result = db_service_->GetModelInfo(model_handle); + if (result.has_error()) { + CTL_WRN("Error: " + result.error()); + return cpp::fail(result.error()); } + auto model_entry = result.value(); + + if (model_entry.engine == kVllmEngine) { + return cpp::fail("vLLM engine models are not supported yet."); + } + yaml_handler.ModelConfigFromFile( fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) + fs::path(model_entry.path_to_model_yaml)) .string()); auto mc = yaml_handler.GetModelConfig(); @@ -605,17 +616,15 @@ cpp::result ModelService::StartModel( if (engine_svc_->IsRemoteEngine(mc.engine)) { (void)engine_svc_->LoadEngine(mc.engine); config::RemoteModelConfig remote_mc; - remote_mc.LoadFromYamlFile( - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string()); - auto remote_engine_entry = - engine_svc_->GetEngineByNameAndVariant(mc.engine); - if (remote_engine_entry.has_error()) { - CTL_WRN("Remote engine error: " + model_entry.error()); - return cpp::fail(remote_engine_entry.error()); + remote_mc.LoadFromYamlFile(fmu::ToAbsoluteCortexDataPath( + fs::path(model_entry.path_to_model_yaml)) + .string()); + auto result = engine_svc_->GetEngineByNameAndVariant(mc.engine); + if (result.has_error()) { + CTL_WRN("Remote engine error: " + result.error()); + return cpp::fail(result.error()); } - auto remote_engine_json = remote_engine_entry.value().ToJson(); + auto remote_engine_json = result.value().ToJson(); json_data = remote_mc.ToJson(); json_data["api_key"] = std::move(remote_engine_json["api_key"]); @@ -623,10 +632,9 @@ cpp::result ModelService::StartModel( !v.empty() && v != "latest") { json_data["version"] = v; } - json_data["model_path"] = - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string(); + json_data["model_path"] = fmu::ToAbsoluteCortexDataPath( + fs::path(model_entry.path_to_model_yaml)) + .string(); json_data["metadata"] = std::move(remote_engine_json["metadata"]); auto ir = From fffc686b585834932befd1485289a70808e8030e Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 13:35:15 +0800 Subject: [PATCH 57/73] preparation works for start model --- .../extensions/python-engines/vllm_engine.cc | 24 ++++++++++++++++ .../extensions/python-engines/vllm_engine.h | 28 ++++++++++--------- engine/services/engine_service.cc | 25 +++++------------ engine/services/model_service.cc | 16 ++++++++--- 4 files changed, 58 insertions(+), 35 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 725cb3bfd..c7ba66793 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -1,6 +1,7 @@ #include "vllm_engine.h" #include "utils/curl_utils.h" #include "utils/logging_utils.h" +#include "utils/system_info_utils.h" namespace { cpp::result GetLatestVllmVersion() { @@ -28,6 +29,12 @@ VllmEngine::~VllmEngine() { cpp::result 
VllmEngine::Download( std::shared_ptr& download_service, const std::string& version, const std::optional variant_name) { + auto system_info = system_info_utils::GetSystemInfo(); + if (!(system_info->os == kLinuxOs && system_info->arch == "amd64" && + system_info_utils::IsNvidiaSmiAvailable())) + return cpp::fail( + "vLLM engine is only supported on Linux x86_64 with Nvidia GPU."); + if (variant_name.has_value()) { return cpp::fail("variant_name must be empty"); } @@ -85,6 +92,23 @@ cpp::result VllmEngine::Download( return {}; } +std::vector VllmEngine::GetVariants() { + const auto vllm_path = python_utils::GetEnvsPath() / "vllm"; + + namespace fs = std::filesystem; + if (!fs::exists(vllm_path)) + return {}; + + std::vector variants; + for (const auto& entry : fs::directory_iterator(vllm_path)) { + const auto name = "linux-amd64-cuda"; // arbitrary + const auto version_str = "v" + entry.path().filename().string(); + const EngineVariantResponse variant{name, version_str, kVllmEngine}; + variants.push_back(variant); + } + return variants; +} + void VllmEngine::Load(EngineLoadOption opts) {}; void VllmEngine::Unload(EngineUnloadOption opts) {}; diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index 2fcfa0d74..35a97a903 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -1,3 +1,4 @@ +#include "common/engine_servicei.h" #include "cortex-common/EngineI.h" #include "python_utils.h" @@ -16,38 +17,39 @@ class VllmEngine : public EngineI { const std::string& version, const std::optional variant_name); - virtual void Load(EngineLoadOption opts) override; - virtual void Unload(EngineUnloadOption opts) override; + static std::vector GetVariants(); + + void Load(EngineLoadOption opts) override; + void Unload(EngineUnloadOption opts) override; // cortex.llamacpp interface - virtual void HandleChatCompletion( + void HandleChatCompletion( std::shared_ptr json_body, std::function&& callback) override; - virtual void HandleEmbedding( + void HandleEmbedding( std::shared_ptr json_body, std::function&& callback) override; - virtual void LoadModel( + void LoadModel( std::shared_ptr json_body, std::function&& callback) override; - virtual void UnloadModel( + void UnloadModel( std::shared_ptr json_body, std::function&& callback) override; - virtual void GetModelStatus( + void GetModelStatus( std::shared_ptr json_body, std::function&& callback) override; // For backward compatible checking - virtual bool IsSupported(const std::string& f) override; + bool IsSupported(const std::string& f) override; // Get list of running models - virtual void GetModels( + void GetModels( std::shared_ptr jsonBody, std::function&& callback) override; - virtual bool SetFileLogger(int max_log_lines, - const std::string& log_path) override; - virtual void SetLogLevel(trantor::Logger::LogLevel logLevel) override; + bool SetFileLogger(int max_log_lines, const std::string& log_path) override; + void SetLogLevel(trantor::Logger::LogLevel logLevel) override; // Stop inflight chat completion in stream mode - virtual void StopInferencing(const std::string& model_id) override; + void StopInferencing(const std::string& model_id) override; }; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index e9b256cc3..b8a3b13d6 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -620,22 +620,8 @@ EngineService::GetInstalledEngineVariants(const std::string& engine) 
const { auto ne = cortex::engine::NormalizeEngine(engine); auto os = hw_inf_.sys_inf->os; - // if (ne == kPythonEngine) { - // if (!python_engine::IsUvInstalled()) { - // return {}; - // } else { - // // Python engine only means uv is installed. - // // variant name and version don't quite make sense in this context. - // // hence, they are left blank. - // std::vector variants; - // variants.push_back(EngineVariantResponse{ - // .name = "", - // .version = "", - // .engine = kPythonEngine, - // }); - // return variants; - // } - // } + if (ne == kVllmEngine) + return VllmEngine::GetVariants(); auto engines_variants_dir = file_manager_utils::GetEnginesContainerPath() / ne; @@ -931,8 +917,6 @@ cpp::result EngineService::IsEngineReady( return true; } - auto os = hw_inf_.sys_inf->os; - auto installed_variants = GetInstalledEngineVariants(engine); if (installed_variants.has_error()) { return cpp::fail(installed_variants.error()); @@ -1119,6 +1103,11 @@ cpp::result EngineService::GetRemoteModels( bool EngineService::IsRemoteEngine(const std::string& engine_name) const { auto ne = Repo2Engine(engine_name); + + if (ne == kLlamaEngine || ne == kVllmEngine) + return false; + return true; + auto local_engines = file_manager_utils::GetCortexConfig().supportedEngines; for (auto const& le : local_engines) { if (le == ne) diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 36b2e013c..119e12b75 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -208,10 +208,18 @@ std::optional ModelService::GetDownloadedModel( const std::string& modelId) const { config::YamlHandler yaml_handler; - auto model_entry = db_service_->GetModelInfo(modelId); - if (!model_entry.has_value()) { + auto result = db_service_->GetModelInfo(modelId); + if (result.has_error()) { return std::nullopt; } + auto model_entry = result.value(); + + // ignore all other params + if (model_entry.engine == kVllmEngine) { + config::ModelConfig cfg; + cfg.engine = kVllmEngine; + return cfg; + } try { config::YamlHandler yaml_handler; @@ -219,11 +227,11 @@ std::optional ModelService::GetDownloadedModel( namespace fmu = file_manager_utils; yaml_handler.ModelConfigFromFile( fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) + fs::path(model_entry.path_to_model_yaml)) .string()); return yaml_handler.GetModelConfig(); } catch (const std::exception& e) { - LOG_ERROR << "Error reading yaml file '" << model_entry->path_to_model_yaml + LOG_ERROR << "Error reading yaml file '" << model_entry.path_to_model_yaml << "': " << e.what(); return std::nullopt; } From cea8020291181227816af1a025a59278b2c38b5c Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 15:56:15 +0800 Subject: [PATCH 58/73] add sync download util. add vLLM version config. 
some boilerplate code to launch model (non-functional atm) --- .../extensions/python-engines/python_utils.cc | 79 +++---- .../extensions/python-engines/python_utils.h | 3 +- .../extensions/python-engines/vllm_engine.cc | 220 +++++++++++------- .../extensions/python-engines/vllm_engine.h | 6 +- engine/services/engine_service.cc | 106 ++++++++- engine/services/engine_service.h | 4 + engine/services/model_service.cc | 19 +- engine/utils/config_yaml_utils.cc | 7 +- engine/utils/config_yaml_utils.h | 1 + engine/utils/curl_utils.cc | 44 ++++ engine/utils/curl_utils.h | 7 +- 11 files changed, 342 insertions(+), 154 deletions(-) diff --git a/engine/extensions/python-engines/python_utils.cc b/engine/extensions/python-engines/python_utils.cc index 5255fcd0e..07297801e 100644 --- a/engine/extensions/python-engines/python_utils.cc +++ b/engine/extensions/python-engines/python_utils.cc @@ -2,6 +2,7 @@ #include #include "utils/archive_utils.h" +#include "utils/curl_utils.h" #include "utils/file_manager_utils.h" #include "utils/set_permission_utils.h" #include "utils/system_info_utils.h" @@ -23,8 +24,7 @@ std::filesystem::path GetUvPath() { bool IsUvInstalled() { return std::filesystem::exists(GetUvPath()); } -cpp::result InstallUv( - std::shared_ptr& download_service) { +cpp::result InstallUv() { const auto py_bin_path = GetPythonEnginesPath() / "bin"; std::filesystem::create_directories(py_bin_path); @@ -58,52 +58,37 @@ cpp::result InstallUv( const std::string url = url_stream.str(); CTL_INF("Download uv from " << url); - auto on_finished = [py_bin_path, - uv_version](const DownloadTask& finishedTask) { - // try to unzip the downloaded file - const std::string download_path = finishedTask.items[0].localPath.string(); - - archive_utils::ExtractArchive(download_path, py_bin_path.string(), true); - set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); - std::filesystem::remove(download_path); - - // install Python3.10 from Astral. this will be preferred over system - // Python when possible. - // NOTE: currently this will install to a user-wide directory. we can - // install to a specific location using `--install-dir`, but later - // invocation of `uv run` needs to have `UV_PYTHON_INSTALL_DIR` set to use - // this Python installation. - // we can add this once we allow passing custom env var to SpawnProcess(). 
- // https://docs.astral.sh/uv/reference/cli/#uv-python-install - std::vector command = BuildUvCommand("python"); - command.push_back("install"); - command.push_back("3.10"); - - // NOTE: errors in download callback won't be propagated to caller - auto result = cortex::process::SpawnProcess(command); - if (result.has_error()) { - CTL_ERR(result.error()); - return; - } - - if (!cortex::process::WaitProcess(result.value())) { - CTL_ERR("Process spawned but fail to wait"); - return; - } - }; - - auto downloadTask = DownloadTask{.id = "python-uv", - .type = DownloadType::Engine, - .items = {DownloadItem{ - .id = "python-uv", - .downloadUrl = url, - .localPath = py_bin_path / fname, - }}}; - - auto add_task_result = download_service->AddTask(downloadTask, on_finished); - if (add_task_result.has_error()) { - return cpp::fail(add_task_result.error()); + const auto save_path = py_bin_path / fname; + auto res = curl_utils::SimpleDownload(url, save_path.string()); + if (res.has_error()) + return res; + + archive_utils::ExtractArchive(save_path, py_bin_path.string(), true); + set_permission_utils::SetExecutePermissionsRecursive(py_bin_path); + std::filesystem::remove(save_path); + + // install Python3.10 from Astral. this will be preferred over system + // Python when possible. + // NOTE: currently this will install to a user-wide directory. we can + // install to a specific location using `--install-dir`, but later + // invocation of `uv run` needs to have `UV_PYTHON_INSTALL_DIR` set to use + // this Python installation. + // we can add this once we allow passing custom env var to SpawnProcess(). + // https://docs.astral.sh/uv/reference/cli/#uv-python-install + std::vector command = BuildUvCommand("python"); + command.push_back("install"); + command.push_back("3.10"); + + auto result = cortex::process::SpawnProcess(command); + if (result.has_error()) + return cpp::fail(result.error()); + + if (!cortex::process::WaitProcess(result.value())) { + const auto msg = "Process spawned but fail to wait"; + CTL_ERR(msg); + return cpp::fail(msg); } + return {}; } diff --git a/engine/extensions/python-engines/python_utils.h b/engine/extensions/python-engines/python_utils.h index 31b0ca0ad..97b2d3f15 100644 --- a/engine/extensions/python-engines/python_utils.h +++ b/engine/extensions/python-engines/python_utils.h @@ -15,8 +15,7 @@ std::filesystem::path GetUvPath(); // UV-related functions bool IsUvInstalled(); -cpp::result InstallUv( - std::shared_ptr& download_service); +cpp::result InstallUv(); std::vector BuildUvCommand(const std::string& action, const std::string& directory = ""); // cpp::result UvDownloadDeps( diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index c7ba66793..4229c32df 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -1,21 +1,29 @@ #include "vllm_engine.h" +#include "services/engine_service.h" #include "utils/curl_utils.h" #include "utils/logging_utils.h" #include "utils/system_info_utils.h" -namespace { -cpp::result GetLatestVllmVersion() { - auto result = curl_utils::SimpleGetJson("https://pypi.org/pypi/vllm/json"); - if (result.has_error()) - return result.error(); +static std::pair CreateResponse( + const std::string& msg, int code) { - auto version_value = result.value()["info"]["version"]; - if (version_value.isNull()) - return cpp::fail("Can't find version in the response"); + Json::Value status, res; + const bool has_error = code != 200; - return 
version_value.asString(); + status["is_done"] = true; + status["has_error"] = has_error; + status["is_stream"] = false; + status["status_code"] = code; + + if (has_error) { + CTL_ERR(msg); + res["error"] = msg; + } else { + res["status"] = msg; + } + + return {status, res}; } -} // namespace VllmEngine::~VllmEngine() { // NOTE: what happens if we can't kill subprocess? @@ -26,72 +34,6 @@ VllmEngine::~VllmEngine() { } } -cpp::result VllmEngine::Download( - std::shared_ptr& download_service, - const std::string& version, const std::optional variant_name) { - auto system_info = system_info_utils::GetSystemInfo(); - if (!(system_info->os == kLinuxOs && system_info->arch == "amd64" && - system_info_utils::IsNvidiaSmiAvailable())) - return cpp::fail( - "vLLM engine is only supported on Linux x86_64 with Nvidia GPU."); - - if (variant_name.has_value()) { - return cpp::fail("variant_name must be empty"); - } - - if (!python_utils::IsUvInstalled()) { - auto result = python_utils::InstallUv(download_service); - if (result.has_error()) - return result; - } - - std::string concrete_version = version; - if (version == "latest") { - auto result = GetLatestVllmVersion(); - if (result.has_error()) - return cpp::fail(result.error()); - - concrete_version = result.value(); - } - CTL_INF("Download vLLM " << concrete_version); - - const auto vllm_path = - python_utils::GetEnvsPath() / "vllm" / concrete_version; - std::filesystem::create_directories(vllm_path); - const auto vllm_path_str = vllm_path.string(); - - // initialize venv - if (!std::filesystem::exists(vllm_path / ".venv")) { - std::vector cmd = - python_utils::BuildUvCommand("venv", vllm_path_str); - cmd.push_back("--relocatable"); - auto result = cortex::process::SpawnProcess(cmd); - if (result.has_error()) - return cpp::fail(result.error()); - - // TODO: check return code - // NOTE: these are not async - cortex::process::WaitProcess(result.value()); - } - - // install vLLM - { - std::vector cmd = - python_utils::BuildUvCommand("pip", vllm_path_str); - cmd.push_back("install"); - cmd.push_back("vllm==" + concrete_version); - auto result = cortex::process::SpawnProcess(cmd); - if (result.has_error()) - return cpp::fail(result.error()); - - // TODO: check return code - // NOTE: these are not async - cortex::process::WaitProcess(result.value()); - } - - return {}; -} - std::vector VllmEngine::GetVariants() { const auto vllm_path = python_utils::GetEnvsPath() / "vllm"; @@ -109,29 +51,123 @@ std::vector VllmEngine::GetVariants() { return variants; } -void VllmEngine::Load(EngineLoadOption opts) {}; -void VllmEngine::Unload(EngineUnloadOption opts) {}; +// NOTE: doesn't do anything +void VllmEngine::Load(EngineLoadOption opts) { + CTL_WRN("EngineLoadOption is ignored"); + return; +}; + +// NOTE: doesn't do anything +void VllmEngine::Unload(EngineUnloadOption opts) { + return; +}; // cortex.llamacpp interface void VllmEngine::HandleChatCompletion( std::shared_ptr json_body, - std::function&& callback) {}; + std::function&& callback) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; void VllmEngine::HandleEmbedding( std::shared_ptr json_body, - std::function&& callback) {}; + std::function&& callback) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; void VllmEngine::LoadModel( std::shared_ptr json_body, - std::function&& callback) {}; + std::function&& callback) { + + if (!json_body->isMember("model")) { + auto [status, error] = + CreateResponse("Missing required fields: model", 400); + 
callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + + { + std::unique_lock write_lock(mutex); + if (model_process_map.find(model) != model_process_map.end()) { + // check if model is still alive + if (model_process_map[model].IsAlive()) { + auto [status, error] = CreateResponse("Model already loaded!", 409); + callback(std::move(status), std::move(error)); + return; + } else { + // if model has exited, try to load model again + CTL_WRN("Model " << model << " has exited unexpectedly"); + model_process_map.erase(model); + } + } + } + + // pid_t pid; + // try { + // // https://docs.astral.sh/uv/reference/cli/#uv-run + // std::vector command = + // python_utils::BuildUvCommand("run", model_dir.string()); + // for (const auto& item : py_cfg.entrypoint) + // command.push_back(item); + + // const std::string stdout_path = (model_dir / "stdout.txt").string(); + // const std::string stderr_path = (model_dir / "stderr.txt").string(); + + // // create empty stdout.txt and stderr.txt for redirection + // if (!std::filesystem::exists(stdout_path)) + // std::ofstream(stdout_path).flush(); + // if (!std::filesystem::exists(stderr_path)) + // std::ofstream(stderr_path).flush(); + + // auto result = + // cortex::process::SpawnProcess(command, stdout_path, stderr_path); + // if (result.has_error()) { + // throw std::runtime_error(result.error()); + // } + + // PythonSubprocess py_proc; + // py_proc.proc_info = result.value(); + // py_proc.port = py_cfg.port; + // py_proc.start_time = std::chrono::system_clock::now().time_since_epoch() / + // std::chrono::milliseconds(1); + + // pid = py_proc.proc_info.pid; + + // std::unique_lock write_lock(mutex); + // model_process_map[model] = py_proc; + + // } catch (const std::exception& e) { + // auto e_msg = e.what(); + // auto [status, error] = CreateResponse(e_msg, k500InternalServerError); + // callback(std::move(status), std::move(error)); + // return; + // } + + // auto [status, res] = CreateResponse( + // "Model loaded successfully with pid: " + std::to_string(pid), k200OK); + // callback(std::move(status), std::move(res)); + + // CTL_WRN("Not implemented"); + // throw std::runtime_error("Not implemented"); +}; void VllmEngine::UnloadModel( std::shared_ptr json_body, - std::function&& callback) {}; + std::function&& callback) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; void VllmEngine::GetModelStatus( std::shared_ptr json_body, - std::function&& callback) {}; + std::function&& callback) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; // For backward compatible checking bool VllmEngine::IsSupported(const std::string& f) { @@ -141,12 +177,22 @@ bool VllmEngine::IsSupported(const std::string& f) { // Get list of running models void VllmEngine::GetModels( std::shared_ptr jsonBody, - std::function&& callback) {}; + std::function&& callback) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; bool VllmEngine::SetFileLogger(int max_log_lines, const std::string& log_path) { - return true; + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; +void VllmEngine::SetLogLevel(trantor::Logger::LogLevel logLevel) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); }; -void VllmEngine::SetLogLevel(trantor::Logger::LogLevel logLevel) {}; // Stop inflight chat completion in stream mode -void VllmEngine::StopInferencing(const std::string& model_id) 
{}; +void VllmEngine::StopInferencing(const std::string& model_id) { + CTL_WRN("Not implemented"); + throw std::runtime_error("Not implemented"); +}; diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index 35a97a903..a6024185e 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -1,3 +1,4 @@ +#include #include "common/engine_servicei.h" #include "cortex-common/EngineI.h" #include "python_utils.h" @@ -12,11 +13,6 @@ class VllmEngine : public EngineI { VllmEngine() {}; ~VllmEngine(); - static cpp::result Download( - std::shared_ptr& download_service, - const std::string& version, - const std::optional variant_name); - static std::vector GetVariants(); void Load(EngineLoadOption opts) override; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index b8a3b13d6..80321e18d 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -225,11 +225,11 @@ cpp::result EngineService::DownloadEngine( const std::string& engine, const std::string& version, const std::optional variant_name) { - if (engine == kLlamaRepo) { + if (engine == kLlamaRepo) return DownloadLlamaCpp(version, variant_name); - } else if (engine == kVllmEngine) { - return VllmEngine::Download(download_service_, version, variant_name); - } + if (engine == kVllmEngine) + return DownloadVllm(version, variant_name); + return cpp::fail("Unknown engine " + engine); } @@ -372,6 +372,83 @@ cpp::result EngineService::DownloadLlamaCpp( return {}; } +cpp::result EngineService::DownloadVllm( + const std::string& version, const std::optional variant_name) { + + auto system_info = system_info_utils::GetSystemInfo(); + if (!(system_info->os == kLinuxOs && system_info->arch == "amd64" && + system_info_utils::IsNvidiaSmiAvailable())) + return cpp::fail( + "vLLM engine is only supported on Linux x86_64 with Nvidia GPU."); + + if (variant_name.has_value()) { + return cpp::fail("variant_name must be empty"); + } + + // NOTE: everything below is not async + // to make it async, we have to run everything in a thread (spawning and waiting + // for subprocesses) + if (!python_utils::IsUvInstalled()) { + auto result = python_utils::InstallUv(); + if (result.has_error()) + return result; + } + + std::string concrete_version = version; + if (version == "latest") { + auto result = curl_utils::SimpleGetJson("https://pypi.org/pypi/vllm/json"); + if (result.has_error()) + return cpp::fail(result.error()); + + auto version_value = result.value()["info"]["version"]; + if (version_value.isNull()) + return cpp::fail("Can't find version in the response"); + concrete_version = version_value.asString(); + } + CTL_INF("Download vLLM " << concrete_version); + + const auto vllm_path = + python_utils::GetEnvsPath() / "vllm" / concrete_version; + std::filesystem::create_directories(vllm_path); + const auto vllm_path_str = vllm_path.string(); + + // initialize venv + if (!std::filesystem::exists(vllm_path / ".venv")) { + std::vector cmd = + python_utils::BuildUvCommand("venv", vllm_path_str); + cmd.push_back("--relocatable"); + auto result = cortex::process::SpawnProcess(cmd); + if (result.has_error()) + return cpp::fail(result.error()); + + // TODO: check return code + // NOTE: these are not async + cortex::process::WaitProcess(result.value()); + } + + // install vLLM + { + std::vector cmd = + python_utils::BuildUvCommand("pip", vllm_path_str); + cmd.push_back("install"); + cmd.push_back("vllm==" 
+ concrete_version); + auto result = cortex::process::SpawnProcess(cmd); + if (result.has_error()) + return cpp::fail(result.error()); + + // TODO: check return code + // one reason this may fail is that the requested version does not exist + // NOTE: these are not async + cortex::process::WaitProcess(result.value()); + } + + auto result = SetDefaultEngineVariant(kVllmEngine, concrete_version, ""); + if (result.has_error()) + return cpp::fail(result.error()); + + return {}; +} + cpp::result EngineService::DownloadCuda( const std::string& engine, bool async) { if (hw_inf_.sys_inf->os == "mac" || engine != kLlamaRepo) { @@ -553,8 +630,14 @@ EngineService::SetDefaultEngineVariant(const std::string& engine, auto normalized_version = string_utils::RemoveSubstring(version, "v"); auto config = file_manager_utils::GetCortexConfig(); - config.llamacppVersion = "v" + normalized_version; - config.llamacppVariant = variant; + if (ne == kLlamaRepo) { + config.llamacppVersion = "v" + normalized_version; + config.llamacppVariant = variant; + } else if (ne == kVllmEngine) { + config.vllmVersion = "v" + normalized_version; + } else { + return cpp::fail("Unrecognized engine " + engine); + } auto result = file_manager_utils::UpdateCortexConfig(config); if (result.has_error()) { return cpp::fail(result.error()); @@ -686,6 +769,7 @@ cpp::result EngineService::LoadEngine( CTL_INF("Engine " << ne << " is already loaded"); return {}; } + CTL_INF("Loading engine: " << ne); // Check for remote engine if (IsRemoteEngine(engine_name)) { @@ -702,9 +786,17 @@ cpp::result EngineService::LoadEngine( return {}; } + // check for vLLM engine + if (engine_name == kVllmEngine) { + auto engine = new VllmEngine(); + EngineI::EngineLoadOption load_opts{}; + engine->Load(load_opts); + engines_[engine_name].engine = engine; + return {}; + } + // End hard code - CTL_INF("Loading engine: " << ne); #if defined(_WIN32) || defined(_WIN64) || defined(__linux__) CTL_INF("CPU Info: " << cortex::cpuid::CpuInfo().to_string()); #endif diff --git a/engine/services/engine_service.h b/engine/services/engine_service.h index a4328f4d2..8e745f55f 100644 --- a/engine/services/engine_service.h +++ b/engine/services/engine_service.h @@ -166,6 +166,10 @@ class EngineService : public EngineServiceI { const std::string& version = "latest", const std::optional variant_name = std::nullopt); + cpp::result DownloadVllm( + const std::string& version = "latest", + const std::optional variant_name = std::nullopt); + cpp::result DownloadCuda(const std::string& engine, bool async = false); diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 119e12b75..bccc5f842 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -226,8 +226,7 @@ std::optional ModelService::GetDownloadedModel( namespace fs = std::filesystem; namespace fmu = file_manager_utils; yaml_handler.ModelConfigFromFile( - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.path_to_model_yaml)) + fmu::ToAbsoluteCortexDataPath(fs::path(model_entry.path_to_model_yaml)) .string()); return yaml_handler.GetModelConfig(); } catch (const std::exception& e) { @@ -611,7 +610,21 @@ cpp::result ModelService::StartModel( auto model_entry = result.value(); if (model_entry.engine == kVllmEngine) { - return cpp::fail("vLLM engine models are not supported yet."); + Json::Value json_data; + json_data["model"] = model_handle; + json_data["engine"] = kVllmEngine; + auto [status, data] = + inference_svc_->LoadModel(std::make_shared(json_data)); + + auto 
status_code = status["status_code"].asInt(); + if (status_code == drogon::k200OK) { + return StartModelResult{true, "vLLM engine ignores all params override"}; + } else if (status_code == drogon::k409Conflict) { + CTL_INF("Model '" + model_handle + "' is already loaded"); + return StartModelResult{.success = true, .warning = ""}; + } else { + return cpp::fail("Model failed to start: " + data["message"].asString()); + } } yaml_handler.ModelConfigFromFile( diff --git a/engine/utils/config_yaml_utils.cc b/engine/utils/config_yaml_utils.cc index dc47590c4..584929da4 100644 --- a/engine/utils/config_yaml_utils.cc +++ b/engine/utils/config_yaml_utils.cc @@ -36,6 +36,7 @@ cpp::result CortexConfigMgr::DumpYamlConfig( node["gitHubToken"] = config.gitHubToken; node["llamacppVariant"] = config.llamacppVariant; node["llamacppVersion"] = config.llamacppVersion; + node["vllmVersion"] = config.vllmVersion; node["enableCors"] = config.enableCors; node["allowedOrigins"] = config.allowedOrigins; node["proxyUrl"] = config.proxyUrl; @@ -80,7 +81,8 @@ CortexConfig CortexConfigMgr::FromYaml(const std::string& path, !node["logOnnxPath"] || !node["huggingFaceToken"] || !node["gitHubUserAgent"] || !node["gitHubToken"] || !node["llamacppVariant"] || !node["llamacppVersion"] || - !node["enableCors"] || !node["allowedOrigins"] || !node["proxyUrl"] || + !node["vllmVersion"] || !node["enableCors"] || + !node["allowedOrigins"] || !node["proxyUrl"] || !node["proxyUsername"] || !node["proxyPassword"] || !node["verifyPeerSsl"] || !node["verifyHostSsl"] || !node["verifyProxySsl"] || !node["verifyProxyHostSsl"] || @@ -138,6 +140,9 @@ CortexConfig CortexConfigMgr::FromYaml(const std::string& path, .llamacppVersion = node["llamacppVersion"] ? node["llamacppVersion"].as() : default_cfg.llamacppVersion, + .vllmVersion = node["vllmVersion"] + ? node["vllmVersion"].as() + : default_cfg.vllmVersion, .enableCors = node["enableCors"] ? 
node["enableCors"].as() : default_cfg.enableCors, .allowedOrigins = diff --git a/engine/utils/config_yaml_utils.h b/engine/utils/config_yaml_utils.h index c871fd100..fab535a88 100644 --- a/engine/utils/config_yaml_utils.h +++ b/engine/utils/config_yaml_utils.h @@ -48,6 +48,7 @@ struct CortexConfig { std::string gitHubToken; std::string llamacppVariant; std::string llamacppVersion; + std::string vllmVersion; bool enableCors; std::vector allowedOrigins; diff --git a/engine/utils/curl_utils.cc b/engine/utils/curl_utils.cc index 859c629d1..cfe847e04 100644 --- a/engine/utils/curl_utils.cc +++ b/engine/utils/curl_utils.cc @@ -373,4 +373,48 @@ cpp::result SimplePatchJson(const std::string& url, return root; } + +cpp::result SimpleDownload(const std::string& url, + const std::string& save_path, + const int timeout) { + auto curl = curl_easy_init(); + if (!curl) { + return cpp::fail("Failed to init CURL"); + } + + auto headers = GetHeaders(url); + curl_slist* curl_headers = nullptr; + if (headers) { + for (const auto& [key, value] : headers->m) { + auto header = key + ": " + value; + curl_headers = curl_slist_append(curl_headers, header.c_str()); + } + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, curl_headers); + } + + auto file = fopen(save_path.c_str(), "wb"); + if (!file) + return cpp::fail("Failed to open " + save_path); + + SetUpProxy(curl, url); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, fwrite); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, file); + if (timeout > 0) { + curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); + } + + // Perform the request + auto res = curl_easy_perform(curl); + fclose(file); + curl_slist_free_all(curl_headers); + curl_easy_cleanup(curl); + if (res != CURLE_OK) { + return cpp::fail("CURL request failed: " + + std::string{curl_easy_strerror(res)}); + } + + return {}; +} } // namespace curl_utils \ No newline at end of file diff --git a/engine/utils/curl_utils.h b/engine/utils/curl_utils.h index 9035b6b3c..91a67077e 100644 --- a/engine/utils/curl_utils.h +++ b/engine/utils/curl_utils.h @@ -37,8 +37,8 @@ cpp::result ReadRemoteYaml(const std::string& url); */ cpp::result SimpleGetJson(const std::string& url, const int timeout = -1); -cpp::result SimpleGetJsonRecursive(const std::string& url, - const int timeout = -1); +cpp::result SimpleGetJsonRecursive( + const std::string& url, const int timeout = -1); cpp::result SimplePostJson( const std::string& url, const std::string& body = ""); @@ -49,4 +49,7 @@ cpp::result SimpleDeleteJson( cpp::result SimplePatchJson( const std::string& url, const std::string& body = ""); +cpp::result SimpleDownload(const std::string& url, + const std::string& save_path, + const int timeout = -1); } // namespace curl_utils From 86d4c01280542716e8cc54151ce378001e91ed63 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 16:00:45 +0800 Subject: [PATCH 59/73] list engines --- engine/services/engine_service.cc | 1 + engine/utils/config_yaml_utils.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 80321e18d..26e4427a0 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -1210,5 +1210,6 @@ bool EngineService::IsRemoteEngine(const std::string& engine_name) const { cpp::result, std::string> EngineService::GetSupportedEngineNames() { + return config_yaml_utils::kDefaultSupportedEngines; return 
file_manager_utils::GetCortexConfig().supportedEngines; } diff --git a/engine/utils/config_yaml_utils.h b/engine/utils/config_yaml_utils.h index fab535a88..f41b00e54 100644 --- a/engine/utils/config_yaml_utils.h +++ b/engine/utils/config_yaml_utils.h @@ -24,7 +24,7 @@ constexpr const auto kDefaultCorsEnabled = true; const std::vector kDefaultEnabledOrigins{ "http://localhost:39281", "http://127.0.0.1:39281", "http://0.0.0.0:39281"}; constexpr const auto kDefaultNoProxy = "example.com,::1,localhost,127.0.0.1"; -const std::vector kDefaultSupportedEngines{kLlamaEngine}; +const std::vector kDefaultSupportedEngines{kLlamaEngine, kVllmEngine}; struct CortexConfig { std::string logFolderPath; From ec8b36d76dbbedea6f5ce5e8cdd5765f5e86f681 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 17:17:30 +0800 Subject: [PATCH 60/73] load and unload model --- .../extensions/python-engines/vllm_engine.cc | 192 ++++++++++++------ .../extensions/python-engines/vllm_engine.h | 8 +- engine/services/engine_service.cc | 34 +++- engine/services/model_service.cc | 39 ++-- 4 files changed, 186 insertions(+), 87 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 4229c32df..6dbda426b 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -4,6 +4,10 @@ #include "utils/logging_utils.h" #include "utils/system_info_utils.h" +namespace { +// figure out port of current running process +const constexpr int CORTEX_PORT = 3928; + static std::pair CreateResponse( const std::string& msg, int code) { @@ -24,6 +28,10 @@ static std::pair CreateResponse( return {status, res}; } +} // namespace + +// cortex_port + 0 is always used (by cortex itself) +VllmEngine::VllmEngine() : port_offsets_{true} {} VllmEngine::~VllmEngine() { // NOTE: what happens if we can't kill subprocess? @@ -51,18 +59,15 @@ std::vector VllmEngine::GetVariants() { return variants; } -// NOTE: doesn't do anything void VllmEngine::Load(EngineLoadOption opts) { - CTL_WRN("EngineLoadOption is ignored"); + version_ = opts.engine_path; // engine path actually contains version info + if (version_[0] == 'v') + version_ = version_.substr(1); return; }; -// NOTE: doesn't do anything -void VllmEngine::Unload(EngineUnloadOption opts) { - return; -}; +void VllmEngine::Unload(EngineUnloadOption opts) {}; -// cortex.llamacpp interface void VllmEngine::HandleChatCompletion( std::shared_ptr json_body, std::function&& callback) { @@ -93,73 +98,142 @@ void VllmEngine::LoadModel( { std::unique_lock write_lock(mutex); if (model_process_map.find(model) != model_process_map.end()) { - // check if model is still alive - if (model_process_map[model].IsAlive()) { + auto proc = model_process_map[model]; + + if (proc.IsAlive()) { auto [status, error] = CreateResponse("Model already loaded!", 409); callback(std::move(status), std::move(error)); return; } else { - // if model has exited, try to load model again + // if model has exited, try to load model again? 
CTL_WRN("Model " << model << " has exited unexpectedly"); model_process_map.erase(model); + port_offsets_[proc.port - CORTEX_PORT] = false; // free the port } } } - // pid_t pid; - // try { - // // https://docs.astral.sh/uv/reference/cli/#uv-run - // std::vector command = - // python_utils::BuildUvCommand("run", model_dir.string()); - // for (const auto& item : py_cfg.entrypoint) - // command.push_back(item); - - // const std::string stdout_path = (model_dir / "stdout.txt").string(); - // const std::string stderr_path = (model_dir / "stderr.txt").string(); - - // // create empty stdout.txt and stderr.txt for redirection - // if (!std::filesystem::exists(stdout_path)) - // std::ofstream(stdout_path).flush(); - // if (!std::filesystem::exists(stderr_path)) - // std::ofstream(stderr_path).flush(); - - // auto result = - // cortex::process::SpawnProcess(command, stdout_path, stderr_path); - // if (result.has_error()) { - // throw std::runtime_error(result.error()); - // } - - // PythonSubprocess py_proc; - // py_proc.proc_info = result.value(); - // py_proc.port = py_cfg.port; - // py_proc.start_time = std::chrono::system_clock::now().time_since_epoch() / - // std::chrono::milliseconds(1); - - // pid = py_proc.proc_info.pid; - - // std::unique_lock write_lock(mutex); - // model_process_map[model] = py_proc; - - // } catch (const std::exception& e) { - // auto e_msg = e.what(); - // auto [status, error] = CreateResponse(e_msg, k500InternalServerError); - // callback(std::move(status), std::move(error)); - // return; - // } - - // auto [status, res] = CreateResponse( - // "Model loaded successfully with pid: " + std::to_string(pid), k200OK); - // callback(std::move(status), std::move(res)); - - // CTL_WRN("Not implemented"); - // throw std::runtime_error("Not implemented"); + pid_t pid; + try { + namespace fs = std::filesystem; + + const auto model_path = file_manager_utils::GetCortexDataPath() / "models" / + kHuggingFaceHost / model; + + auto env_dir = python_utils::GetEnvsPath() / "vllm" / version_; + if (!fs::exists(env_dir)) + throw std::runtime_error(env_dir.string() + " does not exist"); + + int offset = 1; + for (;; offset++) { + // add this guard to prevent endless loop + if (offset >= 100) + throw std::runtime_error("Unable to find an available port"); + + if (port_offsets_.size() <= offset) + port_offsets_.push_back(false); + + // check if port is used + if (!port_offsets_[offset]) + break; + } + const int port = CORTEX_PORT + offset; + + // https://docs.astral.sh/uv/reference/cli/#uv-run + // TODO: pass more args + // TOOD: figure out how to set env vars + // TOOD: set logging config + std::vector cmd = + python_utils::BuildUvCommand("run", env_dir.string()); + cmd.push_back("vllm"); + cmd.push_back("serve"); + cmd.push_back(model_path.string()); + cmd.push_back("--port"); + cmd.push_back(std::to_string(port)); + + auto result = cortex::process::SpawnProcess(cmd); + if (result.has_error()) { + throw std::runtime_error(result.error()); + } + + python_utils::PythonSubprocess py_proc; + py_proc.proc_info = result.value(); + py_proc.port = port; + py_proc.start_time = std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + + pid = py_proc.proc_info.pid; + + std::unique_lock write_lock(mutex); + model_process_map[model] = py_proc; + + } catch (const std::exception& e) { + auto e_msg = e.what(); + auto [status, error] = CreateResponse(e_msg, 500); + callback(std::move(status), std::move(error)); + return; + } + + auto [status, res] = CreateResponse( + "Model 
loaded successfully with pid: " + std::to_string(pid), 200); + callback(std::move(status), std::move(res)); }; void VllmEngine::UnloadModel( std::shared_ptr json_body, std::function&& callback) { - CTL_WRN("Not implemented"); - throw std::runtime_error("Not implemented"); + if (!json_body->isMember("model")) { + auto [status, error] = CreateResponse("Missing required field: model", 400); + callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + + // check if model has started + { + std::shared_lock read_lock(mutex); + if (model_process_map.find(model) == model_process_map.end()) { + const std::string msg = "Model " + model + " has not been loaded yet."; + auto [status, error] = CreateResponse(msg, 400); + callback(std::move(status), std::move(error)); + return; + } + } + + // we know that model has started + { + std::unique_lock write_lock(mutex); + auto proc = model_process_map[model]; + + // TODO: we can use vLLM health check endpoint + // check if subprocess is still alive + // NOTE: is this step necessary? the subprocess could have terminated + // after .IsAlive() and before .Kill() later. + if (!proc.IsAlive()) { + model_process_map.erase(model); + port_offsets_[proc.port - CORTEX_PORT] = false; // free the port + + const std::string msg = "Model " + model + " stopped running."; + auto [status, error] = CreateResponse(msg, 400); + callback(std::move(status), std::move(error)); + return; + } + + // subprocess is alive. we kill it here. + if (!model_process_map[model].Kill()) { + const std::string msg = "Unable to kill process of model " + model; + auto [status, error] = CreateResponse(msg, 500); + callback(std::move(status), std::move(error)); + return; + } + + model_process_map.erase(model); + port_offsets_[proc.port - CORTEX_PORT] = false; // free the port + } + + auto [status, res] = CreateResponse("Unload model successfully", 200); + callback(std::move(status), std::move(res)); }; void VllmEngine::GetModelStatus( diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index a6024185e..c3e073aae 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -5,12 +5,18 @@ class VllmEngine : public EngineI { private: + std::string version_; + + // port_offsets_[i] == true means cortex_port + i is used + // otherwise, cortex_port + i is not used + std::vector port_offsets_; + mutable std::shared_mutex mutex; std::unordered_map model_process_map; public: - VllmEngine() {}; + VllmEngine(); ~VllmEngine(); static std::vector GetVariants(); diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 26e4427a0..60d52846b 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -677,18 +677,23 @@ cpp::result EngineService::IsEngineVariantReady( cpp::result EngineService::GetDefaultEngineVariant(const std::string& engine) { auto ne = cortex::engine::NormalizeEngine(engine); - // current we don't support other engine - if (ne != kLlamaRepo) { - return cpp::fail("Engine " + engine + " is not supported yet!"); - } auto config = file_manager_utils::GetCortexConfig(); - auto variant = config.llamacppVariant; - auto version = config.llamacppVersion; - - if (variant.empty() || version.empty()) { - return cpp::fail("Default engine variant for " + engine + - " is not set yet!"); + std::string variant, version; + if (engine == kLlamaEngine) { + variant = 
config.llamacppVariant; + version = config.llamacppVersion; + if (variant.empty() || version.empty()) + return cpp::fail("Default engine version and variant for " + engine + + " is not set yet!"); + } else if (engine == kVllmEngine) { + variant = ""; + version = config.vllmVersion; + if (version.empty()) + return cpp::fail("Default engine version for " + engine + + " is not set yet!"); + } else { + return cpp::fail("Engine " + engine + " is not supported yet!"); } return DefaultEngineVariant{ @@ -789,7 +794,14 @@ cpp::result EngineService::LoadEngine( // check for vLLM engine if (engine_name == kVllmEngine) { auto engine = new VllmEngine(); - EngineI::EngineLoadOption load_opts{}; + EngineI::EngineLoadOption load_opts; + + auto result = GetDefaultEngineVariant(engine_name); + if (result.has_error()) + return cpp::fail(result.error()); + + // we set version to engine_path + load_opts.engine_path = result.value().version; engine->Load(load_opts); engines_[engine_name].engine = engine; return {}; diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index bccc5f842..d5a4c4a6f 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -322,7 +322,7 @@ cpp::result ModelService::DownloadHfModelAsync( return cpp::fail("Please delete the model before downloading again"); auto download_task = GetCloneRepoDownloadTask( - author_id, model_id, "main", {"huggingface.co", author_id, model_id}, + author_id, model_id, "main", {kHuggingFaceHost, author_id, model_id}, unique_model_id); if (download_task.has_error()) return download_task; @@ -617,13 +617,14 @@ cpp::result ModelService::StartModel( inference_svc_->LoadModel(std::make_shared(json_data)); auto status_code = status["status_code"].asInt(); - if (status == drogon::k200OK) { - return StartModelResult{true, "vLLM engine ignores all params override"}; - } else if (status == drogon::k409Conflict) { + if (status_code == drogon::k200OK) { + return StartModelResult{true, ""}; + } else if (status_code == drogon::k409Conflict) { CTL_INF("Model '" + model_handle + "' is already loaded"); - return StartModelResult{.success = true, .warning = ""}; + return StartModelResult{true, ""}; } else { - return cpp::fail("Model failed to start: " + data["message"].asString()); + return cpp::fail("Model failed to start: " + + data["message"].asString()); } } @@ -789,17 +790,23 @@ cpp::result ModelService::StopModel( bypass_stop_check_set_.end()); std::string engine_name = ""; if (!bypass_check) { - auto model_entry = db_service_->GetModelInfo(model_handle); - if (model_entry.has_error()) { - CTL_WRN("Error: " + model_entry.error()); - return cpp::fail(model_entry.error()); + auto result = db_service_->GetModelInfo(model_handle); + if (result.has_error()) { + CTL_WRN("Error: " + result.error()); + return cpp::fail(result.error()); + } + + const auto model_entry = result.value(); + if (model_entry.engine == kVllmEngine) { + engine_name = kVllmEngine; + } else { + yaml_handler.ModelConfigFromFile( + fmu::ToAbsoluteCortexDataPath( + fs::path(model_entry.path_to_model_yaml)) + .string()); + auto mc = yaml_handler.GetModelConfig(); + engine_name = mc.engine; } - yaml_handler.ModelConfigFromFile( - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string()); - auto mc = yaml_handler.GetModelConfig(); - engine_name = mc.engine; } if (bypass_check) { engine_name = kLlamaEngine; From 92261100143b80ff3d29f9a7b6d38eaa126d1af6 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 19 Mar 2025 
17:33:48 +0800 Subject: [PATCH 61/73] retrieve cortex port from yaml file --- .../extensions/python-engines/vllm_engine.cc | 34 ++++++------------- .../extensions/python-engines/vllm_engine.h | 1 + 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 6dbda426b..42d38c489 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -5,33 +5,21 @@ #include "utils/system_info_utils.h" namespace { -// figure out port of current running process -const constexpr int CORTEX_PORT = 3928; - static std::pair CreateResponse( const std::string& msg, int code) { - Json::Value status, res; - const bool has_error = code != 200; - - status["is_done"] = true; - status["has_error"] = has_error; - status["is_stream"] = false; status["status_code"] = code; - - if (has_error) { - CTL_ERR(msg); - res["error"] = msg; - } else { - res["status"] = msg; - } - + status["has_error"] = code != 200; + res["message"] = msg; return {status, res}; } } // namespace -// cortex_port + 0 is always used (by cortex itself) -VllmEngine::VllmEngine() : port_offsets_{true} {} +VllmEngine::VllmEngine() + : cortex_port_{std::stoi( + file_manager_utils::GetCortexConfig().apiServerPort)}, + port_offsets_{true} // cortex_port + 0 is always used (by cortex itself) +{} VllmEngine::~VllmEngine() { // NOTE: what happens if we can't kill subprocess? @@ -108,7 +96,7 @@ void VllmEngine::LoadModel( // if model has exited, try to load model again? CTL_WRN("Model " << model << " has exited unexpectedly"); model_process_map.erase(model); - port_offsets_[proc.port - CORTEX_PORT] = false; // free the port + port_offsets_[proc.port - cortex_port_] = false; // free the port } } } @@ -137,7 +125,7 @@ void VllmEngine::LoadModel( if (!port_offsets_[offset]) break; } - const int port = CORTEX_PORT + offset; + const int port = cortex_port_ + offset; // https://docs.astral.sh/uv/reference/cli/#uv-run // TODO: pass more args @@ -212,7 +200,7 @@ void VllmEngine::UnloadModel( // after .IsAlive() and before .Kill() later. if (!proc.IsAlive()) { model_process_map.erase(model); - port_offsets_[proc.port - CORTEX_PORT] = false; // free the port + port_offsets_[proc.port - cortex_port_] = false; // free the port const std::string msg = "Model " + model + " stopped running."; auto [status, error] = CreateResponse(msg, 400); @@ -229,7 +217,7 @@ void VllmEngine::UnloadModel( } model_process_map.erase(model); - port_offsets_[proc.port - CORTEX_PORT] = false; // free the port + port_offsets_[proc.port - cortex_port_] = false; // free the port } auto [status, res] = CreateResponse("Unload model successfully", 200); diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index c3e073aae..c41d7de4a 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -6,6 +6,7 @@ class VllmEngine : public EngineI { private: std::string version_; + int cortex_port_; // port_offsets_[i] == true means cortex_port + i is used // otherwise, cortex_port + i is not used From eeccd3a8adb4ec2d30824eb5a37f40cf21ec5bf1 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Mar 2025 11:19:48 +0800 Subject: [PATCH 62/73] add env vars support. 
log stdout and stderr --- .../extensions/python-engines/vllm_engine.cc | 15 ++++- engine/services/engine_service.cc | 10 +-- engine/utils/process/utils.cc | 64 ++++++++++++++++++- engine/utils/process/utils.h | 7 +- 4 files changed, 86 insertions(+), 10 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 42d38c489..23fa85ed6 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -1,4 +1,5 @@ #include "vllm_engine.h" +#include #include "services/engine_service.h" #include "utils/curl_utils.h" #include "utils/logging_utils.h" @@ -138,8 +139,20 @@ void VllmEngine::LoadModel( cmd.push_back(model_path.string()); cmd.push_back("--port"); cmd.push_back(std::to_string(port)); + cmd.push_back("--served-model-name"); + cmd.push_back(model); - auto result = cortex::process::SpawnProcess(cmd); + const auto stdout_file = env_dir / "stdout.log"; + const auto stderr_file = env_dir / "stderr.log"; + + // create empty files for redirection + if (!std::filesystem::exists(stdout_file)) + std::ofstream(stdout_file).flush(); + if (!std::filesystem::exists(stderr_file)) + std::ofstream(stderr_file).flush(); + + auto result = cortex::process::SpawnProcess(cmd, stdout_file.string(), + stderr_file.string()); if (result.has_error()) { throw std::runtime_error(result.error()); } diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 60d52846b..ec675ffde 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -406,16 +406,16 @@ cpp::result EngineService::DownloadVllm( concrete_version = version_value.asString(); } CTL_INF("Download vLLM " << concrete_version); + namespace fs = std::filesystem; const auto vllm_path = python_utils::GetEnvsPath() / "vllm" / concrete_version; - std::filesystem::create_directories(vllm_path); - const auto vllm_path_str = vllm_path.string(); + fs::create_directories(vllm_path); // initialize venv - if (!std::filesystem::exists(vllm_path / ".venv")) { + if (!fs::exists(vllm_path / ".venv")) { std::vector cmd = - python_utils::BuildUvCommand("venv", vllm_path_str); + python_utils::BuildUvCommand("venv", vllm_path.string()); cmd.push_back("--relocatable"); auto result = cortex::process::SpawnProcess(cmd); if (result.has_error()) @@ -429,7 +429,7 @@ cpp::result EngineService::DownloadVllm( // install vLLM { std::vector cmd = - python_utils::BuildUvCommand("pip", vllm_path_str); + python_utils::BuildUvCommand("pip", vllm_path.string()); cmd.push_back("install"); cmd.push_back("vllm==" + concrete_version); auto result = cortex::process::SpawnProcess(cmd); diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index 8cd0adc64..ac90b1c09 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -11,6 +11,44 @@ extern char** environ; // environment variables #include #endif +namespace { +// retrieve current env vars, make a copy, then add new env vars from input +std::vector BuildEnvVars( + const std::unordered_map& new_env_vars) { +#if defined(_WIN32) + throw std::runtime_error("Not implemented"); +#endif + + // parse current env var to an unordered map + std::unordered_map env_vars_map; + for (int i = 0; environ[i] != nullptr; i++) { + std::string env_var{environ[i]}; + auto split_idx = env_var.find("="); + + if (split_idx == std::string::npos) { + throw std::runtime_error( + "Error while parsing current environment variables"); + } + + 
env_vars_map[env_var.substr(0, split_idx)] = env_var.substr(split_idx + 1); + } + + // add new env vars. it will override existing env vars + for (const auto& [key, value] : new_env_vars) { + env_vars_map[key] = value; + } + + // convert back to key=value format + std::vector env_vars_vector; + for (const auto& [key, value] : env_vars_map) { + env_vars_vector.push_back(key + "=" + value); + } + + return env_vars_vector; +} + +} // namespace + namespace cortex::process { std::string ConstructWindowsCommandLine(const std::vector& args) { @@ -42,7 +80,10 @@ std::vector ConvertToArgv(const std::vector& args) { cpp::result SpawnProcess( const std::vector& command, const std::string& stdout_file, - const std::string& stderr_file) { + const std::string& stderr_file, + std::optional>> + env_vars) { std::stringstream ss; for (const auto item : command) { ss << item << " "; @@ -191,6 +232,8 @@ cpp::result SpawnProcess( posix_spawn_file_actions_destroy(action_ptr); throw std::runtime_error("Unable to add stdout to file action"); } + } else { + CTL_WRN(stdout_file + " does not exist"); } } @@ -203,18 +246,33 @@ cpp::result SpawnProcess( posix_spawn_file_actions_destroy(action_ptr); throw std::runtime_error("Unable to add stderr to file action"); } + } else { + CTL_WRN(stderr_file + " does not exist"); } } } + char** envp; + // we put these 2 here so that its lifetime lasts entire function + std::vector env_vars_vector; + std::vector env_vars_; + if (env_vars.has_value()) { + env_vars_vector = BuildEnvVars(env_vars.value()); + env_vars_ = ConvertToArgv(env_vars_vector); + envp = env_vars_.data(); + } else { + envp = environ; // simply inherit current env + } + // Use posix_spawn for cross-platform compatibility + // NOTE: posix_spawn() returns after fork() step. it means that we may + // need to keep argv and envp data alive until exec() step finishes. auto spawn_result = posix_spawn(&pid, // pid output command[0].c_str(), // executable path action_ptr, // file actions NULL, // spawn attributes argv.data(), // argument vector - environ // environment (inherit) - ); + envp); // environment // NOTE: it seems like it's ok to destroy this immediately before // subprocess terminates. 
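A minimal caller-side sketch of the optional env_vars parameter this patch adds to SpawnProcess (illustrative only, not part of the patch; the variable name and values are arbitrary). Entries in the map override inherited environment variables of the same name, since BuildEnvVars above merges them on top of the current environ:

    // assumes the declarations pulled in via utils/process/utils.h
    std::vector<std::string> cmd{"sh", "-c", "echo $UV_CACHE_DIR"};
    std::unordered_map<std::string, std::string> extra_env{
        {"UV_CACHE_DIR", "/tmp/uv-cache"}};  // arbitrary example override
    // note: callers in this patch create the redirection files beforehand;
    // missing files are skipped with a CTL_WRN
    auto spawn = cortex::process::SpawnProcess(cmd, "stdout.log", "stderr.log",
                                               extra_env);
    if (spawn.has_error()) {
      CTL_ERR(spawn.error());
    } else {
      cortex::process::WaitProcess(spawn.value());
    }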
diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h index 19b821cef..db1ac7460 100644 --- a/engine/utils/process/utils.h +++ b/engine/utils/process/utils.h @@ -12,7 +12,9 @@ using pid_t = DWORD; #include #endif +#include #include +#include #include #include "utils/result.hpp" @@ -36,7 +38,10 @@ std::vector ConvertToArgv(const std::vector& args); cpp::result SpawnProcess( const std::vector& command, - const std::string& stdout_file = "", const std::string& stderr_file = ""); + const std::string& stdout_file = "", const std::string& stderr_file = "", + std::optional>> + env_vars = {}); bool IsProcessAlive(ProcessInfo& proc_info); bool WaitProcess(ProcessInfo& proc_info); bool KillProcess(ProcessInfo& proc_info); From 6fe7ae877433799aa9094e6b4d07f8aedb386434 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Mar 2025 11:41:02 +0800 Subject: [PATCH 63/73] add GetModelStatus and GetModels --- .../extensions/python-engines/vllm_engine.cc | 86 ++++++++++++++++--- 1 file changed, 76 insertions(+), 10 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 23fa85ed6..fc5603281 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -129,9 +129,6 @@ void VllmEngine::LoadModel( const int port = cortex_port_ + offset; // https://docs.astral.sh/uv/reference/cli/#uv-run - // TODO: pass more args - // TOOD: figure out how to set env vars - // TOOD: set logging config std::vector cmd = python_utils::BuildUvCommand("run", env_dir.string()); cmd.push_back("vllm"); @@ -146,11 +143,13 @@ void VllmEngine::LoadModel( const auto stderr_file = env_dir / "stderr.log"; // create empty files for redirection + // TODO: add limit on file size? if (!std::filesystem::exists(stdout_file)) std::ofstream(stdout_file).flush(); if (!std::filesystem::exists(stderr_file)) std::ofstream(stderr_file).flush(); + // TODO: may want to wait until model is ready i.e. 
health check endpoint auto result = cortex::process::SpawnProcess(cmd, stdout_file.string(), stderr_file.string()); if (result.has_error()) { @@ -240,21 +239,89 @@ void VllmEngine::UnloadModel( void VllmEngine::GetModelStatus( std::shared_ptr json_body, std::function&& callback) { - CTL_WRN("Not implemented"); - throw std::runtime_error("Not implemented"); + + if (!json_body->isMember("model")) { + auto [status, error] = CreateResponse("Missing required field: model", 400); + callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + // check if model has started + { + std::shared_lock read_lock(mutex); + if (model_process_map.find(model) == model_process_map.end()) { + const std::string msg = "Model " + model + " has not been loaded yet."; + auto [status, error] = CreateResponse(msg, 400); + callback(std::move(status), std::move(error)); + return; + } + } + + // we know that model has started + // TODO: just use health check endpoint + { + std::unique_lock write_lock(mutex); + + // check if subprocess is still alive + if (!model_process_map[model].IsAlive()) { + CTL_WRN("Model " << model << " has exited unexpectedly."); + model_process_map.erase(model); + const std::string msg = "Model " + model + " stopped running."; + auto [status, error] = CreateResponse(msg, 400); + callback(std::move(status), std::move(error)); + return; + } + } + + Json::Value res, status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(res)); }; -// For backward compatible checking bool VllmEngine::IsSupported(const std::string& f) { return true; }; -// Get list of running models void VllmEngine::GetModels( std::shared_ptr jsonBody, std::function&& callback) { - CTL_WRN("Not implemented"); - throw std::runtime_error("Not implemented"); + Json::Value res, model_list(Json::arrayValue), status; + { + std::unique_lock write_lock(mutex); + for (auto& [model_name, py_proc] : model_process_map) { + // TODO: check using health endpoint + if (!py_proc.IsAlive()) { + CTL_WRN("Model " << model_name << " has exited unexpectedly."); + model_process_map.erase(model_name); + continue; + } + + Json::Value val; + val["id"] = model_name; + val["engine"] = kVllmEngine; + val["start_time"] = py_proc.start_time; + val["port"] = py_proc.port; + val["object"] = "model"; + // TODO + // val["ram"]; + // val["vram"]; + model_list.append(val); + } + } + + res["object"] = "list"; + res["data"] = model_list; + + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + + callback(std::move(status), std::move(res)); }; bool VllmEngine::SetFileLogger(int max_log_lines, const std::string& log_path) { @@ -266,7 +333,6 @@ void VllmEngine::SetLogLevel(trantor::Logger::LogLevel logLevel) { throw std::runtime_error("Not implemented"); }; -// Stop inflight chat completion in stream mode void VllmEngine::StopInferencing(const std::string& model_id) { CTL_WRN("Not implemented"); throw std::runtime_error("Not implemented"); From 074a04a003bb2836841b1baf1c83549e4846c44e Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Thu, 20 Mar 2025 11:54:01 +0800 Subject: [PATCH 64/73] fix typo --- engine/services/engine_service.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index ec675ffde..c80925616 100644 --- 
a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -680,7 +680,7 @@ EngineService::GetDefaultEngineVariant(const std::string& engine) { auto config = file_manager_utils::GetCortexConfig(); std::string variant, version; - if (engine == kLlamaEngine) { + if (engine == kLlamaRepo) { variant = config.llamacppVariant; version = config.llamacppVersion; if (variant.empty() || version.empty()) From 368a4f3188d3375cf76929b0eb87aaa7f4dbfc56 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 21 Mar 2025 16:35:45 +0800 Subject: [PATCH 65/73] add non-stream chat completions --- .../extensions/python-engines/vllm_engine.cc | 97 +++++++++++++------ .../extensions/python-engines/vllm_engine.h | 4 +- engine/services/model_service.cc | 4 + 3 files changed, 76 insertions(+), 29 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index fc5603281..3946d2717 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -24,8 +24,8 @@ VllmEngine::VllmEngine() VllmEngine::~VllmEngine() { // NOTE: what happens if we can't kill subprocess? - std::unique_lock write_lock(mutex); - for (auto& [model_name, py_proc] : model_process_map) { + std::unique_lock write_lock(mutex_); + for (auto& [model_name, py_proc] : model_process_map_) { if (py_proc.IsAlive()) py_proc.Kill(); } @@ -60,15 +60,58 @@ void VllmEngine::Unload(EngineUnloadOption opts) {}; void VllmEngine::HandleChatCompletion( std::shared_ptr json_body, std::function&& callback) { - CTL_WRN("Not implemented"); - throw std::runtime_error("Not implemented"); + + // request validation should be in controller + if (!json_body->isMember("model")) { + auto [status, error] = + CreateResponse("Missing required fields: model", 400); + callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + int port; + // check if model has started + // TODO: use health check instead + { + std::shared_lock read_lock(mutex_); + if (model_process_map_.find(model) == model_process_map_.end()) { + const std::string msg = "Model " + model + " has not been loaded yet."; + auto [status, error] = CreateResponse(msg, 400); + callback(std::move(status), std::move(error)); + return; + } + port = model_process_map_[model].port; + } + + bool stream = (*json_body)["stream"].asBool(); + if (stream) { + auto [status, res] = CreateResponse("stream=true is not yet supported", 400); + callback(std::move(status), std::move(res)); + } else { + const std::string url = + "http://127.0.0.1:" + std::to_string(port) + "/v1/chat/completions"; + auto result = curl_utils::SimplePostJson(url, json_body->toStyledString()); + + if (result.has_error()) { + auto [status, res] = CreateResponse(result.error(), 400); + callback(std::move(status), std::move(res)); + } + + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(result.value())); + } }; void VllmEngine::HandleEmbedding( std::shared_ptr json_body, std::function&& callback) { - CTL_WRN("Not implemented"); - throw std::runtime_error("Not implemented"); + auto [status, res] = CreateResponse("embedding is not yet supported", 400); + callback(std::move(status), std::move(res)); }; void VllmEngine::LoadModel( @@ -85,9 +128,9 @@ void VllmEngine::LoadModel( const std::string model = (*json_body)["model"].asString(); { - 
std::unique_lock write_lock(mutex); - if (model_process_map.find(model) != model_process_map.end()) { - auto proc = model_process_map[model]; + std::unique_lock write_lock(mutex_); + if (model_process_map_.find(model) != model_process_map_.end()) { + auto proc = model_process_map_[model]; if (proc.IsAlive()) { auto [status, error] = CreateResponse("Model already loaded!", 409); @@ -96,7 +139,7 @@ void VllmEngine::LoadModel( } else { // if model has exited, try to load model again? CTL_WRN("Model " << model << " has exited unexpectedly"); - model_process_map.erase(model); + model_process_map_.erase(model); port_offsets_[proc.port - cortex_port_] = false; // free the port } } @@ -164,8 +207,8 @@ void VllmEngine::LoadModel( pid = py_proc.proc_info.pid; - std::unique_lock write_lock(mutex); - model_process_map[model] = py_proc; + std::unique_lock write_lock(mutex_); + model_process_map_[model] = py_proc; } catch (const std::exception& e) { auto e_msg = e.what(); @@ -192,8 +235,8 @@ void VllmEngine::UnloadModel( // check if model has started { - std::shared_lock read_lock(mutex); - if (model_process_map.find(model) == model_process_map.end()) { + std::shared_lock read_lock(mutex_); + if (model_process_map_.find(model) == model_process_map_.end()) { const std::string msg = "Model " + model + " has not been loaded yet."; auto [status, error] = CreateResponse(msg, 400); callback(std::move(status), std::move(error)); @@ -203,15 +246,15 @@ void VllmEngine::UnloadModel( // we know that model has started { - std::unique_lock write_lock(mutex); - auto proc = model_process_map[model]; + std::unique_lock write_lock(mutex_); + auto proc = model_process_map_[model]; // TODO: we can use vLLM health check endpoint // check if subprocess is still alive // NOTE: is this step necessary? the subprocess could have terminated // after .IsAlive() and before .Kill() later. if (!proc.IsAlive()) { - model_process_map.erase(model); + model_process_map_.erase(model); port_offsets_[proc.port - cortex_port_] = false; // free the port const std::string msg = "Model " + model + " stopped running."; @@ -221,14 +264,14 @@ void VllmEngine::UnloadModel( } // subprocess is alive. we kill it here. 
- if (!model_process_map[model].Kill()) { + if (!model_process_map_[model].Kill()) { const std::string msg = "Unable to kill process of model " + model; auto [status, error] = CreateResponse(msg, 500); callback(std::move(status), std::move(error)); return; } - model_process_map.erase(model); + model_process_map_.erase(model); port_offsets_[proc.port - cortex_port_] = false; // free the port } @@ -249,8 +292,8 @@ void VllmEngine::GetModelStatus( const std::string model = (*json_body)["model"].asString(); // check if model has started { - std::shared_lock read_lock(mutex); - if (model_process_map.find(model) == model_process_map.end()) { + std::shared_lock read_lock(mutex_); + if (model_process_map_.find(model) == model_process_map_.end()) { const std::string msg = "Model " + model + " has not been loaded yet."; auto [status, error] = CreateResponse(msg, 400); callback(std::move(status), std::move(error)); @@ -261,12 +304,12 @@ void VllmEngine::GetModelStatus( // we know that model has started // TODO: just use health check endpoint { - std::unique_lock write_lock(mutex); + std::unique_lock write_lock(mutex_); // check if subprocess is still alive - if (!model_process_map[model].IsAlive()) { + if (!model_process_map_[model].IsAlive()) { CTL_WRN("Model " << model << " has exited unexpectedly."); - model_process_map.erase(model); + model_process_map_.erase(model); const std::string msg = "Model " + model + " stopped running."; auto [status, error] = CreateResponse(msg, 400); callback(std::move(status), std::move(error)); @@ -291,12 +334,12 @@ void VllmEngine::GetModels( std::function&& callback) { Json::Value res, model_list(Json::arrayValue), status; { - std::unique_lock write_lock(mutex); - for (auto& [model_name, py_proc] : model_process_map) { + std::unique_lock write_lock(mutex_); + for (auto& [model_name, py_proc] : model_process_map_) { // TODO: check using health endpoint if (!py_proc.IsAlive()) { CTL_WRN("Model " << model_name << " has exited unexpectedly."); - model_process_map.erase(model_name); + model_process_map_.erase(model_name); continue; } diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index c41d7de4a..b13255fe3 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -12,9 +12,9 @@ class VllmEngine : public EngineI { // otherwise, cortex_port + i is not used std::vector port_offsets_; - mutable std::shared_mutex mutex; + mutable std::shared_mutex mutex_; std::unordered_map - model_process_map; + model_process_map_; public: VllmEngine(); diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index c75ed7504..accc9787e 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -1252,6 +1252,10 @@ std::string ModelService::GetEngineByModelId( CTL_WRN("Error: " + model_entry.error()); return ""; } + + if (model_entry.value().engine == kVllmEngine) + return kVllmEngine; + config::YamlHandler yaml_handler; yaml_handler.ModelConfigFromFile( fmu::ToAbsoluteCortexDataPath( From 807b201831a845d24f86579b402c7b98f625d441 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 1 Apr 2025 19:45:21 +0800 Subject: [PATCH 66/73] add uninstall cmd --- .../extensions/python-engines/python_utils.cc | 45 ++++++------------- .../extensions/python-engines/python_utils.h | 9 ++-- engine/services/engine_service.cc | 20 ++++++--- 3 files changed, 32 insertions(+), 42 deletions(-) diff --git 
a/engine/extensions/python-engines/python_utils.cc b/engine/extensions/python-engines/python_utils.cc index 07297801e..005c36b7c 100644 --- a/engine/extensions/python-engines/python_utils.cc +++ b/engine/extensions/python-engines/python_utils.cc @@ -20,11 +20,21 @@ std::filesystem::path GetUvPath() { const auto bin_name = system_info->os == kWindowsOs ? "uv.exe" : "uv"; return GetPythonEnginesPath() / "bin" / bin_name; } +bool UvCleanCache() { + auto cmd = UvBuildCommand("cache"); + cmd.push_back("clean"); + auto result = cortex::process::SpawnProcess(cmd); + if (result.has_error()) { + CTL_INF(result.error()); + return false; + } + return cortex::process::WaitProcess(result.value()); +} -bool IsUvInstalled() { +bool UvIsInstalled() { return std::filesystem::exists(GetUvPath()); } -cpp::result InstallUv() { +cpp::result UvInstall() { const auto py_bin_path = GetPythonEnginesPath() / "bin"; std::filesystem::create_directories(py_bin_path); @@ -75,7 +85,7 @@ cpp::result InstallUv() { // this Python installation. // we can add this once we allow passing custom env var to SpawnProcess(). // https://docs.astral.sh/uv/reference/cli/#uv-python-install - std::vector command = BuildUvCommand("python"); + std::vector command = UvBuildCommand("python"); command.push_back("install"); command.push_back("3.10"); @@ -92,7 +102,7 @@ cpp::result InstallUv() { return {}; } -std::vector BuildUvCommand(const std::string& action, +std::vector UvBuildCommand(const std::string& action, const std::string& directory) { // use our own cache dir so that when users delete cortexcpp/, everything is deleted. const auto cache_dir = GetPythonEnginesPath() / "cache" / "uv"; @@ -106,31 +116,4 @@ std::vector BuildUvCommand(const std::string& action, return command; } -// cpp::result UvDownloadDeps( -// const std::filesystem::path& model_dir) { -// if (!IsUvInstalled()) -// return cpp::fail( -// "uv is not installed. Please run `cortex engines install python`."); - -// std::vector command = BuildUvCommand("sync", model_dir.string()); - -// // script mode. 1st argument is path to .py script -// if (!std::filesystem::exists(model_dir / "pyproject.toml")) { -// config::PythonModelConfig py_cfg; -// py_cfg.ReadFromYaml((model_dir / "model.yml").string()); -// command.push_back("--script"); -// command.push_back(py_cfg.entrypoint[0]); -// } - -// auto result = cortex::process::SpawnProcess(command); -// if (result.has_error()) -// return cpp::fail("Fail to install Python dependencies. 
" + result.error()); - -// if (!cortex::process::WaitProcess(result.value())) { -// return cpp::fail("Fail to install Python dependencies."); -// } - -// return {}; -// } - } // namespace python_utils diff --git a/engine/extensions/python-engines/python_utils.h b/engine/extensions/python-engines/python_utils.h index 97b2d3f15..5206eb7f1 100644 --- a/engine/extensions/python-engines/python_utils.h +++ b/engine/extensions/python-engines/python_utils.h @@ -14,12 +14,11 @@ std::filesystem::path GetEnvsPath(); std::filesystem::path GetUvPath(); // UV-related functions -bool IsUvInstalled(); -cpp::result InstallUv(); -std::vector BuildUvCommand(const std::string& action, +bool UvIsInstalled(); +cpp::result UvInstall(); +std::vector UvBuildCommand(const std::string& action, const std::string& directory = ""); -// cpp::result UvDownloadDeps( -// const std::filesystem::path& yaml_path); +bool UvCleanCache(); struct PythonSubprocess { cortex::process::ProcessInfo proc_info; diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index e0205919f..4da119c3d 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -201,6 +201,16 @@ cpp::result EngineService::UninstallEngineVariant( } else { return cpp::fail("No variant provided"); } + } else if (ne == kVllmEngine) { + // variant is ignored for vLLM + if (version == std::nullopt) { + path_to_remove = python_utils::GetEnvsPath() / "vllm"; + + // we only clean uv cache when all vLLM versions are deleted + python_utils::UvCleanCache(); + } else { + path_to_remove = python_utils::GetEnvsPath() / "vllm" / version.value(); + } } else { return cpp::fail("Not implemented for engine " + ne); } @@ -394,8 +404,8 @@ cpp::result EngineService::DownloadVllm( // NOTE: everything below is not async // to make it async, we have to run everything in a thread (spawning and waiting // for subprocesses) - if (!python_utils::IsUvInstalled()) { - auto result = python_utils::InstallUv(); + if (!python_utils::UvIsInstalled()) { + auto result = python_utils::UvInstall(); if (result.has_error()) return result; } @@ -421,21 +431,20 @@ cpp::result EngineService::DownloadVllm( // initialize venv if (!fs::exists(vllm_path / ".venv")) { std::vector cmd = - python_utils::BuildUvCommand("venv", vllm_path.string()); + python_utils::UvBuildCommand("venv", vllm_path.string()); cmd.push_back("--relocatable"); auto result = cortex::process::SpawnProcess(cmd); if (result.has_error()) return cpp::fail(result.error()); // TODO: check return code - // NOTE: these are not async cortex::process::WaitProcess(result.value()); } // install vLLM { std::vector cmd = - python_utils::BuildUvCommand("pip", vllm_path.string()); + python_utils::UvBuildCommand("pip", vllm_path.string()); cmd.push_back("install"); cmd.push_back("vllm==" + concrete_version); auto result = cortex::process::SpawnProcess(cmd); @@ -444,7 +453,6 @@ cpp::result EngineService::DownloadVllm( // TODO: check return code // one reason this may fail is that the requested version does not exist - // NOTE: these are not async cortex::process::WaitProcess(result.value()); } From d38eca8a47d295a4f6c640d2dc942c0866ec66cd Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 1 Apr 2025 21:35:27 +0800 Subject: [PATCH 67/73] support streaming --- .../extensions/python-engines/vllm_engine.cc | 98 +++++++++++++++++-- .../extensions/python-engines/vllm_engine.h | 4 + engine/services/engine_service.cc | 1 + 3 files changed, 95 insertions(+), 8 deletions(-) diff --git 
a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 3946d2717..9564c13a4 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -14,13 +14,47 @@ static std::pair CreateResponse( res["message"] = msg; return {status, res}; } + +// this is mostly copied from local_engine.cc +struct StreamContext { + std::shared_ptr> callback; + bool need_stop; + + static size_t write_callback(char* ptr, size_t size, size_t nmemb, + void* userdata) { + auto* ctx = static_cast(userdata); + size_t data_length = size * nmemb; + if (data_length <= 6) + return data_length; + + std::string chunk{ptr, data_length}; + CTL_INF(chunk); + Json::Value status; + status["is_stream"] = true; + status["has_error"] = false; + status["status_code"] = 200; + Json::Value chunk_json; + chunk_json["data"] = chunk; + + if (chunk.find("[DONE]") != std::string::npos) { + status["is_done"] = true; + ctx->need_stop = false; + } else { + status["is_done"] = false; + } + + (*ctx->callback)(std::move(status), std::move(chunk_json)); + return data_length; + }; +}; + } // namespace VllmEngine::VllmEngine() : cortex_port_{std::stoi( file_manager_utils::GetCortexConfig().apiServerPort)}, - port_offsets_{true} // cortex_port + 0 is always used (by cortex itself) -{} + port_offsets_{true}, // cortex_port + 0 is always used (by cortex itself) + queue_{2 /* threadNum */, "vLLM engine"} {} VllmEngine::~VllmEngine() { // NOTE: what happens if we can't kill subprocess? @@ -84,14 +118,62 @@ void VllmEngine::HandleChatCompletion( port = model_process_map_[model].port; } + const std::string url = + "http://127.0.0.1:" + std::to_string(port) + "/v1/chat/completions"; + const std::string json_str = json_body->toStyledString(); + bool stream = (*json_body)["stream"].asBool(); if (stream) { - auto [status, res] = CreateResponse("stream=true is not yet supported", 400); - callback(std::move(status), std::move(res)); + queue_.runTaskInQueue([url = std::move(url), json_str = std::move(json_str), + callback = std::move(callback)] { + CURL* curl = curl_easy_init(); + if (!curl) { + auto [status, res] = CreateResponse("Internal server error", 500); + callback(std::move(status), std::move(res)); + } + + struct curl_slist* headers = nullptr; + headers = curl_slist_append(headers, "Content-Type: application/json"); + + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_str.c_str()); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, json_str.length()); + curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1L); + + StreamContext ctx; + ctx.callback = + std::make_shared>( + callback); + ctx.need_stop = true; + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, + StreamContext::write_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &ctx); + + CURLcode res = curl_easy_perform(curl); + if (res != CURLE_OK) { + auto msg = curl_easy_strerror(res); + auto [status, res] = CreateResponse(msg, 500); + callback(std::move(status), std::move(res)); + } + + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + if (ctx.need_stop) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + callback(std::move(status), Json::Value{}); + } + + return; + }); } else { - const std::string url = - "http://127.0.0.1:" + std::to_string(port) + 
"/v1/chat/completions"; - auto result = curl_utils::SimplePostJson(url, json_body->toStyledString()); + // non-streaming + auto result = curl_utils::SimplePostJson(url, json_str); if (result.has_error()) { auto [status, res] = CreateResponse(result.error(), 400); @@ -173,7 +255,7 @@ void VllmEngine::LoadModel( // https://docs.astral.sh/uv/reference/cli/#uv-run std::vector cmd = - python_utils::BuildUvCommand("run", env_dir.string()); + python_utils::UvBuildCommand("run", env_dir.string()); cmd.push_back("vllm"); cmd.push_back("serve"); cmd.push_back(model_path.string()); diff --git a/engine/extensions/python-engines/vllm_engine.h b/engine/extensions/python-engines/vllm_engine.h index b13255fe3..d7724b703 100644 --- a/engine/extensions/python-engines/vllm_engine.h +++ b/engine/extensions/python-engines/vllm_engine.h @@ -2,6 +2,7 @@ #include "common/engine_servicei.h" #include "cortex-common/EngineI.h" #include "python_utils.h" +#include "trantor/utils/ConcurrentTaskQueue.h" class VllmEngine : public EngineI { private: @@ -16,6 +17,9 @@ class VllmEngine : public EngineI { std::unordered_map model_process_map_; + // TODO: will use cortex's main TaskQueue once llama.cpp PR is merged + trantor::ConcurrentTaskQueue queue_; + public: VllmEngine(); ~VllmEngine(); diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 4da119c3d..9df6b74a2 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -433,6 +433,7 @@ cpp::result EngineService::DownloadVllm( std::vector cmd = python_utils::UvBuildCommand("venv", vllm_path.string()); cmd.push_back("--relocatable"); + cmd.push_back("--seed"); auto result = cortex::process::SpawnProcess(cmd); if (result.has_error()) return cpp::fail(result.error()); From 7e002cd4a11b3d2572d819194e76b58996ba9aab Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Tue, 1 Apr 2025 21:52:04 +0800 Subject: [PATCH 68/73] fix cortex run --- engine/cli/commands/chat_completion_cmd.cc | 6 +++++- engine/cli/commands/run_cmd.cc | 17 ++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/engine/cli/commands/chat_completion_cmd.cc b/engine/cli/commands/chat_completion_cmd.cc index 77ee4fca3..6b52464f3 100644 --- a/engine/cli/commands/chat_completion_cmd.cc +++ b/engine/cli/commands/chat_completion_cmd.cc @@ -137,7 +137,11 @@ void ChatCompletionCmd::Exec(const std::string& host, int port, new_data["content"] = user_input; histories_.push_back(std::move(new_data)); - Json::Value json_data = mc.ToJson(); + // vLLM doesn't support params used model config + Json::Value json_data; + if (mc.engine != kVllmEngine) { + json_data = mc.ToJson(); + } json_data["engine"] = mc.engine; Json::Value msgs_array(Json::arrayValue); diff --git a/engine/cli/commands/run_cmd.cc b/engine/cli/commands/run_cmd.cc index c01d3d806..25f3ae45d 100644 --- a/engine/cli/commands/run_cmd.cc +++ b/engine/cli/commands/run_cmd.cc @@ -84,11 +84,18 @@ void RunCmd::Exec(bool run_detach, CLI_LOG("Error: " + model_entry.error()); return; } - yaml_handler.ModelConfigFromFile( - fmu::ToAbsoluteCortexDataPath( - fs::path(model_entry.value().path_to_model_yaml)) - .string()); - auto mc = yaml_handler.GetModelConfig(); + + config::ModelConfig mc; + if (model_entry.value().engine == kVllmEngine) { + // vLLM engine doesn't have model config + mc.engine = kVllmEngine; + } else { + yaml_handler.ModelConfigFromFile( + fmu::ToAbsoluteCortexDataPath( + fs::path(model_entry.value().path_to_model_yaml)) + .string()); + mc = yaml_handler.GetModelConfig(); 
+ } // Check if engine existed. If not, download it { From 1ebbbdb8cb0aaa70c14f6a644716a5b924fdc192 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 2 Apr 2025 20:31:49 +0800 Subject: [PATCH 69/73] wait for vLLM server to be up --- .../extensions/python-engines/vllm_engine.cc | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 9564c13a4..e8192e569 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -280,15 +280,27 @@ void VllmEngine::LoadModel( if (result.has_error()) { throw std::runtime_error(result.error()); } + auto proc_info = result.value(); + pid = proc_info.pid; + + // wait for server to be up + while (true) { + CTL_INF("Wait for vLLM server to be up. Sleep for 5s"); + std::this_thread::sleep_for(std::chrono::seconds(5)); + if (!cortex::process::IsProcessAlive(proc_info)) + throw std::runtime_error("vLLM subprocess fails to start"); + + const auto url = "http://127.0.0.1:" + std::to_string(port) + "/health"; + if (curl_utils::SimpleGet(url).has_value()) + break; + } python_utils::PythonSubprocess py_proc; - py_proc.proc_info = result.value(); + py_proc.proc_info = proc_info; py_proc.port = port; py_proc.start_time = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1); - pid = py_proc.proc_info.pid; - std::unique_lock write_lock(mutex_); model_process_map_[model] = py_proc; From b5d83156cc63b5f44a0ebc5f57ae1f709de86fda Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 2 Apr 2025 20:45:52 +0800 Subject: [PATCH 70/73] use health check for some stuff --- .../extensions/python-engines/vllm_engine.cc | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index e8192e569..3cc30e37f 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -106,7 +106,6 @@ void VllmEngine::HandleChatCompletion( const std::string model = (*json_body)["model"].asString(); int port; // check if model has started - // TODO: use health check instead { std::shared_lock read_lock(mutex_); if (model_process_map_.find(model) == model_process_map_.end()) { @@ -274,7 +273,6 @@ void VllmEngine::LoadModel( if (!std::filesystem::exists(stderr_file)) std::ofstream(stderr_file).flush(); - // TODO: may want to wait until model is ready i.e. health check endpoint auto result = cortex::process::SpawnProcess(cmd, stdout_file.string(), stderr_file.string()); if (result.has_error()) { @@ -284,6 +282,7 @@ void VllmEngine::LoadModel( pid = proc_info.pid; // wait for server to be up + // NOTE: should we add a timeout to avoid endless loop? while (true) { CTL_INF("Wait for vLLM server to be up. Sleep for 5s"); std::this_thread::sleep_for(std::chrono::seconds(5)); @@ -343,7 +342,6 @@ void VllmEngine::UnloadModel( std::unique_lock write_lock(mutex_); auto proc = model_process_map_[model]; - // TODO: we can use vLLM health check endpoint // check if subprocess is still alive // NOTE: is this step necessary? the subprocess could have terminated // after .IsAlive() and before .Kill() later. 
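One possible way to bound the startup wait introduced in [PATCH 69/73] above, where the NOTE asks whether the loop needs a timeout (a sketch under that assumption, not part of the patch; the probe count is arbitrary). It reuses IsProcessAlive and curl_utils::SimpleGet exactly as the patch does, and throws so the surrounding try/catch in LoadModel turns the failure into a 500 response:

    constexpr int kMaxHealthProbes = 60;  // illustrative cap, ~5 minutes
    for (int i = 0;; i++) {
      if (i >= kMaxHealthProbes)
        throw std::runtime_error("vLLM server did not become healthy in time");
      CTL_INF("Wait for vLLM server to be up. Sleep for 5s");
      std::this_thread::sleep_for(std::chrono::seconds(5));
      if (!cortex::process::IsProcessAlive(proc_info))
        throw std::runtime_error("vLLM subprocess fails to start");
      const auto url = "http://127.0.0.1:" + std::to_string(port) + "/health";
      if (curl_utils::SimpleGet(url).has_value())
        break;  // /health returned 200, server is ready
    }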
@@ -396,27 +394,32 @@ void VllmEngine::GetModelStatus( } // we know that model has started - // TODO: just use health check endpoint { std::unique_lock write_lock(mutex_); + auto py_proc = model_process_map_[model]; + + // health check endpoint + const auto url = + "http://127.0.0.1:" + std::to_string(py_proc.port) + "/health"; + if (curl_utils::SimpleGet(url).has_value()) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), Json::Value{}); + } else { + // try to kill the subprocess to free resources, in case the server hangs + // instead of subprocess has died. + py_proc.Kill(); - // check if subprocess is still alive - if (!model_process_map_[model].IsAlive()) { CTL_WRN("Model " << model << " has exited unexpectedly."); model_process_map_.erase(model); const std::string msg = "Model " + model + " stopped running."; auto [status, error] = CreateResponse(msg, 400); callback(std::move(status), std::move(error)); - return; } } - - Json::Value res, status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = 200; - callback(std::move(status), std::move(res)); }; bool VllmEngine::IsSupported(const std::string& f) { @@ -430,8 +433,13 @@ void VllmEngine::GetModels( { std::unique_lock write_lock(mutex_); for (auto& [model_name, py_proc] : model_process_map_) { - // TODO: check using health endpoint - if (!py_proc.IsAlive()) { + const auto url = + "http://127.0.0.1:" + std::to_string(py_proc.port) + "/health"; + if (curl_utils::SimpleGet(url).has_error()) { + // try to kill the subprocess to free resources, in case the server hangs + // instead of subprocess has died. + py_proc.Kill(); + CTL_WRN("Model " << model_name << " has exited unexpectedly."); model_process_map_.erase(model_name); continue; From 5feda51361d7cfe0baab558e0b8582cd72fe5ce3 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 2 Apr 2025 21:19:48 +0800 Subject: [PATCH 71/73] add some notes. support embeddings. support some extra vLLM args --- .../extensions/python-engines/vllm_engine.cc | 65 ++++++++++++++++++- engine/services/inference_service.cc | 4 +- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 3cc30e37f..5bdab068a 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -1,3 +1,9 @@ +// Note on subprocess lifecycle +// In LoadModel(), we will wait until /health returns 200. Thus, in subsequent +// calls to the subprocess, if the server is working normally, /health is +// guaranteed to return 200. If it doesn't, it either means the subprocess has +// died or the server hangs (for whatever reason). 
+ #include "vllm_engine.h" #include #include "services/engine_service.h" @@ -82,6 +88,7 @@ std::vector VllmEngine::GetVariants() { return variants; } +// TODO: once llama-server is merged, check if checking 'v' is still needed void VllmEngine::Load(EngineLoadOption opts) { version_ = opts.engine_path; // engine path actually contains version info if (version_[0] == 'v') @@ -95,7 +102,7 @@ void VllmEngine::HandleChatCompletion( std::shared_ptr json_body, std::function&& callback) { - // request validation should be in controller + // NOTE: request validation should be in controller if (!json_body->isMember("model")) { auto [status, error] = CreateResponse("Missing required fields: model", 400); @@ -188,11 +195,49 @@ void VllmEngine::HandleChatCompletion( } }; +// NOTE: we don't have an option to pass --task embed to vLLM spawn yet void VllmEngine::HandleEmbedding( std::shared_ptr json_body, std::function&& callback) { - auto [status, res] = CreateResponse("embedding is not yet supported", 400); - callback(std::move(status), std::move(res)); + + if (!json_body->isMember("model")) { + auto [status, error] = + CreateResponse("Missing required fields: model", 400); + callback(std::move(status), std::move(error)); + return; + } + + const std::string model = (*json_body)["model"].asString(); + int port; + // check if model has started + { + std::shared_lock read_lock(mutex_); + if (model_process_map_.find(model) == model_process_map_.end()) { + const std::string msg = "Model " + model + " has not been loaded yet."; + auto [status, error] = CreateResponse(msg, 400); + callback(std::move(status), std::move(error)); + return; + } + port = model_process_map_[model].port; + } + + const std::string url = + "http://127.0.0.1:" + std::to_string(port) + "/v1/embeddings"; + const std::string json_str = json_body->toStyledString(); + + auto result = curl_utils::SimplePostJson(url, json_str); + + if (result.has_error()) { + auto [status, res] = CreateResponse(result.error(), 400); + callback(std::move(status), std::move(res)); + } + + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(result.value())); }; void VllmEngine::LoadModel( @@ -213,6 +258,10 @@ void VllmEngine::LoadModel( if (model_process_map_.find(model) != model_process_map_.end()) { auto proc = model_process_map_[model]; + // NOTE: each vLLM instance can only serve 1 task. It means that the + // following logic will not allow serving the same model for 2 different + // tasks at the same time. + // To support it, we also need to know how vLLM decides the default task. if (proc.IsAlive()) { auto [status, error] = CreateResponse("Model already loaded!", 409); callback(std::move(status), std::move(error)); @@ -263,6 +312,16 @@ void VllmEngine::LoadModel( cmd.push_back("--served-model-name"); cmd.push_back(model); + // NOTE: we might want to adjust max-model-len automatically, since vLLM + // may OOM for large models as it tries to allocate full context length. 
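  // Illustrative example (not from the patch; model name and value are made up):
  // a load request body of {"model": "...", "task": "embed", "max-model-len": "8192"}
  // would be forwarded by the loop below as `--task embed --max-model-len 8192`
  // on the `vllm serve` command line assembled above.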
+ const std::string EXTRA_ARGS[] = {"task", "max-model-len"}; + for (const auto arg : EXTRA_ARGS) { + if (json_body->isMember(arg)) { + cmd.push_back("--" + arg); + cmd.push_back((*json_body)[arg].asString()); + } + } + const auto stdout_file = env_dir / "stdout.log"; const auto stderr_file = env_dir / "stderr.log"; diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index bd79a6ce5..f1d38e76a 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -119,7 +119,9 @@ cpp::result InferenceService::HandleEmbedding( std::shared_ptr q, std::shared_ptr json_body) { std::string engine_type; if (!HasFieldInReq(json_body, "engine")) { - engine_type = kLlamaRepo; + auto engine_type_maybe = + GetEngineByModelId((*json_body)["model"].asString()); + engine_type = engine_type_maybe.empty() ? kLlamaRepo : engine_type_maybe; } else { engine_type = (*(json_body)).get("engine", kLlamaRepo).asString(); } From 5eea3452e4b3f4a552acf27be3468d7f557fd1d7 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 2 Apr 2025 21:26:43 +0800 Subject: [PATCH 72/73] remove old tests. some chores --- engine/e2e-test/api/engines/test_api_engine.py | 10 ---------- .../api/engines/test_api_engine_install_nightly.py | 4 ---- .../e2e-test/cli/engines/test_cli_engine_install.py | 11 ----------- .../cli/engines/test_cli_engine_install_nightly.py | 11 ----------- engine/extensions/python-engines/python_utils.cc | 2 +- engine/extensions/python-engines/vllm_engine.cc | 1 + 6 files changed, 2 insertions(+), 37 deletions(-) diff --git a/engine/e2e-test/api/engines/test_api_engine.py b/engine/e2e-test/api/engines/test_api_engine.py index 22fadf5d0..dbdf2dbe9 100644 --- a/engine/e2e-test/api/engines/test_api_engine.py +++ b/engine/e2e-test/api/engines/test_api_engine.py @@ -52,16 +52,6 @@ async def test_engines_install_uninstall_llamacpp_should_be_successful(self): response = requests.delete("http://localhost:3928/v1/engines/llama-cpp/install") assert response.status_code == 200 - @pytest.mark.asyncio - async def test_engines_install_uninstall_python_should_be_successful(self): - response = requests.post("http://localhost:3928/v1/engines/python-engine/install") - assert response.status_code == 200 - await wait_for_websocket_download_success_event(timeout=None) - time.sleep(30) - - response = requests.delete("http://localhost:3928/v1/engines/python-engine/install") - assert response.status_code == 200 - @pytest.mark.asyncio async def test_engines_install_uninstall_llamacpp_with_only_version_should_be_failed(self): # install first diff --git a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py index 50dbbeee5..e92afb14b 100644 --- a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py +++ b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py @@ -22,10 +22,6 @@ def test_engines_install_llamacpp_should_be_successful(self): response = requests.post("http://localhost:3928/v1/engines/llama-cpp/install") assert response.status_code == 200 - def test_engines_install_python_should_be_successful(self): - response = requests.post("http://localhost:3928/v1/engines/python-engine/install") - assert response.status_code == 200 - def test_engines_install_llamacpp_specific_version_and_variant(self): data = {"version": latest_pre_release_tag, "variant": "linux-amd64-avx"} response = requests.post( diff --git a/engine/e2e-test/cli/engines/test_cli_engine_install.py 
b/engine/e2e-test/cli/engines/test_cli_engine_install.py index ca298c828..370ebe3f3 100644 --- a/engine/e2e-test/cli/engines/test_cli_engine_install.py +++ b/engine/e2e-test/cli/engines/test_cli_engine_install.py @@ -31,17 +31,6 @@ def test_engines_install_llamacpp_should_be_successfully(self): assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" - def test_engines_install_python_should_be_successfully(self): - exit_code, output, error = run( - "Install Engine", - ["engines", "install", "python-engine"], - timeout=None, - capture=False, - ) - response = requests.get("http://127.0.0.1:3928/v1/engines/python-engine") - assert len(response.json()) > 0 - assert exit_code == 0, f"Install engine failed with error: {error}" - @pytest.mark.skipif(reason="Ignore onnx-runtime test") def test_engines_install_onnx_on_macos_should_be_failed(self): exit_code, output, error = run( diff --git a/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py b/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py index 68f09aaf3..42835c4a0 100644 --- a/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py +++ b/engine/e2e-test/cli/engines/test_cli_engine_install_nightly.py @@ -31,17 +31,6 @@ def test_engines_install_llamacpp_should_be_successfully(self): assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" - def test_engines_install_python_should_be_successfully(self): - exit_code, output, error = run( - "Install Engine", - ["engines", "install", "python-engine"], - timeout=None, - capture=False, - ) - response = requests.get("http://127.0.0.1:3928/v1/engines/python-engine") - assert len(response.json()) > 0 - assert exit_code == 0, f"Install engine failed with error: {error}" - @pytest.mark.skipif(reason="Ignore onnx-runtime test") def test_engines_install_onnx_on_macos_should_be_failed(self): exit_code, output, error = run( diff --git a/engine/extensions/python-engines/python_utils.cc b/engine/extensions/python-engines/python_utils.cc index 005c36b7c..965b4c324 100644 --- a/engine/extensions/python-engines/python_utils.cc +++ b/engine/extensions/python-engines/python_utils.cc @@ -39,7 +39,7 @@ cpp::result UvInstall() { std::filesystem::create_directories(py_bin_path); // NOTE: do we need a mechanism to update uv, or just pin uv version with cortex release? 
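Pinning uv per cortex release, as the NOTE above suggests, keeps installs reproducible; the trade-off is that picking up a newer uv requires shipping a cortex release. A sketch of how a pinned version could be assembled into a GitHub release download URL; the target-triple asset names follow uv's published releases but are assumptions here, not the exact logic in python_utils.cc, and should be verified against https://github.com/astral-sh/uv/releases:

#include <iostream>
#include <sstream>
#include <string>

// Build a download URL for a pinned uv release from OS and architecture.
// The triples below mirror uv's release asset naming, taken as an assumption.
static std::string UvDownloadUrl(const std::string& version,
                                 const std::string& os,
                                 const std::string& arch) {
  std::ostringstream triple;
  if (os == "linux") {
    triple << (arch == "arm64" ? "aarch64" : "x86_64") << "-unknown-linux-gnu";
  } else if (os == "mac") {
    triple << (arch == "arm64" ? "aarch64" : "x86_64") << "-apple-darwin";
  } else {
    triple << "x86_64-pc-windows-msvc";
  }

  std::ostringstream url;
  url << "https://github.com/astral-sh/uv/releases/download/" << version
      << "/uv-" << triple.str() << (os == "windows" ? ".zip" : ".tar.gz");
  return url.str();
}

int main() {
  // Keeping the version in one constant is what makes the pin easy to bump.
  const std::string uv_version = "0.6.11";
  std::cout << UvDownloadUrl(uv_version, "linux", "amd64") << '\n';
  return 0;
}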
- const std::string uv_version = "0.6.3"; + const std::string uv_version = "0.6.11"; // build download url based on system info std::stringstream fname_stream; diff --git a/engine/extensions/python-engines/vllm_engine.cc b/engine/extensions/python-engines/vllm_engine.cc index 5bdab068a..b05e651c5 100644 --- a/engine/extensions/python-engines/vllm_engine.cc +++ b/engine/extensions/python-engines/vllm_engine.cc @@ -81,6 +81,7 @@ std::vector VllmEngine::GetVariants() { std::vector variants; for (const auto& entry : fs::directory_iterator(vllm_path)) { const auto name = "linux-amd64-cuda"; // arbitrary + // TODO: after llama-server is merged, check if we need to add "v" const auto version_str = "v" + entry.path().filename().string(); const EngineVariantResponse variant{name, version_str, kVllmEngine}; variants.push_back(variant); From 2bde26a62235c5286c82923b87807c56e66361ee Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Wed, 2 Apr 2025 21:30:49 +0800 Subject: [PATCH 73/73] remove unused function --- engine/services/inference_service.cc | 8 -------- 1 file changed, 8 deletions(-) diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index f1d38e76a..86d452c75 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -4,14 +4,6 @@ #include "utils/function_calling/common.h" #include "utils/jinja_utils.h" -static InferResult GetUnsupportedResponse(const std::string& msg) { - Json::Value res, stt; - res["message"] = msg; - stt["status_code"] = drogon::k400BadRequest; - LOG_WARN << msg; - return std::make_pair(stt, res); -} - cpp::result InferenceService::HandleChatCompletion( std::shared_ptr q, std::shared_ptr json_body) { std::string engine_type;
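For requests that omit "engine", patch 71 changed HandleEmbedding to look the engine up from the model id before falling back to the default. A condensed sketch of that resolution chain; the plain map stands in for GetEngineByModelId, and the literal "llama-cpp" is only a stand-in for the kLlamaRepo constant:

#include <iostream>
#include <string>
#include <unordered_map>

// Fallback chain when a request may omit the "engine" field:
// 1. explicit engine in the request, 2. engine recorded for the model id,
// 3. the default engine. The map stands in for the model database lookup.
static std::string ResolveEngine(
    const std::string& requested_engine, const std::string& model_id,
    const std::unordered_map<std::string, std::string>& model_to_engine) {
  if (!requested_engine.empty()) return requested_engine;
  if (auto it = model_to_engine.find(model_id); it != model_to_engine.end())
    return it->second;
  return "llama-cpp";  // stand-in for the default engine constant
}

int main() {
  const std::unordered_map<std::string, std::string> model_to_engine = {
      {"my-vllm-model", "vllm"},  // placeholder entry
  };
  std::cout << ResolveEngine("", "my-vllm-model", model_to_engine) << '\n';  // vllm
  std::cout << ResolveEngine("", "unknown-model", model_to_engine) << '\n';  // llama-cpp
  std::cout << ResolveEngine("python", "my-vllm-model", model_to_engine) << '\n';  // python
  return 0;
}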