Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/src/python/devices_and_streams.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@ Devices and Streams
set_default_stream
stream
synchronize
device_count
device_info
48 changes: 48 additions & 0 deletions mlx/backend/cpu/available.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,58 @@

#include "mlx/backend/cpu/available.h"

#ifdef __APPLE__
#include <sys/sysctl.h>
#endif

namespace mlx::core::cpu {

namespace {

// Get CPU architecture string
// Get the CPU architecture as a short canonical string.
//
// Detection is purely compile-time via predefined compiler macros. Both the
// GCC/Clang and the MSVC spellings are checked for each architecture
// (previously the arm64 branch was missing `_M_ARM64`, so MSVC/ARM64 builds
// reported "unknown").
std::string get_cpu_architecture() {
#if defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
  return "arm64";
#elif defined(__x86_64__) || defined(_M_X64)
  return "x86_64";
#elif defined(__i386__) || defined(__i386) || defined(_M_IX86)
  return "x86";
#elif defined(__arm__) || defined(_M_ARM)
  return "arm";
#else
  // Unrecognized target; callers treat this as an opaque label.
  return "unknown";
#endif
}

// Get CPU device name
// Get a human-readable CPU device name.
//
// On macOS, queries the marketing name via sysctl ("machdep.cpu.brand_string").
// On other platforms, or if the sysctl fails, falls back to the architecture
// string from get_cpu_architecture().
std::string get_cpu_name() {
#ifdef __APPLE__
  char model[256];
  size_t len = sizeof(model);
  // Pass the array (decays to char*) rather than `&model` (char(*)[256]);
  // use nullptr for the unused "new value" arguments.
  if (sysctlbyname("machdep.cpu.brand_string", model, &len, nullptr, 0) == 0) {
    return std::string(model);
  }
#endif
  return get_cpu_architecture();
}

} // anonymous namespace

bool is_available() {
  // The CPU backend is unconditionally available; this always returns true.
  return true;
}

// The host CPU is exposed as exactly one logical device, so the count is
// always 1 regardless of core/socket topology.
int device_count() {
  return 1;
}

// Get CPU device information.
//
// Returns a reference to a lazily-built, cached map with:
// - "device_name" (string): CPU model name (architecture string as fallback)
// - "architecture" (string): e.g. "arm64", "x86_64"
//
// The index parameter is ignored: there is only one CPU device
// (see device_count()). The returned reference stays valid for the
// lifetime of the process.
const std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int /* device_index */) {
  // `static const`: built once (thread-safe under C++11 magic statics) and
  // immutable afterwards, matching the const& return type.
  static const auto info =
      std::unordered_map<std::string, std::variant<std::string, size_t>>{
          {"device_name", get_cpu_name()},
          {"architecture", get_cpu_architecture()}};
  return info;
}

} // namespace mlx::core::cpu
18 changes: 18 additions & 0 deletions mlx/backend/cpu/available.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,26 @@

#pragma once

#include <string>
#include <unordered_map>
#include <variant>

namespace mlx::core::cpu {

/** Check whether the CPU backend is available (always true). */
bool is_available();

/**
 * Get the number of available CPU devices.
 *
 * For CPU, always returns 1.
 */
int device_count();

/**
 * Get CPU device information.
 *
 * Returns a map with basic CPU device properties:
 * - "device_name" (string): CPU model name, or the architecture string
 *   when no richer name is available
 * - "architecture" (string): e.g. "arm64", "x86_64"
 *
 * The index is ignored since there is a single CPU device.
 */
const std::unordered_map<std::string, std::variant<std::string, size_t>>& device_info(int device_index = 0);

} // namespace mlx::core::cpu
227 changes: 227 additions & 0 deletions mlx/backend/cuda/cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,237 @@

#include "mlx/backend/cuda/cuda.h"

#include <cuda_runtime.h>
#include <string>
#include <unordered_map>
#include <variant>
#include <vector>

#ifdef _WIN32
#include <windows.h>
#else
#include <dlfcn.h>
#endif

namespace mlx::core::cu {

namespace {

// NVML dynamic loading for accurate memory reporting
// (cudaMemGetInfo only sees current process)

// Minimal NVML surface, declared locally so the library can be loaded at
// runtime instead of being a link-time dependency.
typedef int nvmlReturn_t;
typedef struct nvmlDevice_st* nvmlDevice_t;
struct nvmlMemory_t {
  unsigned long long total;
  unsigned long long free;
  unsigned long long used;
};

// Dynamically-loaded NVML library handle plus the entry points we use.
// Invariant: `handle == nullptr` means NVML is unusable.
struct NVMLState {
  void* handle = nullptr;
  nvmlReturn_t (*nvmlInit_v2)() = nullptr;
  nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char*, nvmlDevice_t*) =
      nullptr;
  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t*) =
      nullptr;
};

// Unload the NVML library (if loaded) and reset `nvml` to the unusable state.
void nvml_unload(NVMLState& nvml) {
  if (nvml.handle) {
#ifdef _WIN32
    FreeLibrary((HMODULE)nvml.handle);
#else
    dlclose(nvml.handle);
#endif
  }
  nvml = NVMLState{};
}

// Load NVML and resolve the required symbols. Returns true on success.
//
// On ANY failure the library is unloaded and `nvml` is fully reset, so a
// partially-initialized state (library loaded but symbols missing, or
// nvmlInit_v2 failed) can never be observed by callers that only check
// `nvml.handle`.
bool nvml_init(NVMLState& nvml) {
#ifdef _WIN32
  nvml.handle = LoadLibraryA("nvml.dll");
  if (!nvml.handle) {
    // Fall back to the legacy NVSMI install location.
    nvml.handle =
        LoadLibraryA("C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvml.dll");
  }
  if (!nvml.handle)
    return false;
  nvml.nvmlInit_v2 = (decltype(nvml.nvmlInit_v2))GetProcAddress(
      (HMODULE)nvml.handle, "nvmlInit_v2");
  nvml.nvmlDeviceGetHandleByUUID =
      (decltype(nvml.nvmlDeviceGetHandleByUUID))GetProcAddress(
          (HMODULE)nvml.handle, "nvmlDeviceGetHandleByUUID");
  nvml.nvmlDeviceGetMemoryInfo =
      (decltype(nvml.nvmlDeviceGetMemoryInfo))GetProcAddress(
          (HMODULE)nvml.handle, "nvmlDeviceGetMemoryInfo");
#else
  nvml.handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
  if (!nvml.handle)
    return false;
  nvml.nvmlInit_v2 =
      (decltype(nvml.nvmlInit_v2))dlsym(nvml.handle, "nvmlInit_v2");
  nvml.nvmlDeviceGetHandleByUUID =
      (decltype(nvml.nvmlDeviceGetHandleByUUID))dlsym(
          nvml.handle, "nvmlDeviceGetHandleByUUID");
  nvml.nvmlDeviceGetMemoryInfo = (decltype(nvml.nvmlDeviceGetMemoryInfo))dlsym(
      nvml.handle, "nvmlDeviceGetMemoryInfo");
#endif

  if (!nvml.nvmlInit_v2 || !nvml.nvmlDeviceGetHandleByUUID ||
      !nvml.nvmlDeviceGetMemoryInfo || nvml.nvmlInit_v2() != 0) {
    // Previously a failure here left `handle` and stale function pointers
    // set; clean up so the state is all-or-nothing.
    nvml_unload(nvml);
    return false;
  }
  return true;
}

// Query free/total device memory (in bytes) via NVML for the GPU with the
// given `uuid` string. Returns false when NVML is not loaded or any NVML
// call fails; on success writes the counts into `free` and `total`.
bool nvml_get_memory(
    NVMLState& nvml,
    const char* uuid,
    size_t* free,
    size_t* total) {
  if (nvml.handle == nullptr) {
    return false;
  }
  nvmlDevice_t dev;
  // NVML calls return 0 (NVML_SUCCESS) on success; short-circuit so the
  // memory query only runs when the device handle was obtained.
  bool ok = nvml.nvmlDeviceGetHandleByUUID(uuid, &dev) == 0;
  nvmlMemory_t usage;
  ok = ok && nvml.nvmlDeviceGetMemoryInfo(dev, &usage) == 0;
  if (ok) {
    *free = usage.free;
    *total = usage.total;
  }
  return ok;
}

// Render a CUDA device UUID in NVML's canonical form:
// "GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" (lowercase hex, 4-2-2-2-6
// byte groups separated by dashes).
std::string format_uuid(const cudaUUID_t& uuid) {
  static constexpr int group_sizes[] = {4, 2, 2, 2, 6};
  std::string out = "GPU-";
  int byte_idx = 0;
  char hex[3];
  for (int g = 0; g < 5; ++g) {
    if (g > 0) {
      out += '-';
    }
    for (int k = 0; k < group_sizes[g]; ++k) {
      snprintf(
          hex, sizeof(hex), "%02x", (unsigned char)uuid.bytes[byte_idx++]);
      out += hex;
    }
  }
  return out;
}

// Helper function to get memory info (NVML or cudaMemGetInfo)
//
// NOTE(review): device_info() below does not call this helper — it inlines
// the same NVML-then-cudaMemGetInfo logic; consider unifying. Also assumes
// `nvml` came from a successful nvml_init(): nvml_get_memory() only checks
// that the library handle is set, not that nvmlInit_v2 succeeded.
void get_memory_info(
    int device_index,
    const std::string& uuid,
    NVMLState& nvml,
    size_t* free_mem,
    size_t* total_mem) {
  // Prefer NVML: it reports system-wide usage, not just this process.
  if (nvml_get_memory(nvml, uuid.c_str(), free_mem, total_mem)) {
    return;
  }
  // Fallback to cudaMemGetInfo
  // cudaMemGetInfo is tied to the current device, so temporarily switch
  // to `device_index` and restore the previous device afterwards.
  int prev_device;
  cudaGetDevice(&prev_device);
  cudaSetDevice(device_index);
  cudaMemGetInfo(free_mem, total_mem);
  cudaSetDevice(prev_device);
}

} // anonymous namespace

bool is_available() {
  // Unconditionally true here; presumably this translation unit is only
  // compiled into CUDA-enabled builds — confirm against the build config.
  return true;
}

// Get information about a CUDA device.
//
// Static properties (name, uuid, architecture, PCI bus id, compute
// capability) are collected once for all devices and cached; free/total
// memory is re-queried on every call (NVML when available, otherwise
// cudaMemGetInfo). Returns a reference to a thread_local map, valid until
// the same thread calls device_info() again. Out-of-range indices yield a
// reference to a stable empty map.
const std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int device_index) {
  // Static cache of device properties including UUID (needed for NVML
  // lookup). Built once; device topology does not change mid-process.
  static auto all_devices = []() {
    int count = 0;
    cudaGetDeviceCount(&count);

    // Collect info for all devices
    struct DeviceInfo {
      std::unordered_map<std::string, std::variant<std::string, size_t>> info;
      std::string uuid;
    };

    std::vector<DeviceInfo> devices;
    devices.reserve(count);

    for (int i = 0; i < count; ++i) {
      // Zero-initialize so a failed query yields empty/zero fields instead
      // of reads of indeterminate memory.
      cudaDeviceProp prop = {};
      cudaGetDeviceProperties(&prop, i);

      DeviceInfo dev;
      dev.info["device_name"] = std::string(prop.name);
      dev.uuid = format_uuid(prop.uuid);
      dev.info["uuid"] = dev.uuid;

      // Architecture string (e.g., "sm_89")
      char arch[16];
      snprintf(arch, sizeof(arch), "sm_%d%d", prop.major, prop.minor);
      dev.info["architecture"] = std::string(arch);

      // PCI bus ID (domain:bus:device.function)
      char pci_id[32];
      snprintf(
          pci_id,
          sizeof(pci_id),
          "%04x:%02x:%02x.0",
          prop.pciDomainID,
          prop.pciBusID,
          prop.pciDeviceID);
      dev.info["pci_bus_id"] = std::string(pci_id);

      // Compute capability as size_t (to match Metal's variant type)
      dev.info["compute_capability_major"] = static_cast<size_t>(prop.major);
      dev.info["compute_capability_minor"] = static_cast<size_t>(prop.minor);

      devices.push_back(std::move(dev));
    }
    return devices;
  }();

  // Initialize NVML once; it reports system-wide (not per-process) memory.
  static NVMLState nvml;
  static bool nvml_initialized = nvml_init(nvml);

  if (device_index < 0 ||
      device_index >= static_cast<int>(all_devices.size())) {
    // Invalid index: return a stable empty map rather than throwing.
    static auto empty =
        std::unordered_map<std::string, std::variant<std::string, size_t>>();
    return empty;
  }

  // Return a copy with fresh memory info
  // Using thread_local to avoid locks while keeping free_memory fresh
  thread_local auto device_info_copy =
      std::unordered_map<std::string, std::variant<std::string, size_t>>();

  device_info_copy = all_devices[device_index].info;

  // Fresh memory info: try NVML first (system-wide), fall back to
  // cudaMemGetInfo (process-level). Zero-initialize so the map never holds
  // indeterminate values even if both queries fail.
  size_t free_mem = 0;
  size_t total_mem = 0;

  if (nvml_initialized &&
      nvml_get_memory(
          nvml,
          all_devices[device_index].uuid.c_str(),
          &free_mem,
          &total_mem)) {
    // NVML succeeded - use system-wide memory
  } else {
    // Fallback to cudaMemGetInfo (process-scoped); it is bound to the
    // current device, so switch temporarily and restore afterwards.
    int prev_device;
    cudaGetDevice(&prev_device);
    cudaSetDevice(device_index);
    cudaMemGetInfo(&free_mem, &total_mem);
    cudaSetDevice(prev_device);
  }

  device_info_copy["free_memory"] = free_mem;
  device_info_copy["total_memory"] = total_mem;

  return device_info_copy;
}

} // namespace mlx::core::cu
9 changes: 9 additions & 0 deletions mlx/backend/cuda/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,20 @@

#pragma once

#include <string>
#include <unordered_map>
#include <variant>

#include "mlx/api.h"

namespace mlx::core::cu {

/* Check if the CUDA backend is available. */
MLX_API bool is_available();

/* Get information about a CUDA device.
 *
 * Returns a reference to a map of device properties (device_name, uuid,
 * architecture, pci_bus_id, compute capability, free/total memory). The
 * reference is only valid until the calling thread's next call; invalid
 * indices yield an empty map. */
MLX_API const
std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int device_index = 0);

} // namespace mlx::core::cu
1 change: 1 addition & 0 deletions mlx/backend/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Backend-agnostic sources for the generic GPU layer (mlx/backend/gpu).
target_sources(
mlx
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device_info.cpp
${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp)
25 changes: 25 additions & 0 deletions mlx/backend/gpu/available.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,35 @@

#pragma once

#include <string>
#include <unordered_map>
#include <variant>

#include "mlx/api.h"

namespace mlx::core::gpu {

/** Check whether a GPU backend is available. */
MLX_API bool is_available();

/**
 * Get the number of available GPU devices.
 */
MLX_API int device_count();

/**
 * Get information about a GPU device.
 *
 * Returns a map of device properties. Keys vary by backend:
 * - device_name (string): Device name
 * - architecture (string): Architecture identifier
 * - total_memory/memory_size (size_t): Total device memory
 * - free_memory (size_t): Available memory (CUDA only)
 * - uuid (string): Device UUID (CUDA only)
 * - pci_bus_id (string): PCI bus ID (CUDA only)
 * - compute_capability_major/minor (size_t): Compute capability (CUDA only)
 *
 * @param device_index Index of the device to query; behavior for
 *   out-of-range indices is backend-dependent (the CUDA backend returns an
 *   empty map — confirm for other backends).
 */
MLX_API const
std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int device_index = 0);

} // namespace mlx::core::gpu
Loading
Loading