Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/src/python/devices_and_streams.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@ Devices and Streams
set_default_stream
stream
synchronize
device_count
device_info
48 changes: 48 additions & 0 deletions mlx/backend/cpu/available.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,58 @@

#include "mlx/backend/cpu/available.h"

#ifdef __APPLE__
#include <sys/sysctl.h>
#endif

namespace mlx::core::cpu {

namespace {

// Get CPU architecture string
// Get the CPU architecture as a short canonical string.
//
// Detection is purely compile-time via predefined compiler macros. Both the
// GCC/Clang and the MSVC spellings are checked for each architecture
// (previously the arm64 branch was missing `_M_ARM64`, so MSVC/ARM64 builds
// reported "unknown").
std::string get_cpu_architecture() {
#if defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
  return "arm64";
#elif defined(__x86_64__) || defined(_M_X64)
  return "x86_64";
#elif defined(__i386__) || defined(__i386) || defined(_M_IX86)
  return "x86";
#elif defined(__arm__) || defined(_M_ARM)
  return "arm";
#else
  // Unrecognized target; callers treat this as an opaque label.
  return "unknown";
#endif
}

// Get CPU device name
// Get a human-readable CPU device name.
//
// On macOS, queries the marketing name via sysctl ("machdep.cpu.brand_string").
// On other platforms, or if the sysctl fails, falls back to the architecture
// string from get_cpu_architecture().
std::string get_cpu_name() {
#ifdef __APPLE__
  char model[256];
  size_t len = sizeof(model);
  // Pass the array (decays to char*) rather than `&model` (char(*)[256]);
  // use nullptr for the unused "new value" arguments.
  if (sysctlbyname("machdep.cpu.brand_string", model, &len, nullptr, 0) == 0) {
    return std::string(model);
  }
#endif
  return get_cpu_architecture();
}

} // anonymous namespace

bool is_available() {
  // The CPU backend is unconditionally available; this always returns true.
  return true;
}

// The host CPU is exposed as exactly one logical device, so the count is
// always 1 regardless of core/socket topology.
int device_count() {
  return 1;
}

// Get CPU device information.
//
// Returns a reference to a lazily-built, cached map with:
// - "device_name" (string): CPU model name (architecture string as fallback)
// - "architecture" (string): e.g. "arm64", "x86_64"
//
// The index parameter is ignored: there is only one CPU device
// (see device_count()). The returned reference stays valid for the
// lifetime of the process.
const std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int /* device_index */) {
  // `static const`: built once (thread-safe under C++11 magic statics) and
  // immutable afterwards, matching the const& return type.
  static const auto info =
      std::unordered_map<std::string, std::variant<std::string, size_t>>{
          {"device_name", get_cpu_name()},
          {"architecture", get_cpu_architecture()}};
  return info;
}

} // namespace mlx::core::cpu
18 changes: 18 additions & 0 deletions mlx/backend/cpu/available.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,26 @@

#pragma once

#include <string>
#include <unordered_map>
#include <variant>

namespace mlx::core::cpu {

/** Check whether the CPU backend is available (always true). */
bool is_available();

/**
 * Get the number of available CPU devices.
 *
 * For CPU, always returns 1.
 */
int device_count();

/**
 * Get CPU device information.
 *
 * Returns a map with basic CPU device properties:
 * - "device_name" (string): CPU model name, or the architecture string
 *   when no richer name is available
 * - "architecture" (string): e.g. "arm64", "x86_64"
 *
 * The index is ignored since there is a single CPU device.
 */
const std::unordered_map<std::string, std::variant<std::string, size_t>>& device_info(int device_index = 0);

} // namespace mlx::core::cpu
227 changes: 227 additions & 0 deletions mlx/backend/cuda/cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,237 @@

#include "mlx/backend/cuda/cuda.h"

#include <cuda_runtime.h>
#include <string>
#include <unordered_map>
#include <variant>
#include <vector>

#ifdef _WIN32
#include <windows.h>
#else
#include <dlfcn.h>
#endif

namespace mlx::core::cu {

namespace {

// NVML dynamic loading for accurate memory reporting
// (cudaMemGetInfo only sees current process)

// Minimal NVML surface, declared locally so the library can be loaded at
// runtime instead of being a link-time dependency.
typedef int nvmlReturn_t;
typedef struct nvmlDevice_st* nvmlDevice_t;
struct nvmlMemory_t {
  unsigned long long total;
  unsigned long long free;
  unsigned long long used;
};

// Dynamically-loaded NVML library handle plus the entry points we use.
// Invariant: `handle == nullptr` means NVML is unusable.
struct NVMLState {
  void* handle = nullptr;
  nvmlReturn_t (*nvmlInit_v2)() = nullptr;
  nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char*, nvmlDevice_t*) =
      nullptr;
  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t*) =
      nullptr;
};

// Unload the NVML library (if loaded) and reset `nvml` to the unusable state.
void nvml_unload(NVMLState& nvml) {
  if (nvml.handle) {
#ifdef _WIN32
    FreeLibrary((HMODULE)nvml.handle);
#else
    dlclose(nvml.handle);
#endif
  }
  nvml = NVMLState{};
}

// Load NVML and resolve the required symbols. Returns true on success.
//
// On ANY failure the library is unloaded and `nvml` is fully reset, so a
// partially-initialized state (library loaded but symbols missing, or
// nvmlInit_v2 failed) can never be observed by callers that only check
// `nvml.handle`.
bool nvml_init(NVMLState& nvml) {
#ifdef _WIN32
  nvml.handle = LoadLibraryA("nvml.dll");
  if (!nvml.handle) {
    // Fall back to the legacy NVSMI install location.
    nvml.handle =
        LoadLibraryA("C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvml.dll");
  }
  if (!nvml.handle)
    return false;
  nvml.nvmlInit_v2 = (decltype(nvml.nvmlInit_v2))GetProcAddress(
      (HMODULE)nvml.handle, "nvmlInit_v2");
  nvml.nvmlDeviceGetHandleByUUID =
      (decltype(nvml.nvmlDeviceGetHandleByUUID))GetProcAddress(
          (HMODULE)nvml.handle, "nvmlDeviceGetHandleByUUID");
  nvml.nvmlDeviceGetMemoryInfo =
      (decltype(nvml.nvmlDeviceGetMemoryInfo))GetProcAddress(
          (HMODULE)nvml.handle, "nvmlDeviceGetMemoryInfo");
#else
  nvml.handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
  if (!nvml.handle)
    return false;
  nvml.nvmlInit_v2 =
      (decltype(nvml.nvmlInit_v2))dlsym(nvml.handle, "nvmlInit_v2");
  nvml.nvmlDeviceGetHandleByUUID =
      (decltype(nvml.nvmlDeviceGetHandleByUUID))dlsym(
          nvml.handle, "nvmlDeviceGetHandleByUUID");
  nvml.nvmlDeviceGetMemoryInfo = (decltype(nvml.nvmlDeviceGetMemoryInfo))dlsym(
      nvml.handle, "nvmlDeviceGetMemoryInfo");
#endif

  if (!nvml.nvmlInit_v2 || !nvml.nvmlDeviceGetHandleByUUID ||
      !nvml.nvmlDeviceGetMemoryInfo || nvml.nvmlInit_v2() != 0) {
    // Previously a failure here left `handle` and stale function pointers
    // set; clean up so the state is all-or-nothing.
    nvml_unload(nvml);
    return false;
  }
  return true;
}

// Query free/total device memory (in bytes) via NVML for the GPU with the
// given `uuid` string. Returns false when NVML is not loaded or any NVML
// call fails; on success writes the counts into `free` and `total`.
bool nvml_get_memory(
    NVMLState& nvml,
    const char* uuid,
    size_t* free,
    size_t* total) {
  if (nvml.handle == nullptr) {
    return false;
  }
  nvmlDevice_t dev;
  // NVML calls return 0 (NVML_SUCCESS) on success; short-circuit so the
  // memory query only runs when the device handle was obtained.
  bool ok = nvml.nvmlDeviceGetHandleByUUID(uuid, &dev) == 0;
  nvmlMemory_t usage;
  ok = ok && nvml.nvmlDeviceGetMemoryInfo(dev, &usage) == 0;
  if (ok) {
    *free = usage.free;
    *total = usage.total;
  }
  return ok;
}

// Render a CUDA device UUID in NVML's canonical form:
// "GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" (lowercase hex, 4-2-2-2-6
// byte groups separated by dashes).
std::string format_uuid(const cudaUUID_t& uuid) {
  static constexpr int group_sizes[] = {4, 2, 2, 2, 6};
  std::string out = "GPU-";
  int byte_idx = 0;
  char hex[3];
  for (int g = 0; g < 5; ++g) {
    if (g > 0) {
      out += '-';
    }
    for (int k = 0; k < group_sizes[g]; ++k) {
      snprintf(
          hex, sizeof(hex), "%02x", (unsigned char)uuid.bytes[byte_idx++]);
      out += hex;
    }
  }
  return out;
}

// Helper function to get memory info (NVML or cudaMemGetInfo)
//
// NOTE(review): device_info() below does not call this helper — it inlines
// the same NVML-then-cudaMemGetInfo logic; consider unifying. Also assumes
// `nvml` came from a successful nvml_init(): nvml_get_memory() only checks
// that the library handle is set, not that nvmlInit_v2 succeeded.
void get_memory_info(
    int device_index,
    const std::string& uuid,
    NVMLState& nvml,
    size_t* free_mem,
    size_t* total_mem) {
  // Prefer NVML: it reports system-wide usage, not just this process.
  if (nvml_get_memory(nvml, uuid.c_str(), free_mem, total_mem)) {
    return;
  }
  // Fallback to cudaMemGetInfo
  // cudaMemGetInfo is tied to the current device, so temporarily switch
  // to `device_index` and restore the previous device afterwards.
  int prev_device;
  cudaGetDevice(&prev_device);
  cudaSetDevice(device_index);
  cudaMemGetInfo(free_mem, total_mem);
  cudaSetDevice(prev_device);
}

} // anonymous namespace

bool is_available() {
  // Unconditionally true here; presumably this translation unit is only
  // compiled into CUDA-enabled builds — confirm against the build config.
  return true;
}

// Get information about a CUDA device.
//
// Static properties (name, uuid, architecture, PCI bus id, compute
// capability) are collected once for all devices and cached; free/total
// memory is re-queried on every call (NVML when available, otherwise
// cudaMemGetInfo). Returns a reference to a thread_local map, valid until
// the same thread calls device_info() again. Out-of-range indices yield a
// reference to a stable empty map.
const std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int device_index) {
  // Static cache of device properties including UUID (needed for NVML
  // lookup). Built once; device topology does not change mid-process.
  static auto all_devices = []() {
    int count = 0;
    cudaGetDeviceCount(&count);

    // Collect info for all devices
    struct DeviceInfo {
      std::unordered_map<std::string, std::variant<std::string, size_t>> info;
      std::string uuid;
    };

    std::vector<DeviceInfo> devices;
    devices.reserve(count);

    for (int i = 0; i < count; ++i) {
      // Zero-initialize so a failed query yields empty/zero fields instead
      // of reads of indeterminate memory.
      cudaDeviceProp prop = {};
      cudaGetDeviceProperties(&prop, i);

      DeviceInfo dev;
      dev.info["device_name"] = std::string(prop.name);
      dev.uuid = format_uuid(prop.uuid);
      dev.info["uuid"] = dev.uuid;

      // Architecture string (e.g., "sm_89")
      char arch[16];
      snprintf(arch, sizeof(arch), "sm_%d%d", prop.major, prop.minor);
      dev.info["architecture"] = std::string(arch);

      // PCI bus ID (domain:bus:device.function)
      char pci_id[32];
      snprintf(
          pci_id,
          sizeof(pci_id),
          "%04x:%02x:%02x.0",
          prop.pciDomainID,
          prop.pciBusID,
          prop.pciDeviceID);
      dev.info["pci_bus_id"] = std::string(pci_id);

      // Compute capability as size_t (to match Metal's variant type)
      dev.info["compute_capability_major"] = static_cast<size_t>(prop.major);
      dev.info["compute_capability_minor"] = static_cast<size_t>(prop.minor);

      devices.push_back(std::move(dev));
    }
    return devices;
  }();

  // Initialize NVML once; it reports system-wide (not per-process) memory.
  static NVMLState nvml;
  static bool nvml_initialized = nvml_init(nvml);

  if (device_index < 0 ||
      device_index >= static_cast<int>(all_devices.size())) {
    // Invalid index: return a stable empty map rather than throwing.
    static auto empty =
        std::unordered_map<std::string, std::variant<std::string, size_t>>();
    return empty;
  }

  // Return a copy with fresh memory info
  // Using thread_local to avoid locks while keeping free_memory fresh
  thread_local auto device_info_copy =
      std::unordered_map<std::string, std::variant<std::string, size_t>>();

  device_info_copy = all_devices[device_index].info;

  // Fresh memory info: try NVML first (system-wide), fall back to
  // cudaMemGetInfo (process-level). Zero-initialize so the map never holds
  // indeterminate values even if both queries fail.
  size_t free_mem = 0;
  size_t total_mem = 0;

  if (nvml_initialized &&
      nvml_get_memory(
          nvml,
          all_devices[device_index].uuid.c_str(),
          &free_mem,
          &total_mem)) {
    // NVML succeeded - use system-wide memory
  } else {
    // Fallback to cudaMemGetInfo (process-scoped); it is bound to the
    // current device, so switch temporarily and restore afterwards.
    int prev_device;
    cudaGetDevice(&prev_device);
    cudaSetDevice(device_index);
    cudaMemGetInfo(&free_mem, &total_mem);
    cudaSetDevice(prev_device);
  }

  device_info_copy["free_memory"] = free_mem;
  device_info_copy["total_memory"] = total_mem;

  return device_info_copy;
}

} // namespace mlx::core::cu
9 changes: 9 additions & 0 deletions mlx/backend/cuda/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,20 @@

#pragma once

#include <string>
#include <unordered_map>
#include <variant>

#include "mlx/api.h"

namespace mlx::core::cu {

/* Check if the CUDA backend is available. */
MLX_API bool is_available();

/* Get information about a CUDA device.
 *
 * Returns a reference to a map of device properties (device_name, uuid,
 * architecture, pci_bus_id, compute capability, free/total memory). The
 * reference is only valid until the calling thread's next call; invalid
 * indices yield an empty map. */
MLX_API const
std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int device_index = 0);

} // namespace mlx::core::cu
1 change: 1 addition & 0 deletions mlx/backend/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Backend-agnostic sources for the generic GPU layer (mlx/backend/gpu).
target_sources(
mlx
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device_info.cpp
${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp)
25 changes: 25 additions & 0 deletions mlx/backend/gpu/available.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,35 @@

#pragma once

#include <string>
#include <unordered_map>
#include <variant>

#include "mlx/api.h"

namespace mlx::core::gpu {

/** Check whether a GPU backend is available. */
MLX_API bool is_available();

/**
 * Get the number of available GPU devices.
 */
MLX_API int device_count();

/**
 * Get information about a GPU device.
 *
 * Returns a map of device properties. Keys vary by backend:
 * - device_name (string): Device name
 * - architecture (string): Architecture identifier
 * - total_memory/memory_size (size_t): Total device memory
 * - free_memory (size_t): Available memory (CUDA only)
 * - uuid (string): Device UUID (CUDA only)
 * - pci_bus_id (string): PCI bus ID (CUDA only)
 * - compute_capability_major/minor (size_t): Compute capability (CUDA only)
 *
 * @param device_index Index of the device to query; behavior for
 *   out-of-range indices is backend-dependent (the CUDA backend returns an
 *   empty map — confirm for other backends).
 */
MLX_API const
std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int device_index = 0);

} // namespace mlx::core::gpu
Loading
Loading