From 7c178f6001cd4eb1301ffede9c8ef60819747b09 Mon Sep 17 00:00:00 2001 From: Karthik Vetrivel Date: Thu, 2 Oct 2025 15:53:57 +0000 Subject: [PATCH 1/7] Add fast-track to skip uninstall/install if NVIDIA driver modules present Signed-off-by: Karthik Vetrivel --- ubuntu24.04/nvidia-driver | 49 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/ubuntu24.04/nvidia-driver b/ubuntu24.04/nvidia-driver index 2449628f..6662c34d 100755 --- a/ubuntu24.04/nvidia-driver +++ b/ubuntu24.04/nvidia-driver @@ -244,6 +244,33 @@ _get_module_params() { fi } +# Read the currently loaded NVIDIA driver version from sysfs. +_read_loaded_version() { + cat /sys/module/nvidia/version 2>/dev/null || return 1 +} + +_is_rootfs_mounted() { + findmnt -rno TARGET "${RUN_DIR}/driver" >/dev/null 2>&1 +} + +# Ensure the driver rootfs is mounted exactly once. +_ensure_rootfs_mounted_idempotent() { + _is_rootfs_mounted || _mount_rootfs +} + +_ensure_persistence_running() { + local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid + if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then + return 0 + fi + + if command -v nvidia-persistenced >/dev/null 2>&1; then + nvidia-persistenced --persistence-mode || true + else + echo "nvidia-persistenced not found; continuing without persistence" + fi +} + # Load the kernel modules and start persistenced. _load_driver() { echo "Parsing kernel module parameters..." @@ -584,7 +611,27 @@ init() { trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM trap "_shutdown" EXIT - _unload_driver || exit 1 + # Fast path: if the NVIDIA kernel modules are already loaded and match the desired + # version, avoid any heavy reinstall/build. Ensure rootfs is mounted and + # persistenced is running, then hold the container. + if [ -f /sys/module/nvidia/refcnt ]; then + loaded_version=$(_read_loaded_version || true) + if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then + echo "Detected matching loaded driver (${loaded_version}); skipping reinstall" + _ensure_rootfs_mounted_idempotent + _ensure_persistence_running + _write_kernel_update_hook + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 + fi + fi + + + _unload_driver || exit 1 _unmount_rootfs _update_ca_certificates From a8dbb15864e5e969a44556c33da72d6504ebba07 Mon Sep 17 00:00:00 2001 From: Karthik Vetrivel Date: Thu, 16 Oct 2025 20:57:33 +0000 Subject: [PATCH 2/7] feat: implement userspace-only reinstall for non-clean driver restarts and fix scenario handling Signed-off-by: Karthik Vetrivel --- ubuntu22.04/nvidia-driver | 109 ++++++++++++++++++++++++++++++++------ ubuntu24.04/nvidia-driver | 42 +++++++++++++-- 2 files changed, 130 insertions(+), 21 deletions(-) diff --git a/ubuntu22.04/nvidia-driver b/ubuntu22.04/nvidia-driver index da3ec8e3..532f0b33 100755 --- a/ubuntu22.04/nvidia-driver +++ b/ubuntu22.04/nvidia-driver @@ -638,11 +638,104 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } +# Read the currently loaded NVIDIA driver version from sysfs. +_read_loaded_version() { + cat /sys/module/nvidia/version 2>/dev/null || return 1 +} + +_is_rootfs_mounted() { + findmnt -rno TARGET "${RUN_DIR}/driver" >/dev/null 2>&1 +} + +# Ensure the driver rootfs is mounted exactly once. +_ensure_rootfs_mounted_idempotent() { + _is_rootfs_mounted || _mount_rootfs +} + +_ensure_persistence_running() { + local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid + if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then + return 0 + fi + + if command -v nvidia-persistenced >/dev/null 2>&1; then + nvidia-persistenced --persistence-mode || true + else + echo "nvidia-persistenced not found; continuing without persistence" + fi +} + init() { if [ "${DRIVER_TYPE}" = "vgpu" ]; then _find_vgpu_driver_version || exit 1 fi + echo -e "\n========== NVIDIA Software Installer ==========\n" + echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" + + exec 3> ${PID_FILE} + if ! flock -n 3; then + echo "An instance of the NVIDIA driver is already running, aborting" + exit 1 + fi + echo $$ >&3 + + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + trap "_shutdown" EXIT + + # Fast path: if the NVIDIA kernel modules are already loaded and match the desired + # version, skip kernel module build/load but install userspace components. + # This handles non-clean restarts where modules are in use and can't be unloaded. + if [ -f /sys/module/nvidia/refcnt ]; then + loaded_version=$(_read_loaded_version || true) + if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then + echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install" + + # Skip kernel module unload since they're already loaded with correct version + # Unmount any existing rootfs + _unmount_rootfs + + # Update package cache for userspace install + _update_package_cache + _resolve_kernel_version || exit 1 + _install_prerequisites + + # Install userspace components only (libraries, binaries) + # The --no-kernel-module flag tells nvidia-installer to skip kernel module build/install + echo "Installing userspace components (libraries and binaries)..." + cd /drivers + # Extract the driver first + sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x + cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION} + ./nvidia-installer \ + --silent \ + --no-kernel-module \ + --no-nouveau-check \ + --no-nvidia-modprobe \ + --no-drm \ + --no-peermem + + # Mount the driver rootfs to make components available + _mount_rootfs + + # Ensure persistence daemon is running + _ensure_persistence_running + + # Write kernel update hook + _write_kernel_update_hook + + echo "Userspace-only install complete, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 + fi + fi + + _unload_driver || exit 1 + _unmount_rootfs + # Install the userspace components sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ @@ -668,22 +761,6 @@ init() { mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} && \ sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest - echo -e "\n========== NVIDIA Software Installer ==========\n" - echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" - - exec 3> ${PID_FILE} - if ! flock -n 3; then - echo "An instance of the NVIDIA driver is already running, aborting" - exit 1 - fi - echo $$ >&3 - - trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM - trap "_shutdown" EXIT - - _unload_driver || exit 1 - _unmount_rootfs - if _kernel_requires_package; then _update_ca_certificates _update_package_cache diff --git a/ubuntu24.04/nvidia-driver b/ubuntu24.04/nvidia-driver index 6662c34d..9bc501c2 100755 --- a/ubuntu24.04/nvidia-driver +++ b/ubuntu24.04/nvidia-driver @@ -612,16 +612,48 @@ init() { trap "_shutdown" EXIT # Fast path: if the NVIDIA kernel modules are already loaded and match the desired - # version, avoid any heavy reinstall/build. Ensure rootfs is mounted and - # persistenced is running, then hold the container. + # version, skip kernel module build/load but install userspace components. + # This handles non-clean restarts where modules are in use and can't be unloaded. if [ -f /sys/module/nvidia/refcnt ]; then loaded_version=$(_read_loaded_version || true) if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then - echo "Detected matching loaded driver (${loaded_version}); skipping reinstall" - _ensure_rootfs_mounted_idempotent + echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install" + + # Skip kernel module unload since they're already loaded with correct version + # Unmount any existing rootfs + _unmount_rootfs + + # Update package cache for userspace install + _update_ca_certificates + _update_package_cache + _resolve_kernel_version || exit 1 + _install_prerequisites + + # Install userspace components only (libraries, binaries) + # The --no-kernel-module flag tells nvidia-installer to skip kernel module build/install + echo "Installing userspace components (libraries and binaries)..." + cd /drivers + # Extract the driver first + sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x + cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION} + ./nvidia-installer \ + --silent \ + --no-kernel-module \ + --no-nouveau-check \ + --no-nvidia-modprobe \ + --no-drm \ + --no-peermem + + # Mount the driver rootfs to make components available + _mount_rootfs + + # Ensure persistence daemon is running _ensure_persistence_running + + # Write kernel update hook _write_kernel_update_hook - echo "Done, now waiting for signal" + + echo "Userspace-only install complete, now waiting for signal" sleep infinity & trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM trap - EXIT From ba7e6de0007afc5bf536d39aaa2a39f45667b3aa Mon Sep 17 00:00:00 2001 From: Karthik Vetrivel Date: Wed, 5 Nov 2025 21:06:31 +0000 Subject: [PATCH 3/7] Store driver config state and compare on restart to enable config change detection Signed-off-by: Karthik Vetrivel --- ubuntu22.04/nvidia-driver | 84 +++++++++++++++++++++++++-------------- ubuntu24.04/nvidia-driver | 81 +------------------------------------ 2 files changed, 56 insertions(+), 109 deletions(-) diff --git a/ubuntu22.04/nvidia-driver b/ubuntu22.04/nvidia-driver index 532f0b33..85360952 100755 --- a/ubuntu22.04/nvidia-driver +++ b/ubuntu22.04/nvidia-driver @@ -530,6 +530,7 @@ _mount_rootfs() { mount --make-private /sys mkdir -p ${RUN_DIR}/driver mount --rbind / ${RUN_DIR}/driver + echo "Driver container rootfs mounted at ${RUN_DIR}/driver" } # Unmount the driver rootfs from the run directory. @@ -638,20 +639,6 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } -# Read the currently loaded NVIDIA driver version from sysfs. -_read_loaded_version() { - cat /sys/module/nvidia/version 2>/dev/null || return 1 -} - -_is_rootfs_mounted() { - findmnt -rno TARGET "${RUN_DIR}/driver" >/dev/null 2>&1 -} - -# Ensure the driver rootfs is mounted exactly once. -_ensure_rootfs_mounted_idempotent() { - _is_rootfs_mounted || _mount_rootfs -} - _ensure_persistence_running() { local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then @@ -665,6 +652,31 @@ _ensure_persistence_running() { fi } +_build_driver_config() { + local config="DRIVER_VERSION=${DRIVER_VERSION} +KERNEL_VERSION=$(uname -r) +GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED} +USE_HOST_MOFED=${USE_HOST_MOFED} +KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE}" + + # Append config file contents directly + for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do + if [ -f "/drivers/$conf_file" ]; then + config="${config} +$(cat "/drivers/$conf_file")" + fi + done + + echo "$config" +} + +_store_driver_config() { + local config_file="/run/nvidia/driver-config.state" + echo "Storing driver configuration state..." + _build_driver_config > "$config_file" + echo "Driver configuration stored at $config_file" +} + init() { if [ "${DRIVER_TYPE}" = "vgpu" ]; then _find_vgpu_driver_version || exit 1 @@ -683,13 +695,15 @@ init() { trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM trap "_shutdown" EXIT - # Fast path: if the NVIDIA kernel modules are already loaded and match the desired - # version, skip kernel module build/load but install userspace components. + # Fast path: if the NVIDIA kernel modules are already loaded and driver config matches, + # skip kernel module build/load but install userspace components. # This handles non-clean restarts where modules are in use and can't be unloaded. - if [ -f /sys/module/nvidia/refcnt ]; then - loaded_version=$(_read_loaded_version || true) - if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then - echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install" + if [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ]; then + current_config=$(_build_driver_config) + stored_config=$(cat /run/nvidia/driver-config.state) + + if [ "${current_config}" = "${stored_config}" ]; then + echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install" # Skip kernel module unload since they're already loaded with correct version # Unmount any existing rootfs @@ -715,16 +729,27 @@ init() { --no-drm \ --no-peermem - # Mount the driver rootfs to make components available - _mount_rootfs - - # Ensure persistence daemon is running - _ensure_persistence_running - - # Write kernel update hook - _write_kernel_update_hook + # Determine the kernel module type + _resolve_kernel_type || exit 1 + + # Copy the kernel module sources for sidecar containers (gdrcopy, nvidia-fs, etc.) + mkdir -p /usr/src/nvidia-${DRIVER_VERSION} && \ + cp -r LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/ && \ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest - echo "Userspace-only install complete, now waiting for signal" + # Mount the driver rootfs to make components available + _mount_rootfs + + # Ensure persistence daemon is running + _ensure_persistence_running + + # Write kernel update hook + _write_kernel_update_hook + + # Store driver configuration + _store_driver_config + + echo "Userspace-only install complete, now waiting for signal" sleep infinity & trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM trap - EXIT @@ -776,6 +801,7 @@ init() { _load_driver || exit 1 _mount_rootfs _write_kernel_update_hook + _store_driver_config echo "Done, now waiting for signal" sleep infinity & diff --git a/ubuntu24.04/nvidia-driver b/ubuntu24.04/nvidia-driver index 9bc501c2..2449628f 100755 --- a/ubuntu24.04/nvidia-driver +++ b/ubuntu24.04/nvidia-driver @@ -244,33 +244,6 @@ _get_module_params() { fi } -# Read the currently loaded NVIDIA driver version from sysfs. -_read_loaded_version() { - cat /sys/module/nvidia/version 2>/dev/null || return 1 -} - -_is_rootfs_mounted() { - findmnt -rno TARGET "${RUN_DIR}/driver" >/dev/null 2>&1 -} - -# Ensure the driver rootfs is mounted exactly once. -_ensure_rootfs_mounted_idempotent() { - _is_rootfs_mounted || _mount_rootfs -} - -_ensure_persistence_running() { - local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid - if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then - return 0 - fi - - if command -v nvidia-persistenced >/dev/null 2>&1; then - nvidia-persistenced --persistence-mode || true - else - echo "nvidia-persistenced not found; continuing without persistence" - fi -} - # Load the kernel modules and start persistenced. _load_driver() { echo "Parsing kernel module parameters..." @@ -611,59 +584,7 @@ init() { trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM trap "_shutdown" EXIT - # Fast path: if the NVIDIA kernel modules are already loaded and match the desired - # version, skip kernel module build/load but install userspace components. - # This handles non-clean restarts where modules are in use and can't be unloaded. - if [ -f /sys/module/nvidia/refcnt ]; then - loaded_version=$(_read_loaded_version || true) - if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then - echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install" - - # Skip kernel module unload since they're already loaded with correct version - # Unmount any existing rootfs - _unmount_rootfs - - # Update package cache for userspace install - _update_ca_certificates - _update_package_cache - _resolve_kernel_version || exit 1 - _install_prerequisites - - # Install userspace components only (libraries, binaries) - # The --no-kernel-module flag tells nvidia-installer to skip kernel module build/install - echo "Installing userspace components (libraries and binaries)..." - cd /drivers - # Extract the driver first - sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x - cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION} - ./nvidia-installer \ - --silent \ - --no-kernel-module \ - --no-nouveau-check \ - --no-nvidia-modprobe \ - --no-drm \ - --no-peermem - - # Mount the driver rootfs to make components available - _mount_rootfs - - # Ensure persistence daemon is running - _ensure_persistence_running - - # Write kernel update hook - _write_kernel_update_hook - - echo "Userspace-only install complete, now waiting for signal" - sleep infinity & - trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM - trap - EXIT - while true; do wait $! || continue; done - exit 0 - fi - fi - - - _unload_driver || exit 1 + _unload_driver || exit 1 _unmount_rootfs _update_ca_certificates From 0a036ed9bddc8e322520bff0ff4e1ba348c042f1 Mon Sep 17 00:00:00 2001 From: Karthik Vetrivel Date: Fri, 14 Nov 2025 16:04:45 +0000 Subject: [PATCH 4/7] Add support for OpenShift 14.04 Signed-off-by: Karthik Vetrivel --- rhel9/nvidia-driver | 167 ++++++++++++++++++++++++++++++++++++-- rhel9/ocp_dtk_entrypoint | 51 ++++++++++++ ubuntu22.04/nvidia-driver | 26 +++++- 3 files changed, 233 insertions(+), 11 deletions(-) mode change 100755 => 100644 rhel9/nvidia-driver mode change 100755 => 100644 rhel9/ocp_dtk_entrypoint diff --git a/rhel9/nvidia-driver b/rhel9/nvidia-driver old mode 100755 new mode 100644 index 8ecd8b1e..f2ac57d0 --- a/rhel9/nvidia-driver +++ b/rhel9/nvidia-driver @@ -8,12 +8,13 @@ PID_FILE=${RUN_DIR}/${0##*/}.pid DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"} KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver NUM_VGPU_DEVICES=0 +GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" +USE_HOST_MOFED="${USE_HOST_MOFED:-false}" NVIDIA_MODULE_PARAMS=() NVIDIA_UVM_MODULE_PARAMS=() NVIDIA_MODESET_MODULE_PARAMS=() NVIDIA_PEERMEM_MODULE_PARAMS=() TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"} -USE_HOST_MOFED="${USE_HOST_MOFED:-false}" DNF_RELEASEVER=${DNF_RELEASEVER:-""} RHEL_VERSION=${RHEL_VERSION:-""} RHEL_MAJOR_VERSION=9 @@ -211,7 +212,10 @@ _create_driver_package() ( local nvidia_modeset_sign_args="" local nvidia_uvm_sign_args="" - trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT + # Skip cleanup trap for DTK builds - modules are copied after this function returns + if [ "${PACKAGE_TAG:-}" != "builtin" ]; then + trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT + fi echo "Compiling NVIDIA driver kernel modules..." cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} @@ -566,11 +570,7 @@ _install_driver() { install_args+=("--skip-module-load") fi - IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"} - # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path - # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point - # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit - #nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"} + IGNORE_CC_MISMATCH=1 nvidia-installer --silent --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"} } # Mount the driver rootfs into the run directory with the exception of sysfs. @@ -701,6 +701,114 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } +_ensure_persistence() { + local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid + if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then + return 0 + fi + + if command -v nvidia-persistenced >/dev/null 2>&1; then + nvidia-persistenced --persistence-mode || true + else + echo "nvidia-persistenced not found; continuing without persistence" + fi +} + +_build_driver_config() { + local nvidia_params="" nvidia_uvm_params="" nvidia_modeset_params="" nvidia_peermem_params="" + + # Read module parameters from conf files + if [ -f "/drivers/nvidia.conf" ]; then + nvidia_params=$(cat "/drivers/nvidia.conf" | tr '\n' ' ') + fi + if [ -f "/drivers/nvidia-uvm.conf" ]; then + nvidia_uvm_params=$(cat "/drivers/nvidia-uvm.conf" | tr '\n' ' ') + fi + if [ -f "/drivers/nvidia-modeset.conf" ]; then + nvidia_modeset_params=$(cat "/drivers/nvidia-modeset.conf" | tr '\n' ' ') + fi + if [ -f "/drivers/nvidia-peermem.conf" ]; then + nvidia_peermem_params=$(cat "/drivers/nvidia-peermem.conf" | tr '\n' ' ') + fi + + local config="DRIVER_VERSION=${DRIVER_VERSION} +KERNEL_VERSION=$(uname -r) +GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false} +USE_HOST_MOFED=${USE_HOST_MOFED:-false} +KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto} +NVIDIA_MODULE_PARAMS=${nvidia_params} +NVIDIA_UVM_MODULE_PARAMS=${nvidia_uvm_params} +NVIDIA_MODESET_MODULE_PARAMS=${nvidia_modeset_params} +NVIDIA_PEERMEM_MODULE_PARAMS=${nvidia_peermem_params}" + + # Append config file contents directly + for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do + if [ -f "/drivers/$conf_file" ]; then + config="${config} +$(cat "/drivers/$conf_file")" + fi + done + + echo "$config" +} + +_store_driver_config() { + local config_file="/run/nvidia/driver-config.state" + echo "Storing driver configuration state..." + _build_driver_config > "$config_file" + echo "Driver configuration stored at $config_file" +} + +_should_use_fast_path() { + [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ] || return 1 + local current_config=$(_build_driver_config) + local stored_config=$(cat /run/nvidia/driver-config.state 2>/dev/null || echo "") + [ "${current_config}" = "${stored_config}" ] +} + +_userspace_only_install() { + echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install" + + _unmount_rootfs + _update_package_cache + + # Skip kernel-related steps for userspace-only install + # KERNEL_VERSION is already set from uname -r, no need to resolve from yum + # Kernel headers/devel/modules are not needed for userspace-only install + + cd /drivers + [ ! -d "NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}" ] && sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x + cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION} + + + echo "DEBUG: Current directory: $(pwd)" + echo "DEBUG: Checking for ./nvidia-installer:" + ls -la ./nvidia-installer 2>&1 || echo " ./nvidia-installer NOT FOUND" + echo "DEBUG: Checking PATH for nvidia-installer:" + which nvidia-installer 2>&1 || echo " nvidia-installer NOT in PATH" + + + echo "Installing userspace components (libraries and binaries)..." + local install_args="--silent --no-kernel-module --no-nouveau-check --no-nvidia-modprobe --no-drm --no-peermem --ui=none" + [ "${ACCEPT_LICENSE}" = "yes" ] && install_args="$install_args --accept-license" + IGNORE_CC_MISMATCH=1 ./nvidia-installer $install_args + + # Copy kernel module sources if not already present (needed for other containers) + if [ ! -d "/usr/src/nvidia-${DRIVER_VERSION}" ]; then + _resolve_kernel_type || exit 1 + mkdir -p /usr/src/nvidia-${DRIVER_VERSION} + cp -r LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest + fi + + _mount_rootfs + _ensure_persistence + _write_kernel_update_hook + _store_driver_config + + echo "Userspace-only install complete" +} + _prepare() { if [ "${DRIVER_TYPE}" = "vgpu" ]; then _find_vgpu_driver_version || exit 1 @@ -758,6 +866,7 @@ _load() { _load_driver _mount_rootfs _write_kernel_update_hook + _store_driver_config echo "Done, now waiting for signal" sleep infinity & @@ -768,7 +877,49 @@ _load() { } init() { - _prepare_exclusive + if [ "${DRIVER_TYPE}" = "vgpu" ]; then + _find_vgpu_driver_version || exit 1 + fi + + echo -e "\n========== NVIDIA Software Installer ==========\n" + echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" + + exec 3> ${PID_FILE} + if ! flock -n 3; then + echo "An instance of the NVIDIA driver is already running, aborting" + exit 1 + fi + echo $$ >&3 + + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + trap "_shutdown" EXIT + + if _should_use_fast_path; then + _userspace_only_install + + echo "Userspace-only install complete, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 + fi + + _unload_driver || exit 1 + _unmount_rootfs + + # Install the userspace components and copy the kernel module sources. + sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ + cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ + sh /tmp/install.sh nvinstall + + # Determine the kernel module type + _resolve_kernel_type || exit 1 + + # Copy the kernel module sources + mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \ + mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest _build diff --git a/rhel9/ocp_dtk_entrypoint b/rhel9/ocp_dtk_entrypoint old mode 100755 new mode 100644 index 0bd1496d..8513a6bc --- a/rhel9/ocp_dtk_entrypoint +++ b/rhel9/ocp_dtk_entrypoint @@ -10,6 +10,50 @@ echo "Running $*" SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) source $SCRIPT_DIR/common.sh +_build_driver_config() { + local nvidia_params="" nvidia_uvm_params="" nvidia_modeset_params="" nvidia_peermem_params="" + + # Read module parameters from conf files + if [ -f "/drivers/nvidia.conf" ]; then + nvidia_params=$(cat "/drivers/nvidia.conf" | tr '\n' ' ') + fi + if [ -f "/drivers/nvidia-uvm.conf" ]; then + nvidia_uvm_params=$(cat "/drivers/nvidia-uvm.conf" | tr '\n' ' ') + fi + if [ -f "/drivers/nvidia-modeset.conf" ]; then + nvidia_modeset_params=$(cat "/drivers/nvidia-modeset.conf" | tr '\n' ' ') + fi + if [ -f "/drivers/nvidia-peermem.conf" ]; then + nvidia_peermem_params=$(cat "/drivers/nvidia-peermem.conf" | tr '\n' ' ') + fi + + local config="DRIVER_VERSION=${DRIVER_VERSION} +KERNEL_VERSION=$(uname -r) +GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false} +USE_HOST_MOFED=${USE_HOST_MOFED:-false} +KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto} +NVIDIA_MODULE_PARAMS=${nvidia_params} +NVIDIA_UVM_MODULE_PARAMS=${nvidia_uvm_params} +NVIDIA_MODESET_MODULE_PARAMS=${nvidia_modeset_params} +NVIDIA_PEERMEM_MODULE_PARAMS=${nvidia_peermem_params}" + + for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do + if [ -f "/drivers/$conf_file" ]; then + config="${config} +$(cat "/drivers/$conf_file")" + fi + done + + echo "$config" +} + +_should_use_fast_path() { + [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ] || return 1 + local current_config=$(_build_driver_config) + local stored_config=$(cat /run/nvidia/driver-config.state 2>/dev/null || echo "") + [ "${current_config}" = "${stored_config}" ] +} + nv-ctr-run-with-dtk() { set -x @@ -18,6 +62,13 @@ nv-ctr-run-with-dtk() { exec bash -x nvidia-driver init fi + if _should_use_fast_path; then + echo "Fast path detected: skipping DTK build and module copy, proceeding with userspace-only install" + exec bash -x nvidia-driver init + fi + + echo "Fast path not detected: building driver and modules" + if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then cp -r \ /tmp/install.sh \ diff --git a/ubuntu22.04/nvidia-driver b/ubuntu22.04/nvidia-driver index 85360952..57069a54 100755 --- a/ubuntu22.04/nvidia-driver +++ b/ubuntu22.04/nvidia-driver @@ -639,7 +639,7 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } -_ensure_persistence_running() { +_ensure_persistenced() { local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then return 0 @@ -653,11 +653,31 @@ _ensure_persistence_running() { } _build_driver_config() { + local nvidia_params="" nvidia_uvm_params="" nvidia_modeset_params="" nvidia_peermem_params="" + + # Read module parameters from conf files + if [ -f "/drivers/nvidia.conf" ]; then + nvidia_params=$(cat "/drivers/nvidia.conf" | tr '\n' ' ') + fi + if [ -f "/drivers/nvidia-uvm.conf" ]; then + nvidia_uvm_params=$(cat "/drivers/nvidia-uvm.conf" | tr '\n' ' ') + fi + if [ -f "/drivers/nvidia-modeset.conf" ]; then + nvidia_modeset_params=$(cat "/drivers/nvidia-modeset.conf" | tr '\n' ' ') + fi + if [ -f "/drivers/nvidia-peermem.conf" ]; then + nvidia_peermem_params=$(cat "/drivers/nvidia-peermem.conf" | tr '\n' ' ') + fi + local config="DRIVER_VERSION=${DRIVER_VERSION} KERNEL_VERSION=$(uname -r) GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED} USE_HOST_MOFED=${USE_HOST_MOFED} -KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE}" +KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE} +NVIDIA_MODULE_PARAMS=${nvidia_params} +NVIDIA_UVM_MODULE_PARAMS=${nvidia_uvm_params} +NVIDIA_MODESET_MODULE_PARAMS=${nvidia_modeset_params} +NVIDIA_PEERMEM_MODULE_PARAMS=${nvidia_peermem_params}" # Append config file contents directly for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do @@ -741,7 +761,7 @@ init() { _mount_rootfs # Ensure persistence daemon is running - _ensure_persistence_running + _ensure_persistenced # Write kernel update hook _write_kernel_update_hook From d4a6dfff8c109907cffc6e94ed5730a0cf0858e4 Mon Sep 17 00:00:00 2001 From: Karthik Vetrivel Date: Thu, 20 Nov 2025 18:25:06 +0000 Subject: [PATCH 5/7] Refactor fast-path logic in OpenShift Signed-off-by: Karthik Vetrivel --- rhel9/common.sh | 48 +++++++++++++++ rhel9/nvidia-driver | 123 +++++++++++++------------------------- rhel9/ocp_dtk_entrypoint | 57 ++++-------------- ubuntu22.04/nvidia-driver | 1 + 4 files changed, 103 insertions(+), 126 deletions(-) diff --git a/rhel9/common.sh b/rhel9/common.sh index a41a14a1..46f61e76 100755 --- a/rhel9/common.sh +++ b/rhel9/common.sh @@ -45,3 +45,51 @@ _gdrcopy_enabled() { fi return 1 } + +# Build driver configuration for state comparison +_build_driver_config() { + local nvidia_params="" nvidia_uvm_params="" nvidia_modeset_params="" nvidia_peermem_params="" + + # Read module parameters from conf files + if [ -f "/drivers/nvidia.conf" ]; then + nvidia_params=$(cat "/drivers/nvidia.conf" | tr '\n' ' ') + fi + if [ -f "/drivers/nvidia-uvm.conf" ]; then + nvidia_uvm_params=$(cat "/drivers/nvidia-uvm.conf" | tr '\n' ' ') + fi + if [ -f "/drivers/nvidia-modeset.conf" ]; then + nvidia_modeset_params=$(cat "/drivers/nvidia-modeset.conf" | tr '\n' ' ') + fi + if [ -f "/drivers/nvidia-peermem.conf" ]; then + nvidia_peermem_params=$(cat "/drivers/nvidia-peermem.conf" | tr '\n' ' ') + fi + + local config="DRIVER_VERSION=${DRIVER_VERSION} +DRIVER_TYPE=${DRIVER_TYPE:-passthrough} +KERNEL_VERSION=$(uname -r) +GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false} +USE_HOST_MOFED=${USE_HOST_MOFED:-false} +KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto} +NVIDIA_MODULE_PARAMS=${nvidia_params} +NVIDIA_UVM_MODULE_PARAMS=${nvidia_uvm_params} +NVIDIA_MODESET_MODULE_PARAMS=${nvidia_modeset_params} +NVIDIA_PEERMEM_MODULE_PARAMS=${nvidia_peermem_params}" + + # Append config file contents directly + for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do + if [ -f "/drivers/$conf_file" ]; then + config="${config} +$(cat "/drivers/$conf_file")" + fi + done + + echo "$config" +} + +# Check if fast path should be used (driver already loaded with matching config) +_should_use_fast_path() { + [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ] || return 1 + local current_config=$(_build_driver_config) + local stored_config=$(cat /run/nvidia/driver-config.state 2>/dev/null || echo "") + [ "${current_config}" = "${stored_config}" ] +} diff --git a/rhel9/nvidia-driver b/rhel9/nvidia-driver index f2ac57d0..e31c33ce 100644 --- a/rhel9/nvidia-driver +++ b/rhel9/nvidia-driver @@ -402,44 +402,7 @@ _load_driver() { set +o xtrace -o nounset fi - echo "Starting NVIDIA persistence daemon..." - nvidia-persistenced --persistence-mode - - if [ "${DRIVER_TYPE}" = "vgpu" ]; then - echo "Copying gridd.conf..." - cp /drivers/gridd.conf /etc/nvidia/gridd.conf - if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then - echo "Copying ClientConfigToken..." - mkdir -p /etc/nvidia/ClientConfigToken/ - cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ - fi - - echo "Starting nvidia-gridd.." - LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd - - # Start virtual topology daemon - _start_vgpu_topology_daemon - fi - - if _assert_nvlink5_system; then - _ensure_nvlink5_prerequisites || return 1 - echo "Starting NVIDIA fabric manager daemon for NVLink5+..." - - fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg - fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid - nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf - nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid - /usr/bin/nvidia-fabricmanager-start.sh --mode start \ - --fm-config-file $fm_config_file \ - --fm-pid-file $fm_pid_file \ - --nvlsm-config-file $nvlsm_config_file \ - --nvlsm-pid-file $nvlsm_pid_file - - # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches - elif _assert_nvswitch_system; then - echo "Starting NVIDIA fabric manager daemon..." - nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg - fi + _start_daemons } # Stop persistenced and unload the kernel modules if they are currently loaded. @@ -714,42 +677,45 @@ _ensure_persistence() { fi } -_build_driver_config() { - local nvidia_params="" nvidia_uvm_params="" nvidia_modeset_params="" nvidia_peermem_params="" - - # Read module parameters from conf files - if [ -f "/drivers/nvidia.conf" ]; then - nvidia_params=$(cat "/drivers/nvidia.conf" | tr '\n' ' ') - fi - if [ -f "/drivers/nvidia-uvm.conf" ]; then - nvidia_uvm_params=$(cat "/drivers/nvidia-uvm.conf" | tr '\n' ' ') - fi - if [ -f "/drivers/nvidia-modeset.conf" ]; then - nvidia_modeset_params=$(cat "/drivers/nvidia-modeset.conf" | tr '\n' ' ') - fi - if [ -f "/drivers/nvidia-peermem.conf" ]; then - nvidia_peermem_params=$(cat "/drivers/nvidia-peermem.conf" | tr '\n' ' ') - fi - - local config="DRIVER_VERSION=${DRIVER_VERSION} -KERNEL_VERSION=$(uname -r) -GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false} -USE_HOST_MOFED=${USE_HOST_MOFED:-false} -KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto} -NVIDIA_MODULE_PARAMS=${nvidia_params} -NVIDIA_UVM_MODULE_PARAMS=${nvidia_uvm_params} -NVIDIA_MODESET_MODULE_PARAMS=${nvidia_modeset_params} -NVIDIA_PEERMEM_MODULE_PARAMS=${nvidia_peermem_params}" - - # Append config file contents directly - for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do - if [ -f "/drivers/$conf_file" ]; then - config="${config} -$(cat "/drivers/$conf_file")" - fi - done - - echo "$config" +_start_daemons() { + echo "Starting NVIDIA persistence daemon..." + nvidia-persistenced --persistence-mode + + if [ "${DRIVER_TYPE}" = "vgpu" ]; then + echo "Copying gridd.conf..." + cp /drivers/gridd.conf /etc/nvidia/gridd.conf + if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then + echo "Copying ClientConfigToken..." + mkdir -p /etc/nvidia/ClientConfigToken/ + cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ + fi + + echo "Starting nvidia-gridd.." + LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd + + # Start virtual topology daemon + _start_vgpu_topology_daemon + fi + + if _assert_nvlink5_system; then + _ensure_nvlink5_prerequisites || return 1 + echo "Starting NVIDIA fabric manager daemon for NVLink5+..." + + fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg + fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid + nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf + nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid + /usr/bin/nvidia-fabricmanager-start.sh --mode start \ + --fm-config-file $fm_config_file \ + --fm-pid-file $fm_pid_file \ + --nvlsm-config-file $nvlsm_config_file \ + --nvlsm-pid-file $nvlsm_pid_file + + # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches + elif _assert_nvswitch_system; then + echo "Starting NVIDIA fabric manager daemon..." + nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg + fi } _store_driver_config() { @@ -759,13 +725,6 @@ _store_driver_config() { echo "Driver configuration stored at $config_file" } -_should_use_fast_path() { - [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ] || return 1 - local current_config=$(_build_driver_config) - local stored_config=$(cat /run/nvidia/driver-config.state 2>/dev/null || echo "") - [ "${current_config}" = "${stored_config}" ] -} - _userspace_only_install() { echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install" @@ -802,7 +761,7 @@ _userspace_only_install() { fi _mount_rootfs - _ensure_persistence + _start_daemons _write_kernel_update_hook _store_driver_config diff --git a/rhel9/ocp_dtk_entrypoint b/rhel9/ocp_dtk_entrypoint index 8513a6bc..23eab2df 100644 --- a/rhel9/ocp_dtk_entrypoint +++ b/rhel9/ocp_dtk_entrypoint @@ -10,50 +10,6 @@ echo "Running $*" SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) source $SCRIPT_DIR/common.sh -_build_driver_config() { - local nvidia_params="" nvidia_uvm_params="" nvidia_modeset_params="" nvidia_peermem_params="" - - # Read module parameters from conf files - if [ -f "/drivers/nvidia.conf" ]; then - nvidia_params=$(cat "/drivers/nvidia.conf" | tr '\n' ' ') - fi - if [ -f "/drivers/nvidia-uvm.conf" ]; then - nvidia_uvm_params=$(cat "/drivers/nvidia-uvm.conf" | tr '\n' ' ') - fi - if [ -f "/drivers/nvidia-modeset.conf" ]; then - nvidia_modeset_params=$(cat "/drivers/nvidia-modeset.conf" | tr '\n' ' ') - fi - if [ -f "/drivers/nvidia-peermem.conf" ]; then - nvidia_peermem_params=$(cat "/drivers/nvidia-peermem.conf" | tr '\n' ' ') - fi - - local config="DRIVER_VERSION=${DRIVER_VERSION} -KERNEL_VERSION=$(uname -r) -GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false} -USE_HOST_MOFED=${USE_HOST_MOFED:-false} -KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto} -NVIDIA_MODULE_PARAMS=${nvidia_params} -NVIDIA_UVM_MODULE_PARAMS=${nvidia_uvm_params} -NVIDIA_MODESET_MODULE_PARAMS=${nvidia_modeset_params} -NVIDIA_PEERMEM_MODULE_PARAMS=${nvidia_peermem_params}" - - for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do - if [ -f "/drivers/$conf_file" ]; then - config="${config} -$(cat "/drivers/$conf_file")" - fi - done - - echo "$config" -} - -_should_use_fast_path() { - [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ] || return 1 - local current_config=$(_build_driver_config) - local stored_config=$(cat /run/nvidia/driver-config.state 2>/dev/null || echo "") - [ "${current_config}" = "${stored_config}" ] -} - nv-ctr-run-with-dtk() { set -x @@ -131,6 +87,19 @@ dtk-build-driver() { sleep inf fi + # Check if fast path is being used - if so, skip building and signal completion + if _should_use_fast_path; then + echo "Fast path detected in DTK container: driver already loaded with matching config, skipping build" + echo "Signaling driver_built and sleeping forever..." + touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started" + touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" + while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do + sleep 30 + done + echo "WARNING: driver_built flag disappeared" + exit 0 + fi + if ! [[ -f "/lib/modules/$(uname -r)/vmlinuz" ]]; then echo "WARNING: broken Driver Toolkit image detected:" echo "- Node kernel: $(uname -r)" diff --git a/ubuntu22.04/nvidia-driver b/ubuntu22.04/nvidia-driver index 57069a54..32b26999 100755 --- a/ubuntu22.04/nvidia-driver +++ b/ubuntu22.04/nvidia-driver @@ -670,6 +670,7 @@ _build_driver_config() { fi local config="DRIVER_VERSION=${DRIVER_VERSION} +DRIVER_TYPE=${DRIVER_TYPE:-passthrough} KERNEL_VERSION=$(uname -r) GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED} USE_HOST_MOFED=${USE_HOST_MOFED} From b660caa7dccb624ca9ab06e0b0759f9dcb907fa4 Mon Sep 17 00:00:00 2001 From: Karthik Vetrivel Date: Mon, 8 Dec 2025 19:45:57 +0000 Subject: [PATCH 6/7] Extract common code into reusable helper functions Signed-off-by: Karthik Vetrivel --- rhel9/common.sh | 44 +++----- rhel9/nvidia-driver | 55 ++++----- ubuntu22.04/nvidia-driver | 228 +++++++++++++++----------------------- 3 files changed, 123 insertions(+), 204 deletions(-) diff --git a/rhel9/common.sh b/rhel9/common.sh index 46f61e76..22d81857 100755 --- a/rhel9/common.sh +++ b/rhel9/common.sh @@ -46,44 +46,26 @@ _gdrcopy_enabled() { return 1 } +# Read a config file and convert newlines to spaces +_read_conf_file() { + local file="$1" + [ -f "$file" ] && tr '\n' ' ' < "$file" +} + # Build driver configuration for state comparison _build_driver_config() { - local nvidia_params="" nvidia_uvm_params="" nvidia_modeset_params="" nvidia_peermem_params="" - - # Read module parameters from conf files - if [ -f "/drivers/nvidia.conf" ]; then - nvidia_params=$(cat "/drivers/nvidia.conf" | tr '\n' ' ') - fi - if [ -f "/drivers/nvidia-uvm.conf" ]; then - nvidia_uvm_params=$(cat "/drivers/nvidia-uvm.conf" | tr '\n' ' ') - fi - if [ -f "/drivers/nvidia-modeset.conf" ]; then - nvidia_modeset_params=$(cat "/drivers/nvidia-modeset.conf" | tr '\n' ' ') - fi - if [ -f "/drivers/nvidia-peermem.conf" ]; then - nvidia_peermem_params=$(cat "/drivers/nvidia-peermem.conf" | tr '\n' ' ') - fi - - local config="DRIVER_VERSION=${DRIVER_VERSION} + cat < "$config_file" - echo "Driver configuration stored at $config_file" + local config_file="/run/nvidia/driver-config.state" + echo "Storing driver configuration state..." + _build_driver_config > "$config_file" + echo "Driver configuration stored at $config_file" +} + +_wait_for_signal() { + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 } _userspace_only_install() { echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install" - _unmount_rootfs _update_package_cache - - # Skip kernel-related steps for userspace-only install - # KERNEL_VERSION is already set from uname -r, no need to resolve from yum - # Kernel headers/devel/modules are not needed for userspace-only install - + cd /drivers [ ! -d "NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}" ] && sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION} - - - echo "DEBUG: Current directory: $(pwd)" - echo "DEBUG: Checking for ./nvidia-installer:" - ls -la ./nvidia-installer 2>&1 || echo " ./nvidia-installer NOT FOUND" - echo "DEBUG: Checking PATH for nvidia-installer:" - which nvidia-installer 2>&1 || echo " nvidia-installer NOT in PATH" - echo "Installing userspace components (libraries and binaries)..." local install_args="--silent --no-kernel-module --no-nouveau-check --no-nvidia-modprobe --no-drm --no-peermem --ui=none" [ "${ACCEPT_LICENSE}" = "yes" ] && install_args="$install_args --accept-license" IGNORE_CC_MISMATCH=1 ./nvidia-installer $install_args - - # Copy kernel module sources if not already present (needed for other containers) + + # Copy kernel module sources if not already present (needed for sidecar containers) if [ ! -d "/usr/src/nvidia-${DRIVER_VERSION}" ]; then _resolve_kernel_type || exit 1 mkdir -p /usr/src/nvidia-${DRIVER_VERSION} cp -r LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/ sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest fi - + _mount_rootfs _start_daemons _write_kernel_update_hook _store_driver_config - echo "Userspace-only install complete" } @@ -826,13 +821,7 @@ _load() { _mount_rootfs _write_kernel_update_hook _store_driver_config - - echo "Done, now waiting for signal" - sleep infinity & - trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM - trap - EXIT - while true; do wait $! || continue; done - exit 0 + _wait_for_signal } init() { @@ -855,13 +844,7 @@ init() { if _should_use_fast_path; then _userspace_only_install - - echo "Userspace-only install complete, now waiting for signal" - sleep infinity & - trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM - trap - EXIT - while true; do wait $! || continue; done - exit 0 + _wait_for_signal fi _unload_driver || exit 1 diff --git a/ubuntu22.04/nvidia-driver b/ubuntu22.04/nvidia-driver index 32b26999..6e42cada 100755 --- a/ubuntu22.04/nvidia-driver +++ b/ubuntu22.04/nvidia-driver @@ -640,62 +640,79 @@ _start_vgpu_topology_daemon() { } _ensure_persistenced() { - local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid - if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then - return 0 - fi + local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid + if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then + return 0 + fi + + if command -v nvidia-persistenced >/dev/null 2>&1; then + nvidia-persistenced --persistence-mode || true + else + echo "nvidia-persistenced not found; continuing without persistence" + fi +} - if command -v nvidia-persistenced >/dev/null 2>&1; then - nvidia-persistenced --persistence-mode || true - else - echo "nvidia-persistenced not found; continuing without persistence" - fi +_read_conf_file() { + local file="$1" + [ -f "$file" ] && tr '\n' ' ' < "$file" } _build_driver_config() { - local nvidia_params="" nvidia_uvm_params="" nvidia_modeset_params="" nvidia_peermem_params="" - - # Read module parameters from conf files - if [ -f "/drivers/nvidia.conf" ]; then - nvidia_params=$(cat "/drivers/nvidia.conf" | tr '\n' ' ') - fi - if [ -f "/drivers/nvidia-uvm.conf" ]; then - nvidia_uvm_params=$(cat "/drivers/nvidia-uvm.conf" | tr '\n' ' ') - fi - if [ -f "/drivers/nvidia-modeset.conf" ]; then - nvidia_modeset_params=$(cat "/drivers/nvidia-modeset.conf" | tr '\n' ' ') - fi - if [ -f "/drivers/nvidia-peermem.conf" ]; then - nvidia_peermem_params=$(cat "/drivers/nvidia-peermem.conf" | tr '\n' ' ') - fi - - local config="DRIVER_VERSION=${DRIVER_VERSION} + cat < "$config_file" + echo "Driver configuration stored at $config_file" +} - echo "$config" +_install_userspace_components() { + echo "Installing userspace components (libraries and binaries)..." + cd /drivers + sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x + cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION} + ./nvidia-installer \ + --silent \ + --no-kernel-module \ + --no-nouveau-check \ + --no-nvidia-modprobe \ + --no-rpms \ + --no-backup \ + --no-check-for-alternate-installs \ + --no-libglx-indirect \ + --no-install-libglvnd \ + --x-prefix=/tmp/null \ + --x-module-path=/tmp/null \ + --x-library-path=/tmp/null \ + --x-sysconfig-path=/tmp/null } -_store_driver_config() { - local config_file="/run/nvidia/driver-config.state" - echo "Storing driver configuration state..." - _build_driver_config > "$config_file" - echo "Driver configuration stored at $config_file" +_copy_kernel_module_sources() { + mkdir -p /usr/src/nvidia-${DRIVER_VERSION} + cp -r LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest +} + +_wait_for_signal() { + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 } init() { @@ -716,96 +733,41 @@ init() { trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM trap "_shutdown" EXIT - # Fast path: if the NVIDIA kernel modules are already loaded and driver config matches, - # skip kernel module build/load but install userspace components. - # This handles non-clean restarts where modules are in use and can't be unloaded. - if [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ]; then - current_config=$(_build_driver_config) - stored_config=$(cat /run/nvidia/driver-config.state) - - if [ "${current_config}" = "${stored_config}" ]; then - echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install" - - # Skip kernel module unload since they're already loaded with correct version - # Unmount any existing rootfs - _unmount_rootfs - - # Update package cache for userspace install - _update_package_cache - _resolve_kernel_version || exit 1 - _install_prerequisites - - # Install userspace components only (libraries, binaries) - # The --no-kernel-module flag tells nvidia-installer to skip kernel module build/install - echo "Installing userspace components (libraries and binaries)..." - cd /drivers - # Extract the driver first - sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x - cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION} - ./nvidia-installer \ - --silent \ - --no-kernel-module \ - --no-nouveau-check \ - --no-nvidia-modprobe \ - --no-drm \ - --no-peermem - - # Determine the kernel module type - _resolve_kernel_type || exit 1 - - # Copy the kernel module sources for sidecar containers (gdrcopy, nvidia-fs, etc.) - mkdir -p /usr/src/nvidia-${DRIVER_VERSION} && \ - cp -r LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/ && \ - sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest - - # Mount the driver rootfs to make components available - _mount_rootfs - - # Ensure persistence daemon is running - _ensure_persistenced - - # Write kernel update hook - _write_kernel_update_hook - - # Store driver configuration - _store_driver_config - - echo "Userspace-only install complete, now waiting for signal" - sleep infinity & - trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM - trap - EXIT - while true; do wait $! || continue; done - exit 0 - fi - fi + # Fast path: if NVIDIA kernel modules are already loaded and config matches, + # skip kernel module build/load and only reinstall userspace components. + # This handles non-clean restarts where modules are in use and can't be unloaded. + if [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ]; then + current_config=$(_build_driver_config) + stored_config=$(cat /run/nvidia/driver-config.state) + + if [ "${current_config}" = "${stored_config}" ]; then + echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install" + _unmount_rootfs + _update_package_cache + _resolve_kernel_version || exit 1 + _install_prerequisites + _install_userspace_components + _resolve_kernel_type || exit 1 + _copy_kernel_module_sources + _mount_rootfs + _ensure_persistenced + _write_kernel_update_hook + _store_driver_config + echo "Userspace-only install complete" + _wait_for_signal + fi + fi + # Full install path: unload existing driver and perform complete installation _unload_driver || exit 1 _unmount_rootfs - - # Install the userspace components - sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ - cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ - ./nvidia-installer --silent \ - --no-kernel-module \ - --no-nouveau-check \ - --no-nvidia-modprobe \ - --no-rpms \ - --no-backup \ - --no-check-for-alternate-installs \ - --no-libglx-indirect \ - --no-install-libglvnd \ - --x-prefix=/tmp/null \ - --x-module-path=/tmp/null \ - --x-library-path=/tmp/null \ - --x-sysconfig-path=/tmp/null - - # Determine the kernel module type + _install_userspace_components _resolve_kernel_type || exit 1 - # Copy the kernel module sources - mkdir -p /usr/src/nvidia-${DRIVER_VERSION} && \ - mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} && \ - sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest + # Move (not copy) kernel module sources since this is the full install path + mkdir -p /usr/src/nvidia-${DRIVER_VERSION} + mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest if _kernel_requires_package; then _update_ca_certificates @@ -814,8 +776,6 @@ init() { _resolve_kernel_version || exit 1 _install_prerequisites _create_driver_package - #_remove_prerequisites - #_cleanup_package_cache fi _install_driver @@ -823,13 +783,7 @@ init() { _mount_rootfs _write_kernel_update_hook _store_driver_config - - echo "Done, now waiting for signal" - sleep infinity & - trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM - trap - EXIT - while true; do wait $! || continue; done - exit 0 + _wait_for_signal } update() { From 34e89fc0c6bed7112b55a71c6b59ca82fb40b36f Mon Sep 17 00:00:00 2001 From: Karthik Vetrivel Date: Thu, 11 Dec 2025 21:30:40 +0000 Subject: [PATCH 7/7] refactor: address PR review comments for driver install scripts Signed-off-by: Karthik Vetrivel --- rhel9/common.sh | 9 +-- rhel9/nvidia-driver | 37 ++++++------ ubuntu22.04/nvidia-driver | 119 +++++++++++++++++++------------------- 3 files changed, 83 insertions(+), 82 deletions(-) diff --git a/rhel9/common.sh b/rhel9/common.sh index 22d81857..1f3ffa95 100755 --- a/rhel9/common.sh +++ b/rhel9/common.sh @@ -53,14 +53,15 @@ _read_conf_file() { } # Build driver configuration for state comparison +# Note: Variables are expected to be set by the sourcing script (nvidia-driver) _build_driver_config() { cat < /dev/null" EXIT - fi + trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT echo "Compiling NVIDIA driver kernel modules..." cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} @@ -444,6 +442,21 @@ _unload_driver() { fi fi + if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then + echo "Stopping NVIDIA topology daemon..." + local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA topology daemon" >&2 + return 1 + fi + fi + if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then echo "Stopping NVIDIA fabric manager daemon..." local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid) @@ -664,19 +677,6 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } -_ensure_persistence() { - local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid - if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then - return 0 - fi - - if command -v nvidia-persistenced >/dev/null 2>&1; then - nvidia-persistenced --persistence-mode || true - else - echo "nvidia-persistenced not found; continuing without persistence" - fi -} - _start_daemons() { echo "Starting NVIDIA persistence daemon..." nvidia-persistenced --persistence-mode @@ -719,7 +719,7 @@ _start_daemons() { } _store_driver_config() { - local config_file="/run/nvidia/driver-config.state" + local config_file="${RUN_DIR}/driver-config.state" echo "Storing driver configuration state..." _build_driver_config > "$config_file" echo "Driver configuration stored at $config_file" @@ -737,7 +737,6 @@ _wait_for_signal() { _userspace_only_install() { echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install" _unmount_rootfs - _update_package_cache cd /drivers [ ! -d "NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}" ] && sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x diff --git a/ubuntu22.04/nvidia-driver b/ubuntu22.04/nvidia-driver index 6e42cada..f645aa2e 100755 --- a/ubuntu22.04/nvidia-driver +++ b/ubuntu22.04/nvidia-driver @@ -8,6 +8,7 @@ PID_FILE=${RUN_DIR}/${0##*/}.pid DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"} KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver NUM_VGPU_DEVICES=0 +DRIVER_TYPE="${DRIVER_TYPE:-passthrough}" GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" USE_HOST_MOFED="${USE_HOST_MOFED:-false}" NVIDIA_MODULE_PARAMS=() @@ -344,44 +345,7 @@ _load_driver() { set +o xtrace -o nounset fi - echo "Starting NVIDIA persistence daemon..." - nvidia-persistenced --persistence-mode - - if [ "${DRIVER_TYPE}" = "vgpu" ]; then - echo "Copying gridd.conf..." - cp /drivers/gridd.conf /etc/nvidia/gridd.conf - if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then - echo "Copying ClientConfigToken..." - mkdir -p /etc/nvidia/ClientConfigToken/ - cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ - fi - - echo "Starting nvidia-gridd.." - LD_LIBRARY_PATH=/usr/lib/$DRIVER_ARCH-linux-gnu/nvidia/gridd nvidia-gridd - - # Start virtual topology daemon - _start_vgpu_topology_daemon - fi - - if _assert_nvlink5_system; then - _ensure_nvlink5_prerequisites || return 1 - echo "Starting NVIDIA fabric manager daemon for NVLink5+..." - - fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg - fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid - nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf - nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid - /usr/bin/nvidia-fabricmanager-start.sh --mode start \ - --fm-config-file $fm_config_file \ - --fm-pid-file $fm_pid_file \ - --nvlsm-config-file $nvlsm_config_file \ - --nvlsm-pid-file $nvlsm_pid_file - - # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches - elif _assert_nvswitch_system; then - echo "Starting NVIDIA fabric manager daemon..." - nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg - fi + _start_daemons return 0 } @@ -425,6 +389,21 @@ _unload_driver() { fi fi + if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then + echo "Stopping NVIDIA topology daemon..." + local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA topology daemon" >&2 + return 1 + fi + fi + if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then echo "Stopping NVIDIA fabric manager daemon..." local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid) @@ -639,16 +618,44 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } -_ensure_persistenced() { - local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid - if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then - return 0 +_start_daemons() { + echo "Starting NVIDIA persistence daemon..." + nvidia-persistenced --persistence-mode + + if [ "${DRIVER_TYPE}" = "vgpu" ]; then + echo "Copying gridd.conf..." + cp /drivers/gridd.conf /etc/nvidia/gridd.conf + if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then + echo "Copying ClientConfigToken..." + mkdir -p /etc/nvidia/ClientConfigToken/ + cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ + fi + + echo "Starting nvidia-gridd.." + LD_LIBRARY_PATH=/usr/lib/$DRIVER_ARCH-linux-gnu/nvidia/gridd nvidia-gridd + + # Start virtual topology daemon + _start_vgpu_topology_daemon fi - if command -v nvidia-persistenced >/dev/null 2>&1; then - nvidia-persistenced --persistence-mode || true - else - echo "nvidia-persistenced not found; continuing without persistence" + if _assert_nvlink5_system; then + _ensure_nvlink5_prerequisites || return 1 + echo "Starting NVIDIA fabric manager daemon for NVLink5+..." + + fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg + fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid + nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf + nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid + /usr/bin/nvidia-fabricmanager-start.sh --mode start \ + --fm-config-file $fm_config_file \ + --fm-pid-file $fm_pid_file \ + --nvlsm-config-file $nvlsm_config_file \ + --nvlsm-pid-file $nvlsm_pid_file + + # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches + elif _assert_nvswitch_system; then + echo "Starting NVIDIA fabric manager daemon..." + nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg fi } @@ -660,7 +667,7 @@ _read_conf_file() { _build_driver_config() { cat < "$config_file" echo "Driver configuration stored at $config_file" @@ -700,9 +707,9 @@ _install_userspace_components() { --x-sysconfig-path=/tmp/null } -_copy_kernel_module_sources() { +_move_kernel_module_sources() { mkdir -p /usr/src/nvidia-${DRIVER_VERSION} - cp -r LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/ + mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/ sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest } @@ -743,14 +750,11 @@ init() { if [ "${current_config}" = "${stored_config}" ]; then echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install" _unmount_rootfs - _update_package_cache - _resolve_kernel_version || exit 1 - _install_prerequisites _install_userspace_components _resolve_kernel_type || exit 1 - _copy_kernel_module_sources + _move_kernel_module_sources _mount_rootfs - _ensure_persistenced + _start_daemons _write_kernel_update_hook _store_driver_config echo "Userspace-only install complete" @@ -764,10 +768,7 @@ init() { _install_userspace_components _resolve_kernel_type || exit 1 - # Move (not copy) kernel module sources since this is the full install path - mkdir -p /usr/src/nvidia-${DRIVER_VERSION} - mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/ - sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest + _move_kernel_module_sources if _kernel_requires_package; then _update_ca_certificates