Skip to content

Commit 11cce5a

Browse files
authored
Merge branch 'develop' into develop
2 parents 311dcae + 0993d8c commit 11cce5a

File tree

20 files changed

+1025
-7
lines changed

20 files changed

+1025
-7
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
1515
- Addressed cluster id mismatch known issue by deleting the file `/var/spool/slurm.state/clustername` before configuring Slurm accounting.
1616
- Upgrade DCV to version 2024.0-19030.
1717
- Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management.
18+
- Install nvidia-imex for all OSs except AL2.
1819

1920
**BUG FIXES**
2021
- Fix a race condition in CloudWatch Agent startup that could cause node bootstrap failures.
@@ -38,6 +39,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
3839
- Libfabric-aws: libfabric-aws-2.1.0-1
3940
- Rdma-core: rdma-core-57.0-1
4041
- Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.6
42+
- Upgrade NVIDIA driver to version 570.172.08 (from 570.86.15) for all OSs except AL2.
4143

4244
**BUG FIXES**
4345
- Fix a bug in the installation of ARM Performance Library that was causing the build image to fail in isolated environments.

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,15 @@
1616

1717
# NVidia
1818
default['cluster']['nvidia']['enabled'] = 'no'
19-
default['cluster']['nvidia']['driver_version'] = '570.86.15'
19+
default['cluster']['nvidia']['driver_version'] = '570.172.08'
2020
default['cluster']['nvidia']['dcgm_version'] = '3.3.6'
2121
if platform?('amazon') && node['platform_version'] == "2"
2222
default['cluster']['nvidia']['driver_version'] = '550.127.08'
2323
end
2424

25+
# nvidia-imex
26+
default['cluster']['nvidia']['imex']['shared_dir'] = "#{node['cluster']['shared_dir']}/nvidia-imex"
27+
2528
# DCV
2629
default['cluster']['dcv']['authenticator']['user'] = "dcvextauth"
2730
default['cluster']['dcv']['authenticator']['user_id'] = node['cluster']['reserved_base_uid'] + 3

cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,19 @@ def is_process_running(process_name)
1818

1919
!ps.stdout.strip.empty?
2020
end
21+
22+
#
23+
# Get the count of PCI devices (NVSwitches) matching the given device id
24+
#
25+
def get_nvswitch_count(device_id)
26+
shell_out("lspci -d #{device_id} | wc -l").stdout.strip.to_i
27+
end
28+
29+
def get_device_ids
30+
# A100 (P4), H100 (P5), B200 (P6) and GB200 (P6e) systems have NVSwitches
31+
# NVSwitch device id is 10de:1af1 for P4 instance
32+
# NVSwitch device id is 10de:22a3 for P5 instance
33+
# NVSwitch device id is 10de:2901 for P6 instance
34+
# NVSwitch device id is 10de:2941 for P6e instance
35+
{ 'a100' => '10de:1af1', 'h100' => '10de:22a3', 'b200' => '10de:2901', 'gb200' => '10de:2941' }
36+
end

cookbooks/aws-parallelcluster-platform/recipes/config/nvidia_config.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,7 @@
2424
end
2525

2626
include_recipe "aws-parallelcluster-platform::nvidia_uvm"
27+
28+
nvidia_imex 'Configure nvidia-imex' do
29+
action :configure
30+
end

cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,5 @@
2424
fabric_manager 'Install Nvidia Fabric Manager'
2525

2626
nvidia_dcgm 'install Nvidia datacenter-gpu-manager'
27+
28+
nvidia_imex 'Install nvidia-imex'

cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,12 +54,8 @@ def _nvidia_driver_version
5454

5555
# Get number of nv switches
5656
def get_nvswitches
57-
# A100 (P4), H100(P5) and B200(P6) systems have NVSwitches
58-
# NVSwitch device id is 10de:1af1 for P4 instance
59-
# NVSwitch device id is 10de:22a3 for P5 instance
60-
# NVSwitch device id is 10de:2901 for P6 instance
6157
# We sum the count for all these device IDs as the output of the lspci command will be >0
6258
# for only one device ID based on the instance type
63-
nvswitch_device_ids = ['10de:1af1', '10de:22a3', '10de:2901']
64-
nvswitch_device_ids.sum { |id| shell_out("lspci -d #{id} | wc -l").stdout.strip.to_i }
59+
nvswitch_device_ids = get_device_ids.values
60+
nvswitch_device_ids.sum { |id| get_nvswitch_count(id) }
6561
end
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :nvidia_imex, platform: 'amazon' do |node|
16+
node['platform_version'].to_i == 2023
17+
end
18+
19+
use 'partial/_nvidia_imex_common.rb'
20+
use 'partial/_nvidia_imex_rhel.rb'
21+
22+
def platform
23+
"amzn#{node['platform_version'].to_i}"
24+
end
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :nvidia_imex, platform: 'amazon', platform_version: '2'
16+
17+
use 'partial/_nvidia_imex_common.rb'
18+
use 'partial/_nvidia_imex_rhel.rb'
19+
20+
def imex_installed?
21+
# We do not install NVIDIA-Imex for Alinux2 due to restriction on NVIDIA driver
22+
true
23+
end
24+
25+
action :configure do
26+
# Do nothing
27+
end
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :nvidia_imex, platform: 'redhat' do |node|
16+
node['platform_version'].to_i >= 8
17+
end
18+
19+
use 'partial/_nvidia_imex_common.rb'
20+
use 'partial/_nvidia_imex_rhel.rb'
21+
22+
def platform
23+
"rhel#{node['platform_version'].to_i}"
24+
end
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :nvidia_imex, platform: 'rocky' do |node|
16+
node['platform_version'].to_i >= 8
17+
end
18+
19+
use 'partial/_nvidia_imex_common.rb'
20+
use 'partial/_nvidia_imex_rhel.rb'
21+
22+
def platform
23+
"rhel#{node['platform_version'].to_i}"
24+
end

0 commit comments

Comments
 (0)