diff --git a/CHANGELOG.md b/CHANGELOG.md index 384e2e4789..18287a6318 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,20 @@ This file is used to list changes made in each version of the AWS ParallelCluste **CHANGES** - Ubuntu 20.04 is no longer supported. -- Upgrade Slurm to version 24.11.5. +- Upgrade Slurm to version 24.11.6 (from 24.05.8). +- Upgrade EFA installer to 1.42.0 (from 1.41.0). + - Efa-driver: efa-2.15.3-1 + - Efa-config: efa-config-1.18-1 + - Efa-profile: efa-profile-1.7-1 + - Libfabric-aws: libfabric-aws-2.1.0-3 + - Rdma-core: rdma-core-57.0-1 + - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.6-11 +- Upgrade Cinc Client to version 18.4.12 (from 18.2.7). +- Upgrade NVIDIA driver to version 570.172.08 (from 570.86.15) for all OSs except AL2. +- Upgrade CUDA Toolkit to version 12.8.1 (from 12.8.0) for all OSs except AL2. +- Upgrade DCGM to version 4.2.3 (from 3.3.6) for all OSs except AL2. +- Upgrade Python to 3.12.11 (from 3.12.8) for all OSs except AL2. +- Upgrade Intel MPI Library to 2021.16.0 (from 2021.13.1). - Addressed cluster id mismatch known issue by deleting the file `/var/spool/slurm.state/clustername` before configuring Slurm accounting. - Upgrade DCV to version 2024.0-19030. - Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management. diff --git a/cookbooks/aws-parallelcluster-awsbatch/test/controls/awsbatch_virtualenv_spec.rb b/cookbooks/aws-parallelcluster-awsbatch/test/controls/awsbatch_virtualenv_spec.rb index 7b877b10f5..5b445b7cd8 100644 --- a/cookbooks/aws-parallelcluster-awsbatch/test/controls/awsbatch_virtualenv_spec.rb +++ b/cookbooks/aws-parallelcluster-awsbatch/test/controls/awsbatch_virtualenv_spec.rb @@ -13,7 +13,7 @@ pyenv_dir = "#{base_dir}/pyenv" control 'tag:install_awsbatch_virtualenv_created' do - python_version = os_properties.alinux2? ? '3.9.20' : '3.12.8' + python_version = os_properties.alinux2? ? 
'3.9.20' : '3.12.11' title "awsbatch virtualenv should be created on #{python_version}" only_if { !os_properties.redhat? } diff --git a/cookbooks/aws-parallelcluster-computefleet/kitchen.computefleet-config.yml b/cookbooks/aws-parallelcluster-computefleet/kitchen.computefleet-config.yml index 3bcf169b8b..d127d77b6b 100644 --- a/cookbooks/aws-parallelcluster-computefleet/kitchen.computefleet-config.yml +++ b/cookbooks/aws-parallelcluster-computefleet/kitchen.computefleet-config.yml @@ -31,7 +31,7 @@ suites: attributes: cluster: custom_node_package: https://github.com/aws/aws-parallelcluster-node/archive/develop.tar.gz - python-version: 3.12.8 + python-version: 3.12.11 node_virtualenv_path: /opt/parallelcluster/pyenv/versions/node_virtualenv - name: fleet_status run_list: diff --git a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb index 366356c30e..77f8da4d51 100644 --- a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb @@ -70,8 +70,8 @@ default['cluster']['head_node_private_ip'] = nil -default['cluster']['efa']['version'] = '1.41.0' -default['cluster']['efa']['sha256'] = '3506354cdfbe31ff552fe75f5d0d9bb7efd29cf79bd99457347d29c751c38f9f' +default['cluster']['efa']['version'] = '1.42.0' +default['cluster']['efa']['sha256'] = '4114fe612905ee05083ae5cb391a00a012510f3abfecc642d86c9a5ae4be9008' default['cluster']['efs']['version'] = '2.3.1' default['cluster']['efs']['sha256'] = 'ced12f82e76f9740476b63f30c49bd76cc00b6375e12a9f5f7ba852635c49e15' diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb index 1fa04cdcc9..bccd275b8c 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb +++ 
b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb @@ -2,8 +2,8 @@ # parallelcluster default source dir defined in attributes source_dir = '/opt/parallelcluster/sources' -efa_version = '1.41.0' -efa_checksum = '3506354cdfbe31ff552fe75f5d0d9bb7efd29cf79bd99457347d29c751c38f9f' +efa_version = '1.42.0' +efa_checksum = '4114fe612905ee05083ae5cb391a00a012510f3abfecc642d86c9a5ae4be9008' class ConvergeEfa def self.setup(chef_run, efa_version: nil, efa_checksum: nil) diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/cfn_bootstrap_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/cfn_bootstrap_spec.rb index 132c3a94ed..ff4b88c50b 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/cfn_bootstrap_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/cfn_bootstrap_spec.rb @@ -13,7 +13,7 @@ pyenv_dir = "#{base_dir}/pyenv" control 'tag:install_cfnbootstrap_virtualenv_created' do - cfn_python_version = os_properties.alinux2? ? '3.9.20' : '3.12.8' + cfn_python_version = os_properties.alinux2? ? '3.9.20' : '3.12.11' title "cfnbootstrap virtualenv should be created on #{cfn_python_version}" only_if { !os_properties.redhat_on_docker? 
} diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index 8201c2c04e..533cde463e 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -17,9 +17,10 @@ # NVidia default['cluster']['nvidia']['enabled'] = 'no' default['cluster']['nvidia']['driver_version'] = '570.172.08' -default['cluster']['nvidia']['dcgm_version'] = '3.3.6' +default['cluster']['nvidia']['dcgm_version'] = '4.2.3-2' if platform?('amazon') && node['platform_version'] == "2" default['cluster']['nvidia']['driver_version'] = '550.127.08' + default['cluster']['nvidia']['dcgm_version'] = '3.3.6-1' end # nvidia-imex diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb index 3c7ba588bb..a311ab0ba9 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb @@ -20,9 +20,9 @@ # Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive # Cuda installer naming: cuda_11.8.0_520.61.05_linux cuda_version = '12.8' -cuda_patch = '0' +cuda_patch = '1' cuda_complete_version = "#{cuda_version}.#{cuda_patch}" -cuda_version_suffix = '570.86.10' +cuda_version_suffix = '570.124.06' cuda_samples_version = '12.8' if platform?('amazon') && node['platform_version'] == "2" cuda_version = '12.4' diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/intel_mpi.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/intel_mpi.rb index 85917d3f6b..b868b3ff69 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/intel_mpi.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/intel_mpi.rb @@ -16,7 +16,7 @@ # limitations under the License. intelmpi_supported = !arm_instance? 
-intelmpi_version = '2021.13' +intelmpi_version = '2021.16' node.default['conditions']['intel_mpi_supported'] = intelmpi_supported node.default['cluster']['intelmpi']['version'] = intelmpi_version @@ -25,9 +25,9 @@ return unless intelmpi_supported -intelmpi_full_version = "#{intelmpi_version}.1.769" +intelmpi_full_version = "#{intelmpi_version}.0.443" intelmpi_installation_path = "/opt/intel/mpi/#{intelmpi_version}" -intelmpi_installer = "l_mpi_oneapi_p_#{intelmpi_full_version}_offline.sh" +intelmpi_installer = "intel-mpi-#{intelmpi_full_version}_offline.sh" intelmpi_installer_path = "#{node['cluster']['sources_dir']}/#{intelmpi_installer}" intelmpi_installer_url = "#{node['cluster']['artifacts_s3_url']}/impi/#{intelmpi_installer}" intelmpi_qt_version = '6.5.3' diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb index 5ca316daad..e4882101ad 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb @@ -13,34 +13,45 @@ # See the License for the specific language governing permissions and limitations under the License. 
action :install_package do - remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.deb" do - source "#{dcgm_url}" - mode '0644' - retries 3 - retry_delay 5 - action :create_if_missing - end + packages_urls_list = if package_version.start_with?("3.") + [dcgm_package] + else + [dcgm4_core_package, dcgm4_package] + end + packages_urls_list.each do |package| + remote_file "#{node['cluster']['sources_dir']}/#{package}-#{package_version}.deb" do + source "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{package}_#{package_version}_#{arch_suffix}.deb" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing + end - bash "Install #{dcgm_package}" do - user 'root' - cwd node['cluster']['sources_dir'] - code <<-DCGM_INSTALL - set -e - dpkg -i #{dcgm_package}-#{package_version}.deb - DCGM_INSTALL - retries 3 - retry_delay 5 + bash "Install #{package}" do + user 'root' + cwd node['cluster']['sources_dir'] + code <<-DCGM_INSTALL + set -e + dpkg -i #{package}-#{package_version}.deb + DCGM_INSTALL + retries 3 + retry_delay 5 + end end end -def dcgm_url - "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}_#{package_version}_#{arch_suffix}.deb" -end - def dcgm_package 'datacenter-gpu-manager' end +def dcgm4_package + "#{dcgm_package}-4-cuda12" +end + +def dcgm4_core_package + "#{dcgm_package}-4-core" +end + def arch_suffix arm_instance? ? 
'arm64' : 'amd64' end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb index 997762acd1..c22f791e39 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb @@ -13,34 +13,47 @@ # See the License for the specific language governing permissions and limitations under the License. action :install_package do - remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.rpm" do - source "#{dcgm_url}" - mode '0644' - retries 3 - retry_delay 5 - action :create_if_missing + if package_version.start_with?("3.") + packages_urls_list = [dcgm_package] + package_url_separator = "-" + else + packages_urls_list = [dcgm4_core_package, dcgm4_package] + package_url_separator = "." end + packages_urls_list.each do |package| + remote_file "#{node['cluster']['sources_dir']}/#{package}-#{package_version}.rpm" do + source "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{package}-#{package_version}#{package_url_separator}#{arch_suffix}.rpm" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing + end - bash "Install #{dcgm_package}" do - user 'root' - cwd node['cluster']['sources_dir'] - code <<-DCGM_INSTALL - set -e - yum install -y #{dcgm_package}-#{package_version}.rpm - DCGM_INSTALL - retries 3 - retry_delay 5 + bash "Install #{package}" do + user 'root' + cwd node['cluster']['sources_dir'] + code <<-DCGM_INSTALL + set -e + yum install -y #{package}-#{package_version}.rpm + DCGM_INSTALL + retries 3 + retry_delay 5 + end end end -def dcgm_url - "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}-#{package_version}-1-#{arch_suffix}.rpm" -end - def dcgm_package 'datacenter-gpu-manager' end +def 
dcgm4_package + "#{dcgm_package}-4-cuda12" +end + +def dcgm4_core_package + "#{dcgm_package}-4-core" +end + def arch_suffix arm_instance? ? 'aarch64' : 'x86_64' end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb index 297d1ae932..27a001ff05 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb @@ -2,9 +2,9 @@ describe 'aws-parallelcluster-platform::cuda' do cached(:cuda_version) { '12.8' } - cached(:cuda_patch) { '0' } + cached(:cuda_patch) { '1' } cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" } - cached(:cuda_version_suffix) { '570.86.10' } + cached(:cuda_version_suffix) { '570.124.06' } context 'when nvidia not enabled' do cached(:chef_run) do diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/intel_mpi_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/intel_mpi_spec.rb index f74bc6fb08..1023c7ec45 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/intel_mpi_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/intel_mpi_spec.rb @@ -20,8 +20,8 @@ end it 'fetches intel mpi installer script' do - is_expected.to create_remote_file("#{source_dir}/l_mpi_oneapi_p_2021.13.1.769_offline.sh").with( - source: "https://#{aws_region}-aws-parallelcluster.s3.#{aws_region}.test_aws_domain/archives/impi/l_mpi_oneapi_p_2021.13.1.769_offline.sh", + is_expected.to create_remote_file("#{source_dir}/intel-mpi-2021.16.0.443_offline.sh").with( + source: "https://#{aws_region}-aws-parallelcluster.s3.#{aws_region}.test_aws_domain/archives/impi/intel-mpi-2021.16.0.443_offline.sh", mode: '0744', retries: 3, retry_delay: 5 @@ -31,25 +31,25 @@ it 'installs intel mpi' do is_expected.to run_bash('install intel mpi').with( cwd: source_dir, - creates: '/opt/intel/mpi/2021.13' - 
).with_code(%r{chmod +x l_mpi_oneapi_p_2021.13.1.769_offline.sh --remove-extracted-files yes -a --silent --eula accept --install-dir /opt/intel}) - .with_code(/rm -f l_mpi_oneapi_p_2021.13.1.769_offline.sh/) + creates: '/opt/intel/mpi/2021.16' + ).with_code(%r{chmod +x intel-mpi-2021.16.0.443_offline.sh --remove-extracted-files yes -a --silent --eula accept --install-dir /opt/intel}) + .with_code(/rm -f intel-mpi-2021.16.0.443_offline.sh/) end it 'appends intel module file dir to modules config' do is_expected.to append_to_config_modules('append intel modules file dir to modules conf') - .with_line('/opt/intel/mpi/2021.13/etc/modulefiles/') + .with_line('/opt/intel/mpi/2021.16/etc/modulefiles/') end it 'renames intel mpi module' do is_expected.to run_execute('rename intel mpi modules file name').with( - command: "mv /opt/intel/mpi/2021.13/etc/modulefiles/mpi /opt/intel/mpi/2021.13/etc/modulefiles/intelmpi", - creates: '/opt/intel/mpi/2021.13/etc/modulefiles/intelmpi' + command: "mv /opt/intel/mpi/2021.16/etc/modulefiles/mpi /opt/intel/mpi/2021.16/etc/modulefiles/intelmpi", + creates: '/opt/intel/mpi/2021.16/etc/modulefiles/intelmpi' ) end it 'adds Qt source file' do - is_expected.to create_template("/opt/intel/mpi/2021.13/qt_source_code.txt").with( + is_expected.to create_template("/opt/intel/mpi/2021.16/qt_source_code.txt").with( source: 'intel_mpi/qt_source_code.erb', owner: 'root', group: 'root', diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb index 3cf2779901..08e45803d1 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb @@ -170,7 +170,8 @@ def self.setup(chef_run, nvidia_enabled: nil) end else it 'installs datacenter gpu manager' do - is_expected.to run_bash('Install datacenter-gpu-manager') + is_expected.to 
run_bash('Install datacenter-gpu-manager-4-core') + is_expected.to run_bash('Install datacenter-gpu-manager-4-cuda12') end end end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb index 15ddf1c512..29d8179436 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb @@ -14,8 +14,13 @@ ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !instance.custom_ami? && (!os_properties.arm? || !(os_properties.alinux2? || os_properties.centos?)) end - - describe package('datacenter-gpu-manager') do - it { should be_installed } + if os_properties.alinux2? + describe package('datacenter-gpu-manager') do + it { should be_installed } + end + else + describe package('datacenter-gpu-manager-4-cuda12') do + it { should be_installed } + end end end diff --git a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb index 9af53e5702..03f07ed07d 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb @@ -1,5 +1,5 @@ # Python Version -default['cluster']['python-version'] = '3.12.8' +default['cluster']['python-version'] = '3.12.11' default['cluster']['python-major-minor-version'] = '3.12' if platform?('amazon') && node['platform_version'] == "2" default['cluster']['python-version'] = '3.9.20' diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb index 79d0777c59..e14f0ae6f3 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb @@ -1,8 +1,8 @@ # Slurm -default['cluster']['slurm']['version'] = '24-11-5-1' +default['cluster']['slurm']['version'] = 
'24-11-6-1' default['cluster']['slurm']['commit'] = '' default['cluster']['slurm']['branch'] = '' -default['cluster']['slurm']['sha256'] = 'e1a5547edd212c38b5e3230a284133f777b32746551f094aaa81cc4af375e332' +default['cluster']['slurm']['sha256'] = '282708483326f381eb001a14852a1a82e65e18f37b62b7a5f4936c0ed443b600' default['cluster']['slurm']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/slurm" # Munge default['cluster']['munge']['munge_version'] = '0.5.16'