Upgrade dependencies #3000

Merged
15 changes: 14 additions & 1 deletion CHANGELOG.md
@@ -11,7 +11,20 @@ This file is used to list changes made in each version of the AWS ParallelCluster

**CHANGES**
- Ubuntu 20.04 is no longer supported.
- Upgrade Slurm to version 24.11.5.
- Upgrade Slurm to version 24.11.6 (from 24.05.8).
- Upgrade EFA installer to 1.42.0 (from 1.41.0).
- Efa-driver: efa-2.15.3-1
- Efa-config: efa-config-1.18-1
- Efa-profile: efa-profile-1.7-1
- Libfabric-aws: libfabric-aws-2.1.0-3
- Rdma-core: rdma-core-57.0-1
- Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.6-11
- Upgrade Cinc Client to version 18.4.12 (from 18.2.7).
- Upgrade NVIDIA driver to version 570.172.08 (from 570.86.15) for all OSs except AL2.
- Upgrade CUDA Toolkit to version 12.8.1 (from 12.8.0) for all OSs except AL2.
- Upgrade DCGM to version 4.2.3 (from 3.3.6) for all OSs except AL2.
- Upgrade Python to 3.12.11 (from 3.12.8) for all OSs except AL2.
- Upgrade Intel MPI Library to 2021.16.0 (from 2021.13.1).
- Addressed the known cluster id mismatch issue by deleting the file `/var/spool/slurm.state/clustername` before configuring Slurm accounting (a sketch of this step follows the list).
- Upgrade DCV to version 2024.0-19030.
- Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management.
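
The cluster id mismatch entry above is, in essence, a pre-step that clears Slurm's persisted cluster name so it cannot conflict with the `ClusterName` written during accounting configuration. A minimal Chef sketch of that idea (resource placement and guard are illustrative, not the exact recipe in this PR):

```ruby
# Illustrative sketch: drop the stale persisted cluster name before slurmdbd/accounting
# setup, so Slurm regenerates it from the configured ClusterName on the next start.
file '/var/spool/slurm.state/clustername' do
  action :delete
  only_if { ::File.exist?('/var/spool/slurm.state/clustername') }
end
```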
@@ -13,7 +13,7 @@
pyenv_dir = "#{base_dir}/pyenv"

control 'tag:install_awsbatch_virtualenv_created' do
python_version = os_properties.alinux2? ? '3.9.20' : '3.12.8'
python_version = os_properties.alinux2? ? '3.9.20' : '3.12.11'
title "awsbatch virtualenv should be created on #{python_version}"
only_if { !os_properties.redhat? }

@@ -31,7 +31,7 @@ suites:
attributes:
cluster:
custom_node_package: https://github.com/aws/aws-parallelcluster-node/archive/develop.tar.gz
python-version: 3.12.8
python-version: 3.12.11
node_virtualenv_path: /opt/parallelcluster/pyenv/versions/node_virtualenv
- name: fleet_status
run_list:
@@ -70,8 +70,8 @@

default['cluster']['head_node_private_ip'] = nil

default['cluster']['efa']['version'] = '1.41.0'
default['cluster']['efa']['sha256'] = '3506354cdfbe31ff552fe75f5d0d9bb7efd29cf79bd99457347d29c751c38f9f'
default['cluster']['efa']['version'] = '1.42.0'
default['cluster']['efa']['sha256'] = '4114fe612905ee05083ae5cb391a00a012510f3abfecc642d86c9a5ae4be9008'

default['cluster']['efs']['version'] = '2.3.1'
default['cluster']['efs']['sha256'] = 'ced12f82e76f9740476b63f30c49bd76cc00b6375e12a9f5f7ba852635c49e15'
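
For context, the two EFA attributes bumped above are the pin used when the installer tarball is downloaded and verified. A minimal sketch of how a recipe would typically consume them (the download URL and destination path are assumptions, not taken from this PR):

```ruby
# Illustrative only: fetch the pinned EFA installer and let Chef verify its SHA-256.
efa_version  = node['cluster']['efa']['version']   # '1.42.0' after this change
efa_checksum = node['cluster']['efa']['sha256']

remote_file "/opt/parallelcluster/sources/aws-efa-installer-#{efa_version}.tar.gz" do
  source "https://efa-installer.amazonaws.com/aws-efa-installer-#{efa_version}.tar.gz"
  checksum efa_checksum          # Chef compares the download against the expected SHA-256
  mode '0644'
  retries 3
  retry_delay 5
  action :create_if_missing
end
```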
@@ -2,8 +2,8 @@

# parallelcluster default source dir defined in attributes
source_dir = '/opt/parallelcluster/sources'
efa_version = '1.41.0'
efa_checksum = '3506354cdfbe31ff552fe75f5d0d9bb7efd29cf79bd99457347d29c751c38f9f'
efa_version = '1.42.0'
efa_checksum = '4114fe612905ee05083ae5cb391a00a012510f3abfecc642d86c9a5ae4be9008'

class ConvergeEfa
def self.setup(chef_run, efa_version: nil, efa_checksum: nil)
@@ -13,7 +13,7 @@
pyenv_dir = "#{base_dir}/pyenv"

control 'tag:install_cfnbootstrap_virtualenv_created' do
cfn_python_version = os_properties.alinux2? ? '3.9.20' : '3.12.8'
cfn_python_version = os_properties.alinux2? ? '3.9.20' : '3.12.11'
title "cfnbootstrap virtualenv should be created on #{cfn_python_version}"
only_if { !os_properties.redhat_on_docker? }

@@ -17,9 +17,10 @@
# NVidia
default['cluster']['nvidia']['enabled'] = 'no'
default['cluster']['nvidia']['driver_version'] = '570.172.08'
default['cluster']['nvidia']['dcgm_version'] = '3.3.6'
default['cluster']['nvidia']['dcgm_version'] = '4.2.3-2'
if platform?('amazon') && node['platform_version'] == "2"
default['cluster']['nvidia']['driver_version'] = '550.127.08'
default['cluster']['nvidia']['dcgm_version'] = '3.3.6-1'
end

# nvidia-imex
@@ -20,9 +20,9 @@
# Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive
# Cuda installer naming: cuda_11.8.0_520.61.05_linux
cuda_version = '12.8'
cuda_patch = '0'
cuda_patch = '1'
cuda_complete_version = "#{cuda_version}.#{cuda_patch}"
cuda_version_suffix = '570.86.10'
cuda_version_suffix = '570.124.06'
cuda_samples_version = '12.8'
if platform?('amazon') && node['platform_version'] == "2"
cuda_version = '12.4'
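
Per the naming comment at the top of this recipe, the bumped patch and driver suffix combine into the installer file name the recipe fetches. A small sketch of that assembly for the non-AL2 values (the `.run` extension mirrors NVIDIA's published runfile installers and is an assumption about the surrounding recipe):

```ruby
# Illustrative only (non-AL2 values): the patch bump and new driver suffix combine into
#   cuda_12.8.1_570.124.06_linux.run   (previously cuda_12.8.0_570.86.10_linux.run)
cuda_installer = "cuda_#{cuda_complete_version}_#{cuda_version_suffix}_linux.run"
```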
@@ -16,7 +16,7 @@
# limitations under the License.

intelmpi_supported = !arm_instance?
intelmpi_version = '2021.13'
intelmpi_version = '2021.16'

node.default['conditions']['intel_mpi_supported'] = intelmpi_supported
node.default['cluster']['intelmpi']['version'] = intelmpi_version
@@ -25,9 +25,9 @@

return unless intelmpi_supported

intelmpi_full_version = "#{intelmpi_version}.1.769"
intelmpi_full_version = "#{intelmpi_version}.0.443"
intelmpi_installation_path = "/opt/intel/mpi/#{intelmpi_version}"
intelmpi_installer = "l_mpi_oneapi_p_#{intelmpi_full_version}_offline.sh"
intelmpi_installer = "intel-mpi-#{intelmpi_full_version}_offline.sh"
intelmpi_installer_path = "#{node['cluster']['sources_dir']}/#{intelmpi_installer}"
intelmpi_installer_url = "#{node['cluster']['artifacts_s3_url']}/impi/#{intelmpi_installer}"
intelmpi_qt_version = '6.5.3'
@@ -13,34 +13,45 @@
# See the License for the specific language governing permissions and limitations under the License.

action :install_package do
remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.deb" do
source "#{dcgm_url}"
mode '0644'
retries 3
retry_delay 5
action :create_if_missing
end
packages_urls_list = if package_version.start_with?("3.")
[dcgm_package]
else
[dcgm4_core_package, dcgm4_package]
end
packages_urls_list.each do |package|
remote_file "#{node['cluster']['sources_dir']}/#{package}-#{package_version}.deb" do
source "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{package}_#{package_version}_#{arch_suffix}.deb"
mode '0644'
retries 3
retry_delay 5
action :create_if_missing
end

bash "Install #{dcgm_package}" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-DCGM_INSTALL
set -e
dpkg -i #{dcgm_package}-#{package_version}.deb
DCGM_INSTALL
retries 3
retry_delay 5
bash "Install #{package}" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-DCGM_INSTALL
set -e
dpkg -i #{package}-#{package_version}.deb
DCGM_INSTALL
retries 3
retry_delay 5
end
end
end

def dcgm_url
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}_#{package_version}_#{arch_suffix}.deb"
end

def dcgm_package
'datacenter-gpu-manager'
end

def dcgm4_package
"#{dcgm_package}-4-cuda12"
end

def dcgm4_core_package
"#{dcgm_package}-4-core"
end

def arch_suffix
arm_instance? ? 'arm64' : 'amd64'
end
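
With the new branch above, a DCGM 4.x version resolves to two Debian packages instead of one. For example, with the 4.2.3-2 pin from the attributes in this PR, the `source` interpolation yields the following artifact names (a small standalone sketch, not code from the resource itself):

```ruby
# Illustrative only: how the new branch maps a DCGM version to .deb artifact names.
package_version = '4.2.3-2'   # pin from the attributes in this PR
arch_suffix     = 'amd64'     # x86_64 Ubuntu instance
packages = if package_version.start_with?('3.')
             ['datacenter-gpu-manager']
           else
             ['datacenter-gpu-manager-4-core', 'datacenter-gpu-manager-4-cuda12']
           end
packages.each { |pkg| puts "#{pkg}_#{package_version}_#{arch_suffix}.deb" }
# => datacenter-gpu-manager-4-core_4.2.3-2_amd64.deb
# => datacenter-gpu-manager-4-cuda12_4.2.3-2_amd64.deb
```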
@@ -13,34 +13,47 @@
# See the License for the specific language governing permissions and limitations under the License.

action :install_package do
remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.rpm" do
source "#{dcgm_url}"
mode '0644'
retries 3
retry_delay 5
action :create_if_missing
if package_version.start_with?("3.")
packages_urls_list = [dcgm_package]
package_url_separator = "-"
else
packages_urls_list = [dcgm4_core_package, dcgm4_package]
package_url_separator = "."
end
packages_urls_list.each do |package|
remote_file "#{node['cluster']['sources_dir']}/#{package}-#{package_version}.rpm" do
source "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{package}-#{package_version}#{package_url_separator}#{arch_suffix}.rpm"
mode '0644'
retries 3
retry_delay 5
action :create_if_missing
end

bash "Install #{dcgm_package}" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-DCGM_INSTALL
set -e
yum install -y #{dcgm_package}-#{package_version}.rpm
DCGM_INSTALL
retries 3
retry_delay 5
bash "Install #{package}" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-DCGM_INSTALL
set -e
yum install -y #{package}-#{package_version}.rpm
DCGM_INSTALL
retries 3
retry_delay 5
end
end
end

def dcgm_url
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}-#{package_version}-1-#{arch_suffix}.rpm"
end

def dcgm_package
'datacenter-gpu-manager'
end

def dcgm4_package
"#{dcgm_package}-4-cuda12"
end

def dcgm4_core_package
"#{dcgm_package}-4-core"
end

def arch_suffix
arm_instance? ? 'aarch64' : 'x86_64'
end
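
The rpm provider needs the extra `package_url_separator` because the 3.x and 4.x artifacts join the architecture differently. With the versions pinned in this PR, the `source` interpolation above resolves to names like these (a small standalone sketch, not code from the resource itself):

```ruby
# Illustrative only: resolved .rpm artifact names for both DCGM generations on x86_64.
arch = 'x86_64'
{ '3.3.6-1' => ['datacenter-gpu-manager'],
  '4.2.3-2' => ['datacenter-gpu-manager-4-core', 'datacenter-gpu-manager-4-cuda12'] }.each do |version, packages|
  separator = version.start_with?('3.') ? '-' : '.'
  packages.each { |pkg| puts "#{pkg}-#{version}#{separator}#{arch}.rpm" }
end
# => datacenter-gpu-manager-3.3.6-1-x86_64.rpm
# => datacenter-gpu-manager-4-core-4.2.3-2.x86_64.rpm
# => datacenter-gpu-manager-4-cuda12-4.2.3-2.x86_64.rpm
```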
@@ -2,9 +2,9 @@

describe 'aws-parallelcluster-platform::cuda' do
cached(:cuda_version) { '12.8' }
cached(:cuda_patch) { '0' }
cached(:cuda_patch) { '1' }
cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" }
cached(:cuda_version_suffix) { '570.86.10' }
cached(:cuda_version_suffix) { '570.124.06' }

context 'when nvidia not enabled' do
cached(:chef_run) do
@@ -20,8 +20,8 @@
end

it 'fetches intel mpi installer script' do
is_expected.to create_remote_file("#{source_dir}/l_mpi_oneapi_p_2021.13.1.769_offline.sh").with(
source: "https://#{aws_region}-aws-parallelcluster.s3.#{aws_region}.test_aws_domain/archives/impi/l_mpi_oneapi_p_2021.13.1.769_offline.sh",
is_expected.to create_remote_file("#{source_dir}/intel-mpi-2021.16.0.443_offline.sh").with(
source: "https://#{aws_region}-aws-parallelcluster.s3.#{aws_region}.test_aws_domain/archives/impi/intel-mpi-2021.16.0.443_offline.sh",
mode: '0744',
retries: 3,
retry_delay: 5
@@ -31,25 +31,25 @@
it 'installs intel mpi' do
is_expected.to run_bash('install intel mpi').with(
cwd: source_dir,
creates: '/opt/intel/mpi/2021.13'
).with_code(%r{chmod +x l_mpi_oneapi_p_2021.13.1.769_offline.sh --remove-extracted-files yes -a --silent --eula accept --install-dir /opt/intel})
.with_code(/rm -f l_mpi_oneapi_p_2021.13.1.769_offline.sh/)
creates: '/opt/intel/mpi/2021.16'
).with_code(%r{chmod +x intel-mpi-2021.16.0.443_offline.sh --remove-extracted-files yes -a --silent --eula accept --install-dir /opt/intel})
.with_code(/rm -f intel-mpi-2021.16.0.443_offline.sh/)
end

it 'appends intel module file dir to modules config' do
is_expected.to append_to_config_modules('append intel modules file dir to modules conf')
.with_line('/opt/intel/mpi/2021.13/etc/modulefiles/')
.with_line('/opt/intel/mpi/2021.16/etc/modulefiles/')
end

it 'renames intel mpi module' do
is_expected.to run_execute('rename intel mpi modules file name').with(
command: "mv /opt/intel/mpi/2021.13/etc/modulefiles/mpi /opt/intel/mpi/2021.13/etc/modulefiles/intelmpi",
creates: '/opt/intel/mpi/2021.13/etc/modulefiles/intelmpi'
command: "mv /opt/intel/mpi/2021.16/etc/modulefiles/mpi /opt/intel/mpi/2021.16/etc/modulefiles/intelmpi",
creates: '/opt/intel/mpi/2021.16/etc/modulefiles/intelmpi'
)
end

it 'adds Qt source file' do
is_expected.to create_template("/opt/intel/mpi/2021.13/qt_source_code.txt").with(
is_expected.to create_template("/opt/intel/mpi/2021.16/qt_source_code.txt").with(
source: 'intel_mpi/qt_source_code.erb',
owner: 'root',
group: 'root',
@@ -170,7 +170,8 @@ def self.setup(chef_run, nvidia_enabled: nil)
end
else
it 'installs datacenter gpu manager' do
is_expected.to run_bash('Install datacenter-gpu-manager')
is_expected.to run_bash('Install datacenter-gpu-manager-4-core')
is_expected.to run_bash('Install datacenter-gpu-manager-4-cuda12')
end
end
end
@@ -14,8 +14,13 @@
['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !instance.custom_ami? &&
(!os_properties.arm? || !(os_properties.alinux2? || os_properties.centos?))
end

describe package('datacenter-gpu-manager') do
it { should be_installed }
if os_properties.alinux2?
describe package('datacenter-gpu-manager') do
it { should be_installed }
end
else
describe package('datacenter-gpu-manager-4-cuda12') do
it { should be_installed }
end
end
end
@@ -1,5 +1,5 @@
# Python Version
default['cluster']['python-version'] = '3.12.8'
default['cluster']['python-version'] = '3.12.11'
default['cluster']['python-major-minor-version'] = '3.12'
if platform?('amazon') && node['platform_version'] == "2"
default['cluster']['python-version'] = '3.9.20'
4 changes: 2 additions & 2 deletions cookbooks/aws-parallelcluster-slurm/attributes/versions.rb
@@ -1,8 +1,8 @@
# Slurm
default['cluster']['slurm']['version'] = '24-11-5-1'
default['cluster']['slurm']['version'] = '24-11-6-1'
default['cluster']['slurm']['commit'] = ''
default['cluster']['slurm']['branch'] = ''
default['cluster']['slurm']['sha256'] = 'e1a5547edd212c38b5e3230a284133f777b32746551f094aaa81cc4af375e332'
default['cluster']['slurm']['sha256'] = '282708483326f381eb001a14852a1a82e65e18f37b62b7a5f4936c0ed443b600'
default['cluster']['slurm']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/slurm"
# Munge
default['cluster']['munge']['munge_version'] = '0.5.16'
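
For reference, the bumped Slurm version and SHA-256 above drive the source download and integrity check for the Slurm build. A minimal sketch of how these attributes are typically consumed (the tarball naming is an assumption, not verified against the actual install recipe):

```ruby
# Illustrative only: fetch the pinned Slurm source and verify it against the new SHA-256.
slurm_version = node['cluster']['slurm']['version']          # '24-11-6-1'
slurm_tarball = "slurm-#{slurm_version}.tar.gz"              # assumed naming convention

remote_file "#{node['cluster']['sources_dir']}/#{slurm_tarball}" do
  source "#{node['cluster']['slurm']['base_url']}/#{slurm_tarball}"
  checksum node['cluster']['slurm']['sha256']
  mode '0644'
  retries 3
  retry_delay 5
  action :create_if_missing
end
```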