Skip to content

Commit 8977e19

Browse files
Merge pull request #27 from NVIDIA/magzhang/0.7.0-updates
0.7.0 - Updated to k8s 1.29 and GPU Operator v23.9.2
2 parents b388bcd + 59b7fcd commit 8977e19

File tree

8 files changed

+33
-30
lines changed

8 files changed

+33
-30
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Each CSP has its own end of life date for the versions of Kubernetes they suppor
2424

2525
| Version | Release Date | Kubernetes Versions | NVIDIA GPU Operator | NVIDIA Data Center Driver* | End of Life |
2626
| :--- | :--- | :--- | :--- | :--- | :--- |
27+
| 0.7.0 | April 2024 | EKS - 1.29 <br> GKE - 1.29 <br> AKS - 1.29 | 23.9.2 (Default); 23.9.2 (NV AI E) | 550.54.15 (EKS & GKE Default); 550.54.15 (NV AI E version for GKE & EKS) | EKS - Mar 2025 <br> GKE - Mar 2025 <br> AKS - Not Specified |
2728
| 0.6.0 | January 2024 | EKS - 1.28 <br> GKE - 1.28 <br> AKS - 1.28 | 23.9.1 (Default); 23.9.0 (NV AI E) | 535.129.03 (EKS & GKE Default); 535.129.03 (NV AI E version for GKE & EKS) | EKS - Nov 2024 <br> GKE - Nov 2024 <br> AKS - Nov 2024 |
2829
| 0.5.0 | November 2023 | EKS - 1.27 <br> GKE - 1.27 <br> AKS - 1.27 | 23.6.1 (Default); 23.3.2 (NV AI E) | 535.104.05 (EKS & GKE Default); 525.125.06 (NV AI E version for GKE & EKS) | EKS - July 2024 <br> GKE - August 2024 <br> AKS - July 2024 |
2930
| 0.4.0 | October 2023 | EKS - 1.27 <br> GKE - 1.27 <br> AKS - 1.27 | 23.6.1 (Default); 23.3.2 (NV AI E) | 535.104.05 (EKS & GKE Default); 525.125.06 (NV AI E version for GKE & EKS) | EKS - July 2024 <br> GKE - August 2024 <br> AKS - July 2024 |

aks/examples/cnpack/README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,15 @@
2424
- Add `fluentbit_workspace_name`. This will create an Azure Log Analytics Workspace with the specified name.
2525
- Add `prometheus_name`. This will create an Azure Monitor Workspace with the specified name.
2626

27-
2. Run `terraform plan -out tfplan` and validate that the output is correct
27+
2. Run `terraform init`
2828

29-
3. Run `terraform apply tfplan`
29+
3. Run `terraform plan -out tfplan` and validate that the output is correct
3030

31-
4. The `terraform output` of this module can be used immediately within the configuration file of CNPack
31+
4. Run `terraform apply tfplan`
3232

33-
5. Run `terraform destroy` to delete all resources created by this module.
33+
5. The `terraform output` of this module can be used immediately within the configuration file of CNPack
34+
35+
6. Once you're done, first remove the GPU Operator resources from the Terraform state by running `terraform state rm module.holoscan-ready-aks.helm_release.gpu-operator` and `terraform state rm module.holoscan-ready-aks.kubernetes_namespace_v1.gpu-operator`. Then run `terraform destroy` to delete all remaining resources created by this module.
3436

3537
**Note**
3638
The `log_analytics_workspace_primary_shared_key` used for Fluentbit is a sensitive variable and should be protected like a password.

aks/terraform.tfvars

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
# gpu_node_pool_max_count = 5
2020
# gpu_node_pool_min_count = 2
2121
# gpu_operator_namespace = "gpu-operator"
22-
# gpu_operator_version = "v23.9.1"
22+
# gpu_operator_version = "v23.9.2"
2323
# gpu_os_sku = "Ubuntu"
24-
# kubernetes_version = "1.28"
24+
# kubernetes_version = "1.29"
2525
# location = ""
2626
# nvaie = false
27-
# nvaie_gpu_operator_version = "v23.9.0"
27+
# nvaie_gpu_operator_version = "v23.9.2"

aks/variables.tf

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ variable "cluster_name" {
2525
}
2626

2727
variable "kubernetes_version" {
28-
default = "1.28"
28+
default = "1.29"
2929
description = "Version of Kubernetes to turn on. Run 'az aks get-versions --location <location> --output table' to view all available versions "
3030
}
3131

@@ -87,7 +87,7 @@ variable "gpu_os_sku" {
8787
GPU Operator Variables
8888
****************************/
8989
variable "gpu_operator_version" {
90-
default = "v23.9.1"
90+
default = "v23.9.2"
9191
description = "Version of the GPU operator to be installed"
9292
}
9393

@@ -105,7 +105,7 @@ variable "nvaie" {
105105

106106
variable "nvaie_gpu_operator_version" {
107107
type = string
108-
default = "v23.9.0"
108+
default = "v23.9.2"
109109
description = "The NVIDIA Driver version of GPU Operator. Overrides `gpu_operator_version` when `nvaie` is set to `true`"
110110
}
111111

eks/terraform.tfvars

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# aws_profile = "development"
1111
# cidr_block = "10.0.0.0/16"
1212
# cluster_name = ""
13-
# cluster_version = "1.28"
13+
# cluster_version = "1.29"
1414
# cpu_instance_type = "t2.xlarge"
1515
# cpu_node_pool_additional_user_data = ""
1616
# cpu_node_pool_delete_on_termination = true
@@ -28,16 +28,16 @@
2828
# gpu_node_pool_delete_on_termination = true
2929
# gpu_node_pool_root_disk_size_gb = 512
3030
# gpu_node_pool_root_volume_type = "gp2"
31-
# gpu_operator_driver_version = "535.129.03"
31+
# gpu_operator_driver_version = "550.54.15"
3232
# gpu_operator_namespace = "gpu-operator"
33-
# gpu_operator_version = "v23.9.1"
33+
# gpu_operator_version = "v23.9.2"
3434
# max_cpu_nodes = "2"
3535
# max_gpu_nodes = "5"
3636
# min_cpu_nodes = "0"
3737
# min_gpu_nodes = "2"
3838
# nvaie = false
39-
# nvaie_gpu_operator_driver_version = "535.129.03"
40-
# nvaie_gpu_operator_version = "v23.9.0"
39+
# nvaie_gpu_operator_driver_version = "550.54.15"
40+
# nvaie_gpu_operator_version = "v23.9.2"
4141
# private_subnets = [
4242
# "10.0.0.0/19",
4343
# "10.0.32.0/19",

eks/variables.tf

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,20 +28,20 @@ variable "cluster_name" {
2828

2929
variable "cluster_version" {
3030
type = string
31-
default = "1.28"
31+
default = "1.29"
3232
description = "Version of EKS to install on the control plane (Major and Minor version only, do not include the patch)"
3333
}
3434
/************************
3535
GPU Operator Variables
3636
*************************/
3737
variable "gpu_operator_version" {
38-
default = "v23.9.1"
38+
default = "v23.9.2"
3939
description = "Version of the GPU Operator to deploy. Defaults to latest available. Not set when `nvaie` is set to `true`"
4040
}
4141

4242
variable "gpu_operator_driver_version" {
4343
type = string
44-
default = "535.129.03"
44+
default = "550.54.15"
4545
description = "The NVIDIA Driver version deployed with GPU Operator. Defaults to latest available. Not set when `nvaie` is set to true"
4646
}
4747

@@ -59,13 +59,13 @@ variable "nvaie" {
5959

6060
variable "nvaie_gpu_operator_version" {
6161
type = string
62-
default = "v23.9.0"
62+
default = "v23.9.2"
6363
description = "The NVIDIA Driver version of GPU Operator. Overrides `gpu_operator_version` when `nvaie` is set to `true`"
6464
}
6565

6666
variable "nvaie_gpu_operator_driver_version" {
6767
type = string
68-
default = "535.129.03"
68+
default = "550.54.15"
6969
description = "The NVIDIA AI Enterprise version of the NVIDIA driver to be installed with the GPU operator. Overrides `gpu_operator_driver_version` when `nvaie` is set to `true`"
7070
}
7171
/*****************************

gke/terraform.tfvars

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,18 @@
1414
# gpu_instance_type = "n1-standard-4"
1515
# gpu_max_node_count = "5"
1616
# gpu_min_node_count = "2"
17-
# gpu_operator_driver_version = "535.129.03"
17+
# gpu_operator_driver_version = "550.54.15"
1818
# gpu_operator_namespace = "gpu-operator"
19-
# gpu_operator_version = "v23.9.1"
19+
# gpu_operator_version = "v23.9.2"
2020
# gpu_type = "nvidia-tesla-v100"
21-
# min_master_version = "1.28"
21+
# min_master_version = "1.29"
2222
# network = ""
2323
# node_zones = ""
2424
# num_cpu_nodes = 1
2525
# num_gpu_nodes = 2
2626
# nvaie = false
27-
# nvaie_gpu_operator_driver_version = "535.129.03"
28-
# nvaie_gpu_operator_version = "v23.9.0"
27+
# nvaie_gpu_operator_driver_version = "550.54.15"
28+
# nvaie_gpu_operator_version = "v23.9.2"
2929
# project_id = ""
3030
# region = ""
3131
# release_channel = "REGULAR"

gke/variables.tf

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ variable "release_channel" {
4949
}
5050

5151
variable "min_master_version" {
52-
default = "1.28"
52+
default = "1.29"
5353
description = "The minimum cluster version of the master."
5454
}
5555

@@ -133,13 +133,13 @@ variable "disk_size_gb" {
133133
GPU Operator Variables
134134
***************************/
135135
variable "gpu_operator_version" {
136-
default = "v23.9.1"
136+
default = "v23.9.2"
137137
description = "Version of the GPU Operator to deploy. Defaults to latest available. Not set when `nvaie` is set to `true`"
138138
}
139139

140140
variable "gpu_operator_driver_version" {
141141
type = string
142-
default = "535.129.03"
142+
default = "550.54.15"
143143
description = "The NVIDIA Driver version deployed with GPU Operator. Defaults to latest available. Not set when `nvaie` is set to true"
144144
}
145145

@@ -157,12 +157,12 @@ variable "nvaie" {
157157

158158
variable "nvaie_gpu_operator_version" {
159159
type = string
160-
default = "v23.9.0"
160+
default = "v23.9.2"
161161
description = "The NVIDIA Driver version of GPU Operator. Overrides `gpu_operator_version` when `nvaie` is set to `true`"
162162
}
163163

164164
variable "nvaie_gpu_operator_driver_version" {
165165
type = string
166-
default = "535.129.03"
166+
default = "550.54.15"
167167
description = "The NVIDIA AI Enterprise version of the NVIDIA driver to be installed with the GPU operator. Overrides `gpu_operator_driver_version` when `nvaie` is set to `true`"
168168
}

0 commit comments

Comments
 (0)