Binary file added cmd/cli/.main.go.swp
Binary file not shown.
4 changes: 2 additions & 2 deletions pkg/provisioner/provisioner.go
@@ -93,8 +93,8 @@ func (p *Provisioner) waitForNodeReboot() error {
 }
 
 	// Wait for the node to come back up
-	maxRetries := 30
-	retryInterval := 10 * time.Second
+	maxRetries := 10
+	retryInterval := 30 * time.Second
 
 	for i := 0; i < maxRetries; i++ {
 		p.log.Info("Waiting for node to come back online...")
46 changes: 27 additions & 19 deletions pkg/provisioner/templates/kubernetes.go
@@ -101,25 +101,28 @@ sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
 sudo chown $(id -u):$(id -g) $HOME/.kube/config
 export KUBECONFIG="${HOME}/.kube/config"
 
+# Wait explicitly for kube-apiserver availability
+with_retry 10 30s kubectl --kubeconfig $KUBECONFIG version
+
 # Install Calico
 # based on https://docs.tigera.io/calico/latest/getting-started/kubernetes/quickstart
-with_retry 10 20s kubectl --kubeconfig $KUBECONFIG create -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/tigera-operator.yaml
+with_retry 10 30s kubectl --kubeconfig $KUBECONFIG create -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/tigera-operator.yaml
 
 # Wait for Tigera operator to be ready
-with_retry 10 20s kubectl --kubeconfig $KUBECONFIG wait --for=condition=available --timeout=300s deployment/tigera-operator -n tigera-operator
+with_retry 10 30s kubectl --kubeconfig $KUBECONFIG wait --for=condition=available --timeout=300s deployment/tigera-operator -n tigera-operator
 
 # Wait for all necessary CRDs to be established
-with_retry 10 20s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/installations.operator.tigera.io
-with_retry 10 20s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/apiservers.operator.tigera.io
-with_retry 10 20s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/tigerastatuses.operator.tigera.io
+with_retry 10 30s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/installations.operator.tigera.io
+with_retry 10 30s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/apiservers.operator.tigera.io
+with_retry 10 30s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/tigerastatuses.operator.tigera.io
 
 # Apply custom resources with increased retry attempts
-with_retry 10 20s kubectl --kubeconfig $KUBECONFIG apply -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/custom-resources.yaml
+with_retry 10 30s kubectl --kubeconfig $KUBECONFIG apply -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/custom-resources.yaml
 
 # Make single-node cluster schedulable
-kubectl taint nodes --all node-role.kubernetes.io/control-plane:NoSchedule-
-kubectl label node --all node-role.kubernetes.io/worker=
-kubectl label node --all nvidia.com/holodeck.managed=true
+with_retry 10 30s kubectl taint nodes --all node-role.kubernetes.io/control-plane:NoSchedule-
+with_retry 10 30s kubectl label node --all node-role.kubernetes.io/worker=
+with_retry 10 30s kubectl label node --all nvidia.com/holodeck.managed=true
 
 # Wait for cluster to be ready
 with_retry 10 30s kubectl --kubeconfig $KUBECONFIG wait --for=condition=ready --timeout=300s nodes --all
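Every guarded step above goes through the template's with_retry helper, whose definition is not part of this diff; the calls follow the pattern `with_retry <attempts> <sleep> <command...>`. A minimal sketch of such a helper, offered as an assumption about its shape rather than the project's actual code:

# Hypothetical sketch of a with_retry helper; the real definition lives
# elsewhere in the template and may differ.
with_retry() {
    local max_attempts="$1"
    local delay="$2"
    shift 2

    local attempt=1
    until "$@"; do
        if [ "$attempt" -ge "$max_attempts" ]; then
            echo "with_retry: '$*' failed after ${max_attempts} attempts" >&2
            return 1
        fi
        echo "with_retry: attempt ${attempt}/${max_attempts} failed, retrying in ${delay}" >&2
        sleep "$delay"
        attempt=$((attempt + 1))
    done
}

Note that the PR standardizes on 10 attempts spaced 30s apart, so each step gets up to about four and a half minutes of sleep time between attempts, on top of kubectl's own --timeout where one is set.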
@@ -168,11 +171,14 @@ echo "ssh -i <your-private-key> ubuntu@${INSTANCE_ENDPOINT_HOST}"

 const microk8sTemplate = `
 : ${INSTANCE_ENDPOINT_HOST:={{.K8sEndpointHost}}}
+: ${K8S_VERSION:={{.Version}}}
 
+# Remove leading 'v' from version if present for microk8s snap channel
+MICROK8S_VERSION="${K8S_VERSION#v}"
 
 # Install microk8s
 sudo apt-get update
 
-sudo snap install microk8s --classic --channel={{.Version}}
+sudo snap install microk8s --classic --channel=${MICROK8S_VERSION}
 sudo microk8s enable gpu dashboard dns registry
 sudo usermod -a -G microk8s ubuntu
 mkdir -p ~/.kube
@@ -181,7 +187,7 @@
 sudo microk8s config > ~/.kube/config
 sudo chown -f -R ubuntu ~/.kube
 sudo snap alias microk8s.kubectl kubectl
 
-echo "Microk8s {{.Version}} installed successfully"
+echo "Microk8s ${MICROK8S_VERSION} installed successfully"
 echo "you can now access the cluster with:"
 echo "ssh -i <your-private-key> ubuntu@${INSTANCE_ENDPOINT_HOST}"
`
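The `${K8S_VERSION#v}` expansion introduced above strips one leading 'v' so that values like v1.32 become valid snap channel names, while already-bare versions pass through unchanged. A quick illustration with hypothetical values:

# "#v" deletes a leading 'v' if present; otherwise the value is unchanged.
K8S_VERSION="v1.32"
echo "${K8S_VERSION#v}"   # prints: 1.32

K8S_VERSION="1.32"
echo "${K8S_VERSION#v}"   # prints: 1.32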
@@ -269,14 +275,16 @@ type KubeadmConfig struct {
 }
 
 func NewKubernetes(env v1alpha1.Environment) (*Kubernetes, error) {
-	kubernetes := &Kubernetes{
-		Version: env.Spec.Kubernetes.KubernetesVersion,
-	}
-	// check if env.Spec.Kubernetes.KubernetesVersion is in the format of vX.Y.Z
-	// if not, set the default version
-	if !strings.HasPrefix(env.Spec.Kubernetes.KubernetesVersion, "v") && env.Spec.Kubernetes.KubernetesInstaller != "microk8s" {
-		fmt.Printf("Kubernetes version %s is not in the format of vX.Y.Z, setting default version v1.32.1\n", env.Spec.Kubernetes.KubernetesVersion)
+	kubernetes := &Kubernetes{}
+
+	// Normalize Kubernetes version: always ensure it starts with 'v'
+	switch {
+	case env.Spec.Kubernetes.KubernetesVersion == "":
+		kubernetes.Version = defaultKubernetesVersion
+	case !strings.HasPrefix(env.Spec.Kubernetes.KubernetesVersion, "v"):
+		kubernetes.Version = "v" + env.Spec.Kubernetes.KubernetesVersion
+	default:
+		kubernetes.Version = env.Spec.Kubernetes.KubernetesVersion
 	}
 	if env.Spec.Kubernetes.KubeletReleaseVersion != "" {
 		kubernetes.KubeletReleaseVersion = env.Spec.Kubernetes.KubeletReleaseVersion