Skip to content
This repository was archived by the owner on Jun 29, 2022. It is now read-only.

Commit b0da2a1

Browse files
authored
Merge pull request #1502 from kinvolk/kai/bare-metal-reprovisioning
baremetal: integrate automated (re-)provisioning logic
2 parents 0876c88 + bbb13a6 commit b0da2a1

File tree

18 files changed

+324
-29
lines changed

18 files changed

+324
-29
lines changed

assets/terraform-modules/bare-metal/flatcar-linux/kubernetes/controller.tf

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,22 @@ module "controller" {
1313
set_standard_hostname = false
1414
clc_snippets = concat(lookup(var.clc_snippets, var.controller_names[count.index], []), [
1515
<<EOF
16+
filesystems:
17+
- name: root
18+
mount:
19+
device: /dev/disk/by-label/ROOT
20+
format: ext4
21+
wipe_filesystem: true
22+
label: ROOT
1623
storage:
1724
files:
25+
- path: /ignition_ran
26+
filesystem: root
27+
mode: 0644
28+
contents:
29+
inline: |
30+
Flag file indicating that Ignition ran.
31+
Should be deleted by the SSH step that checks it.
1832
- path: /etc/hostname
1933
filesystem: root
2034
mode: 0644
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,25 @@
11
module "controller_profile" {
22
source = "../../../matchbox-flatcar"
33
count = length(var.controller_names)
4+
asset_dir = var.asset_dir
45
node_name = var.controller_names[count.index]
56
node_mac = var.controller_macs[count.index]
7+
node_domain = var.controller_domains[count.index]
68
download_protocol = var.download_protocol
79
os_channel = var.os_channel
810
os_version = var.os_version
911
http_endpoint = var.matchbox_http_endpoint
1012
kernel_args = var.kernel_args
1113
kernel_console = var.kernel_console
14+
installer_clc_snippets = lookup(var.installer_clc_snippets, var.controller_names[count.index], [])
1215
install_disk = var.install_disk
1316
install_to_smallest_disk = var.install_to_smallest_disk
1417
container_linux_oem = var.container_linux_oem
1518
ssh_keys = var.ssh_keys
1619
ignition_clc_config = module.controller[count.index].clc_config
1720
cached_install = var.cached_install
1821
wipe_additional_disks = var.wipe_additional_disks
22+
ignore_changes = true
23+
pxe_commands = var.pxe_commands
24+
install_pre_reboot_cmds = var.install_pre_reboot_cmds
1925
}

assets/terraform-modules/bare-metal/flatcar-linux/kubernetes/ssh.tf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,12 @@ resource "null_resource" "copy-controller-secrets" {
8181
]
8282
}
8383

84+
85+
# Triggered when the Ignition Config changes (used to recreate a controller)
8486
triggers = {
87+
clc_config = module.controller[count.index].clc_config
88+
kernel_console = join(" ", var.kernel_console)
89+
kernel_args = join(" ", var.kernel_args)
8590
etcd_ca_cert = module.bootkube.etcd_ca_cert
8691
etcd_server_cert = module.bootkube.etcd_server_cert
8792
etcd_peer_cert = module.bootkube.etcd_peer_cert

assets/terraform-modules/bare-metal/flatcar-linux/kubernetes/variables.tf

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,12 @@ variable "clc_snippets" {
6161
default = {}
6262
}
6363

64+
variable "installer_clc_snippets" {
65+
type = map(list(string))
66+
description = "Map from machine names to lists of Container Linux Config snippets, applied for the PXE-booted installer OS"
67+
default = {}
68+
}
69+
6470
variable "labels" {
6571
type = map(string)
6672
description = "Map of labels for worker nodes."
@@ -221,3 +227,15 @@ variable "wipe_additional_disks" {
221227
description = "Wipes any additional disks attached, if set to true"
222228
default = false
223229
}
230+
231+
variable "pxe_commands" {
232+
type = string
233+
default = "echo 'you must (re)provision the node by booting via iPXE from http://MATCHBOX/boot.ipxe'; exit 1"
234+
description = "shell commands to execute for PXE (re)provisioning, with access to the variables $mac (the MAC address), $name (the node name), and $domain (the domain name), e.g., 'bmc=bmc-$domain; ipmitool -H $bmc power off; ipmitool -H $bmc chassis bootdev pxe; ipmitool -H $bmc power on'"
235+
}
236+
237+
variable "install_pre_reboot_cmds" {
238+
type = string
239+
default = "true"
240+
description = "shell commands to execute on the provisioned host after installation finished and before reboot, e.g., docker run --privileged --net host --rm debian sh -c 'apt update && apt install -y ipmitool && ipmitool chassis bootdev disk options=persistent'"
241+
}

assets/terraform-modules/bare-metal/flatcar-linux/kubernetes/worker.tf

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,22 @@ module "worker" {
1212
set_standard_hostname = false
1313
clc_snippets = concat(lookup(var.clc_snippets, var.worker_names[count.index], []), [
1414
<<EOF
15+
filesystems:
16+
- name: root
17+
mount:
18+
device: /dev/disk/by-label/ROOT
19+
format: ext4
20+
wipe_filesystem: true
21+
label: ROOT
1522
storage:
1623
files:
24+
- path: /ignition_ran
25+
filesystem: root
26+
mode: 0644
27+
contents:
28+
inline: |
29+
Flag file indicating that Ignition ran.
30+
Should be deleted by the SSH step that checks it.
1731
- path: /etc/hostname
1832
filesystem: root
1933
mode: 0644
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,24 @@
11
module "worker_profile" {
22
source = "../../../matchbox-flatcar"
33
count = length(var.worker_names)
4+
asset_dir = var.asset_dir
45
node_name = var.worker_names[count.index]
56
node_mac = var.worker_macs[count.index]
7+
node_domain = var.worker_domains[count.index]
68
download_protocol = var.download_protocol
79
os_channel = var.os_channel
810
os_version = var.os_version
911
http_endpoint = var.matchbox_http_endpoint
1012
kernel_args = var.kernel_args
1113
kernel_console = var.kernel_console
14+
installer_clc_snippets = lookup(var.installer_clc_snippets, var.worker_names[count.index], [])
1215
install_disk = var.install_disk
1316
install_to_smallest_disk = var.install_to_smallest_disk
1417
container_linux_oem = var.container_linux_oem
1518
ssh_keys = var.ssh_keys
1619
ignition_clc_config = module.worker[count.index].clc_config
1720
cached_install = var.cached_install
1821
wipe_additional_disks = var.wipe_additional_disks
22+
pxe_commands = var.pxe_commands
23+
install_pre_reboot_cmds = var.install_pre_reboot_cmds
1924
}

assets/terraform-modules/matchbox-flatcar/profiles.tf

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@ resource "matchbox_profile" "flatcar-install" {
1818
var.kernel_args,
1919
])
2020

21-
container_linux_config = templatefile("${path.module}/templates/install.yaml.tmpl", {
21+
raw_ignition = data.ct_config.install-ignitions.rendered
22+
}
23+
24+
data "ct_config" "install-ignitions" {
25+
content = templatefile("${path.module}/templates/install.yaml.tmpl", {
2226
os_channel = var.os_channel
2327
os_version = var.os_version
2428
ignition_endpoint = format("%s/ignition", var.http_endpoint)
@@ -29,9 +33,15 @@ resource "matchbox_profile" "flatcar-install" {
2933
kernel_console = join(" ", var.kernel_console)
3034
kernel_args = join(" ", var.kernel_args)
3135
wipe_additional_disks = var.wipe_additional_disks
36+
install_pre_reboot_cmds = var.install_pre_reboot_cmds
3237
# only cached-container-linux profile adds -b baseurl
3338
baseurl_flag = ""
39+
mac_address = var.node_mac
3440
})
41+
42+
pretty_print = false
43+
44+
snippets = var.installer_clc_snippets
3545
}
3646

3747
// Flatcar Container Linux Install profile (from matchbox /assets cache)
@@ -56,7 +66,11 @@ resource "matchbox_profile" "cached-flatcar-linux-install" {
5666
var.kernel_args,
5767
])
5868

59-
container_linux_config = templatefile("${path.module}/templates/install.yaml.tmpl", {
69+
raw_ignition = data.ct_config.cached-install-ignitions.rendered
70+
}
71+
72+
data "ct_config" "cached-install-ignitions" {
73+
content = templatefile("${path.module}/templates/install.yaml.tmpl", {
6074
os_channel = var.os_channel
6175
os_version = var.os_version
6276
ignition_endpoint = format("%s/ignition", var.http_endpoint)
@@ -67,9 +81,15 @@ resource "matchbox_profile" "cached-flatcar-linux-install" {
6781
kernel_console = join(" ", var.kernel_console)
6882
kernel_args = join(" ", var.kernel_args)
6983
wipe_additional_disks = var.wipe_additional_disks
84+
install_pre_reboot_cmds = var.install_pre_reboot_cmds
7085
# profile uses -b baseurl to install from matchbox cache
7186
baseurl_flag = "-b ${var.http_endpoint}/assets/flatcar"
87+
mac_address = var.node_mac
7288
})
89+
90+
pretty_print = false
91+
92+
snippets = var.installer_clc_snippets
7393
}
7494

7595
resource "matchbox_profile" "node" {
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# (executed in-line as the local-exec command, so a #!/... shebang line would be ignored)
2+
# Terraform template variable substitution:
3+
name=${name}
4+
domain=${domain}
5+
mac=${mac}
6+
asset_dir=${asset_dir}
7+
ignore_changes=${ignore_changes}
8+
kernel_args="${kernel_args}"
9+
kernel_console="${kernel_console}"
10+
ignition_endpoint="${ignition_endpoint}"
11+
# From now on use $var for dynamic shell substitution
12+
13+
if test -f "$asset_dir/$mac" && [ "$(cat "$asset_dir/$mac")" = "$domain" ]; then
14+
echo "found $asset_dir/$mac containing $domain, skipping PXE install"
15+
node_exists=yes
16+
else
17+
echo "$asset_dir/$mac does not contain $domain, forcing PXE install"
18+
node_exists=no
19+
fi
20+
21+
if [ $node_exists = yes ]; then
22+
if $ignore_changes ; then
23+
echo "Keeping old config because 'ignore_changes' is set."
24+
exit 0
25+
else
26+
# Run single commands that can be retried without side effects in case the connection gets disrupted.
27+
count=30
28+
while [ $count -gt 0 ] && ! ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o NumberOfPasswordPrompts=0 core@$domain sudo touch /boot/flatcar/first_boot; do
29+
sleep 1
30+
count=$((count - 1))
31+
done
32+
if [ $count -eq 0 ]; then
33+
echo "error reaching $domain via SSH, please remove the $asset_dir/$mac file to force a PXE install"
34+
exit 1
35+
fi
36+
echo "created the first_boot flag file to reprovision $domain"
37+
count=5
38+
while [ $count -gt 0 ] && ! ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o NumberOfPasswordPrompts=0 core@$domain "printf 'set linux_append=\"$kernel_args ignition.config.url=$ignition_endpoint?mac=$mac&os=installed\"\\nset linux_console=\"$kernel_console\"\\n' | sudo tee /usr/share/oem/grub.cfg"; do
39+
sleep 1
40+
count=$((count - 1))
41+
done
42+
if [ $count -eq 0 ]; then
43+
echo "error reaching $domain via SSH, please retry"
44+
exit 1
45+
fi
46+
count=5
47+
while [ $count -gt 0 ] && ! ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o NumberOfPasswordPrompts=0 core@$domain sudo systemctl reboot; do
48+
sleep 1
49+
count=$((count - 1))
50+
done
51+
if [ $count -eq 0 ]; then
52+
echo "error reaching $domain via SSH, please reboot manually"
53+
exit 1
54+
fi
55+
echo "rebooted the $domain"
56+
fi
57+
else
58+
# the user may provide ipmitool commands or any other logic for forcing a PXE boot
59+
${pxe_commands}
60+
fi
61+
62+
echo "checking that $domain comes up"
63+
count=600
64+
# Check that we can reach the node and that it has the flag file (which we remove below). The flag file indicates that a reboot actually happened, which prevents a race when issuing the reboot takes longer (in both the systemctl reboot case and the PXE case).
65+
# In case the connection breaks and SSH reports an error code even though the command executed successfully, we first check that the file exists and then delete it with "rm -f", so that both commands can safely be rerun.
66+
# This sequence gives us the same error reporting as just running "rm" once.
67+
while [ $count -gt 0 ] && ! ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o NumberOfPasswordPrompts=0 core@$domain test -f /ignition_ran; do
68+
sleep 1
69+
count=$((count - 1))
70+
done
71+
if [ $count -eq 0 ]; then
72+
echo "error: failed verifying with SSH if $domain came up by checking the /ignition_ran flag file"
73+
exit 1
74+
fi
75+
count=5
76+
while [ $count -gt 0 ] && ! ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o NumberOfPasswordPrompts=0 core@$domain sudo rm -f /ignition_ran; do
77+
sleep 1
78+
count=$((count - 1))
79+
done
80+
if [ $count -eq 0 ]; then
81+
echo "error: failed to remove the /ignition_ran flag file on $domain"
82+
exit 1
83+
else
84+
echo "$domain came up again"
85+
fi
86+
# Only write the state file once the system is up; this allows rerunning lokoctl if the first PXE boot did not work, in which case the PXE install is attempted again.
87+
echo $domain > "$asset_dir/$mac"
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
resource "null_resource" "reprovision-node-when-ignition-changes" {
2+
# Triggered when the Ignition Config changes
3+
triggers = {
4+
ignition_config = matchbox_profile.node.raw_ignition
5+
kernel_args = join(" ", var.kernel_args)
6+
kernel_console = join(" ", var.kernel_console)
7+
}
8+
# Wait for the new Ignition config object to be ready before rebooting
9+
depends_on = [matchbox_group.node]
10+
# Trigger running Ignition on the next reboot (first_boot flag file) and reboot the instance, or, if the instance needs to be (re)provisioned, run external commands for PXE booting (also runs on the first provisioning)
11+
provisioner "local-exec" {
12+
command = templatefile("${path.module}/pxe-helper.sh.tmpl", { domain = var.node_domain, name = var.node_name, mac = var.node_mac, pxe_commands = var.pxe_commands, asset_dir = var.asset_dir, kernel_args = join(" ", var.kernel_args), kernel_console = join(" ", var.kernel_console), ignition_endpoint = format("%s/ignition", var.http_endpoint), ignore_changes = var.ignore_changes })
13+
}
14+
}

assets/terraform-modules/matchbox-flatcar/templates/install.yaml.tmpl

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ storage:
4949
wipefs -f -a "$${disk}" || echo "error: failed to wipe $${disk}"
5050
done
5151
%{~ endif ~}
52-
curl --retry 10 "${ignition_endpoint}?{{.request.raw_query}}&os=installed" -o ignition.json
5352
flatcar-install \
5453
%{~ if install_to_smallest_disk ~}
5554
-s \
@@ -59,16 +58,16 @@ storage:
5958
-C ${os_channel} \
6059
-V ${os_version} \
6160
-o "${container_linux_oem}" \
62-
${baseurl_flag} \
63-
-i ignition.json
61+
${baseurl_flag}
6462
udevadm settle
6563
OEM_DEV="$(blkid -t "LABEL=OEM" -o device)"
6664
mkdir -p /tmp/oemfs
6765
mount "$${OEM_DEV}" /tmp/oemfs
6866
# append to file on newly created partition, do not remove the defaults
69-
echo 'set linux_append="${kernel_args}"' >> /tmp/oemfs/grub.cfg
67+
echo 'set linux_append="${kernel_args} ignition.config.url=${ignition_endpoint}?mac=${mac_address}&os=installed"' >> /tmp/oemfs/grub.cfg
7068
echo 'set linux_console="${kernel_console}"' >> /tmp/oemfs/grub.cfg
7169
umount /tmp/oemfs
70+
${install_pre_reboot_cmds}
7271
systemctl reboot
7372
passwd:
7473
users:

0 commit comments

Comments
 (0)