Skip to content

Commit 9a79ff4

Browse files
committed
[1.3] libct: reset CPU affinity by default
In certain deployments, it's possible for runc to be spawned by a process with a restrictive cpumask (such as from a systemd unit with CPUAffinity=... configured) which will be inherited by runc and thus the container process by default. The cpuset cgroup used to reconfigure the cpumask automatically for joining processes, but kernel commit da019032819a ("sched: Enforce user requested affinity") changed this behaviour in Linux 6.2. The solution is to try to emulate the expected behaviour by resetting our cpumask to correspond with the configured cpuset (in the case of "runc exec", if the user did not configure an alternative one). Normally we would have to parse /proc/stat and /sys/fs/cgroup, but luckily sched_setaffinity(2) will transparently convert an all-set cpumask (even if it has more entries than the number of CPUs on the system) to the correct value for our use case. For some reason, in our CI it seems that rootless --systemd-cgroup results in the cpuset (presumably temporarily?) being configured such that sched_setaffinity(2) will allow the full set of CPUs. For this particular case, all we care about is that it is different to the original set, so include some special-casing (but we should probably investigate this further...). Reported-by: ningmingxiao <[email protected]> Reported-by: Martin Sivak <[email protected]> Reported-by: Peter Hunt <[email protected]> Signed-off-by: Aleksa Sarai <[email protected]> (Cherry-pick of commit 121192a.) Signed-off-by: Aleksa Sarai <[email protected]>
1 parent ae7eefd commit 9a79ff4

File tree

3 files changed

+165
-1
lines changed

3 files changed

+165
-1
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [Unreleased 1.3.z]
88

9+
### Fixed
10+
* Container processes will no longer inherit the CPU affinity of runc by
11+
default. Instead, the default CPU affinity of container processes will be
12+
the largest set of CPUs permitted by the container's cpuset cgroup and any
13+
other system restrictions (such as isolated CPUs). (#4041, #4815, #4858)
14+
915
## [1.3.0] - 2025-04-30
1016

1117
> Mr. President, we must not allow a mine shaft gap!

libcontainer/process_linux.go

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,46 @@ type setnsProcess struct {
163163
initProcessPid int
164164
}
165165

166+
// tryResetCPUAffinity tries to reset the CPU affinity of the process
167+
// identified by pid to include all possible CPUs (notwithstanding cgroup
168+
// cpuset restrictions and isolated CPUs).
//
// This is best-effort: if sched_setaffinity(2) fails, the error is only
// logged as a warning and nothing is reported back to the caller (the
// function has no return value).
169+
func tryResetCPUAffinity(pid int) {
170+
// When resetting the CPU affinity, we want to match the configured cgroup
171+
// cpuset (or the default set of all CPUs, if no cpuset is configured)
172+
// rather than some more restrictive affinity we were spawned in (such as
173+
// one that may have been inherited from systemd). The cpuset cgroup used
174+
// to reconfigure the cpumask automatically for joining processes, but
175+
// kernel commit da019032819a ("sched: Enforce user requested affinity")
176+
// changed this behaviour in Linux 6.2.
177+
//
178+
// Parsing cpuset.cpus.effective is quite inefficient (and looking at
179+
// things like /proc/stat would be wrong for most nested containers), but
180+
// luckily sched_setaffinity(2) will implicitly:
181+
//
182+
// * Clamp the cpumask so that it matches the current number of CPUs on
183+
// the system.
184+
// * Mask out any CPUs that are not a member of the target task's
185+
// configured cgroup cpuset.
186+
//
187+
// So we can just pass a very large array of set cpumask bits and the
188+
// kernel will silently convert that to the correct value very cheaply.
189+
190+
// Ideally, we would just set the array to 0xFF...FF. Unfortunately, the
191+
// size depends on the architecture. It is also a private newtype, so we
192+
// can't use (^0) or generics since those require us to be able to name the
193+
// type. However, we can just decrement each zero-valued element so that it
// wraps around to all-ones instead.
194+
// TODO: Once <https://golang.org/cl/698015> is merged, switch to that.
195+
cpuset := unix.CPUSet{}
196+
for i := range cpuset {
197+
cpuset[i]-- // wraps around from 0 to 0xFF..FF
198+
}
199+
// Failure is deliberately non-fatal: wrap the errno with the syscall name
// for context and emit a warning rather than aborting.
if err := unix.SchedSetaffinity(pid, &cpuset); err != nil {
200+
logrus.WithError(
201+
os.NewSyscallError("sched_setaffinity", err),
202+
).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid)
203+
}
204+
}
205+
166206
// Starts setns process with specified initial CPU affinity.
167207
func (p *setnsProcess) startWithCPUAffinity() error {
168208
aff := p.config.CPUAffinity
@@ -193,7 +233,13 @@ func (p *setnsProcess) startWithCPUAffinity() error {
193233

194234
func (p *setnsProcess) setFinalCPUAffinity() error {
195235
aff := p.config.CPUAffinity
196-
if aff == nil || aff.Final == nil {
236+
// If there was no affinity configured at all, we want to reset
237+
// the affinity to make sure we don't inherit an unexpected one.
238+
if aff == nil || aff.Final == nil && aff.Initial == nil {
239+
tryResetCPUAffinity(p.pid())
240+
return nil
241+
}
242+
if aff.Final == nil {
197243
return nil
198244
}
199245
if err := unix.SchedSetaffinity(p.pid(), aff.Final); err != nil {
@@ -619,6 +665,9 @@ func (p *initProcess) start() (retErr error) {
619665
return fmt.Errorf("unable to apply cgroup configuration: %w", err)
620666
}
621667
}
668+
// Reset the CPU affinity after cgroups are configured to make sure it
669+
// matches any configured cpuset.
670+
tryResetCPUAffinity(p.pid())
622671
if p.intelRdtManager != nil {
623672
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
624673
return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)

tests/integration/cpu_affinity.bats

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,14 @@
44

55
load helpers
66

7+
INITIAL_CPU_MASK="$(grep -F Cpus_allowed_list: /proc/self/status | awk '{ print $2 }')"
8+
79
function setup() {
810
requires smp cgroups_cpuset
911
setup_busybox
12+
13+
echo "Initial CPU mask: $INITIAL_CPU_MASK" >&2
14+
echo "---" >&2
1015
}
1116

1217
function teardown() {
@@ -99,3 +104,107 @@ function cpus_to_mask() {
99104
[[ "$output" == *"nsexec"*": affinity: $mask"* ]]
100105
[[ "$output" == *"Cpus_allowed_list: $final"* ]] # Mind the literal tab.
101106
}
107+
108+
@test "runc run [CPU affinity should reset]" {
109+
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
110+
# bash function (which is what runc and __runc are).
111+
setup_runc_cmdline
112+
113+
first="$(first_cpu)"
114+
115+
# Running without cpuset should result in an affinity for all CPUs.
116+
update_config '.process.args = [ "/bin/grep", "-F", "Cpus_allowed_list:", "/proc/self/status" ]'
117+
update_config 'del(.linux.resources.cpu)'
118+
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run ctr
119+
[ "$status" -eq 0 ]
120+
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
121+
[[ "$output" == $'Cpus_allowed_list:\t'"$INITIAL_CPU_MASK" ]]
122+
}
123+
124+
@test "runc run [CPU affinity should reset to cgroup cpuset]" {
125+
[ $EUID -ne 0 ] && requires rootless_cgroup
126+
set_cgroups_path
127+
128+
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
129+
# bash function (which is what runc and __runc are).
130+
setup_runc_cmdline
131+
132+
first="$(first_cpu)"
133+
second="$((first + 1))" # Hacky; might not work in all environments.
134+
135+
# Running with a cpuset should result in an affinity that matches.
136+
update_config '.process.args = [ "/bin/grep", "-F", "Cpus_allowed_list:", "/proc/self/status" ]'
137+
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$first-$second"'"}'
138+
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run ctr
139+
[ "$status" -eq 0 ]
140+
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
141+
# XXX: For some reason, systemd-cgroup leads to us using the all-set
142+
# cpumask rather than the cpuset we configured?
143+
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$first-$second" ]]
144+
145+
# Ditto for a cpuset that has no overlap with the original cpumask.
146+
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$second"'"}'
147+
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run ctr
148+
[ "$status" -eq 0 ]
149+
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
150+
# XXX: For some reason, systemd-cgroup leads to us using the all-set
151+
# cpumask rather than the cpuset we configured?
152+
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$second" ]]
153+
}
154+
155+
@test "runc exec [default CPU affinity should reset]" {
156+
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
157+
# bash function (which is what runc and __runc are).
158+
setup_runc_cmdline
159+
160+
first="$(first_cpu)"
161+
162+
# Running without cpuset should result in an affinity for all CPUs.
163+
update_config '.process.args = [ "/bin/sleep", "infinity" ]'
164+
update_config 'del(.linux.resources.cpu)'
165+
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run -d --console-socket "$CONSOLE_SOCKET" ctr3
166+
[ "$status" -eq 0 ]
167+
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" exec ctr3 grep -F Cpus_allowed_list: /proc/self/status
168+
[ "$status" -eq 0 ]
169+
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
170+
[[ "$output" == $'Cpus_allowed_list:\t'"$INITIAL_CPU_MASK" ]]
171+
}
172+
173+
@test "runc exec [default CPU affinity should reset to cgroup cpuset]" {
174+
[ $EUID -ne 0 ] && requires rootless_cgroup
175+
set_cgroups_path
176+
177+
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
178+
# bash function (which is what runc and __runc are).
179+
setup_runc_cmdline
180+
181+
first="$(first_cpu)"
182+
second="$((first + 1))" # Hacky; might not work in all environments.
183+
184+
# Running with a cpuset should result in an affinity that matches.
185+
update_config '.process.args = [ "/bin/sleep", "infinity" ]'
186+
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$first-$second"'"}'
187+
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run -d --console-socket "$CONSOLE_SOCKET" ctr
188+
[ "$status" -eq 0 ]
189+
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" exec ctr grep -F Cpus_allowed_list: /proc/self/status
190+
[ "$status" -eq 0 ]
191+
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
192+
# XXX: For some reason, systemd-cgroup leads to us using the all-set
193+
# cpumask rather than the cpuset we configured?
194+
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$first-$second" ]]
195+
196+
# Stop the container so we can reconfigure it.
197+
runc delete -f ctr
198+
[ "$status" -eq 0 ]
199+
200+
# Ditto for a cpuset that has no overlap with the original cpumask.
201+
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$second"'"}'
202+
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run -d --console-socket "$CONSOLE_SOCKET" ctr
203+
[ "$status" -eq 0 ]
204+
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" exec ctr grep -F Cpus_allowed_list: /proc/self/status
205+
[ "$status" -eq 0 ]
206+
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
207+
# XXX: For some reason, systemd-cgroup leads to us using the all-set
208+
# cpumask rather than the cpuset we configured?
209+
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$second" ]]
210+
}

0 commit comments

Comments
 (0)