From 24c0804a19cd49f83c02fc90d8c263780d4a04a5 Mon Sep 17 00:00:00 2001 From: Alistair Smith Date: Thu, 9 Apr 2026 14:28:44 -0700 Subject: [PATCH 01/15] shard sema and codegen across N llvm modules - ZIG_PARALLEL_SEMA: Sema runs concurrently across worker threads with per-unit claim/wait, retry-on-dependency-cycle, and per-map mutexes replacing the global sema_lock for the non-incremental fast path. - InternPool: thread-safe writers (locked single-field setters, seqlock on getNav, sorted-shard prelocking for getFunc*Ies, 256 hash shards). - llvm backend: PartitionSet emits N independent llvm modules in parallel; cross-shard refs are linkonce_odr; --llvm-codegen-threads=N partitions by file path; --llvm-no-merge-shards leaves shard .o files unmerged. - link.MachO -r: handle N shard inputs; emit hidden defs as private-extern; convert tentatives so Apple ld_new accepts the merged object. - link.Elf: handle N shard inputs; batch preads in writeRelocatable to avoid per-atom syscall storm under heavy COMDAT section counts. - link.Lld: pass all shard paths to lld for elf/coff/wasm. - std.Build.Step.Compile: llvm_codegen_threads, llvm_no_merge_shards. --- .gitignore | 1 + build.zig | 17 +- lib/std/Build/Step/Compile.zig | 8 + src/Air.zig | 1 + src/Air/types_resolved.zig | 27 +- src/Compilation.zig | 291 ++++++++++++++++- src/InternPool.zig | 558 ++++++++++++++++++++++++++++----- src/Sema.zig | 424 ++++++++++++++++++++----- src/Type.zig | 208 ++++++++++-- src/Zcu.zig | 374 +++++++++++++++++++++- src/Zcu/PerThread.zig | 458 +++++++++++++++++++-------- src/codegen/llvm.zig | 479 +++++++++++++++++++++++++--- src/link.zig | 24 ++ src/link/Elf/AtomList.zig | 72 ++++- src/link/Lld.zig | 51 +-- src/link/MachO.zig | 56 ++-- src/link/MachO/Object.zig | 9 +- src/link/MachO/Symbol.zig | 2 +- src/link/MachO/file.zig | 7 +- src/link/MachO/relocatable.zig | 17 +- src/main.zig | 14 + src/target.zig | 6 +- src/zig_llvm.cpp | 204 ++++++------ 23 files changed, 2735 insertions(+), 573 deletions(-) diff --git a/.gitignore b/.gitignore index 7e9e15820297..5fb4854a4a33 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ zig-out/ # Although this was renamed to .zig-cache, let's leave it here for a few # releases to make it less annoying to work with multiple branches. zig-cache/ +bun-cache/ diff --git a/build.zig b/build.zig index 745d8070f83a..ea743617405f 100644 --- a/build.zig +++ b/build.zig @@ -123,6 +123,11 @@ pub fn build(b: *std.Build) !void { "llvm-has-xtensa", "Whether LLVM has the experimental target xtensa enabled", ) orelse false; + const llvm_has_polly = b.option( + bool, + "llvm-has-polly", + "Whether LLVM was built with Polly and requires linking it", + ) orelse false; const enable_ios_sdk = b.option(bool, "enable-ios-sdk", "Run tests requiring presence of iOS SDK and frameworks") orelse false; const enable_macos_sdk = b.option(bool, "enable-macos-sdk", "Run tests requiring presence of macOS SDK and frameworks") orelse enable_ios_sdk; const enable_symlinks_windows = b.option(bool, "enable-symlinks-windows", "Run tests requiring presence of symlinks on Windows") orelse false; @@ -332,6 +337,7 @@ pub fn build(b: *std.Build) !void { .llvm_has_csky = llvm_has_csky, .llvm_has_arc = llvm_has_arc, .llvm_has_xtensa = llvm_has_xtensa, + .llvm_has_polly = llvm_has_polly, }); } if (target.result.os.tag == .windows) { @@ -739,7 +745,7 @@ fn addCompilerMod(b: *std.Build, options: AddCompilerModOptions) *std.Build.Modu fn addCompilerStep(b: *std.Build, options: AddCompilerModOptions) *std.Build.Step.Compile { const exe = b.addExecutable(.{ .name = "zig", - .max_rss = 10_000_000_000, + .max_rss = 11_000_000_000, .root_module = addCompilerMod(b, options), }); exe.stack_size = stack_size; @@ -858,6 +864,7 @@ fn addStaticLlvmOptionsToModule(mod: *std.Build.Module, options: struct { llvm_has_csky: bool, llvm_has_arc: bool, llvm_has_xtensa: bool, + llvm_has_polly: bool, }) !void { // Adds the Zig C++ sources which both stage1 and stage2 need. // @@ -898,6 +905,10 @@ fn addStaticLlvmOptionsToModule(mod: *std.Build.Module, options: struct { mod.linkSystemLibrary(lib_name, .{}); }; + if (options.llvm_has_polly) for (llvm_libs_polly) |lib_name| { + mod.linkSystemLibrary(lib_name, .{}); + }; + mod.linkSystemLibrary("z", .{}); mod.linkSystemLibrary("zstd", .{}); @@ -1419,6 +1430,10 @@ const llvm_libs_xtensa = [_][]const u8{ "LLVMXtensaDesc", "LLVMXtensaInfo", }; +const llvm_libs_polly = [_][]const u8{ + "Polly", + "PollyISL", +}; fn generateLangRef(b: *std.Build) std.Build.LazyPath { const doctest_exe = b.addExecutable(.{ diff --git a/lib/std/Build/Step/Compile.zig b/lib/std/Build/Step/Compile.zig index fc23d2da389a..b9cc970264f4 100644 --- a/lib/std/Build/Step/Compile.zig +++ b/lib/std/Build/Step/Compile.zig @@ -162,6 +162,11 @@ dead_strip_dylibs: bool = false, /// When enabled, outputs multiple .o files: filename.0.o, filename.1.o, etc. llvm_codegen_threads: u32 = 0, +/// Skip the relocatable -r merge of partitioned LLVM output. The shard +/// objects are emitted directly to `{emit}.{i}.o` for the downstream linker +/// to consume. Only meaningful when `llvm_codegen_threads > 1`. +llvm_no_merge_shards: bool = false, + /// Skip linker step for build-obj - outputs raw LLVM object file(s). /// Saves time by avoiding parse/resolve/write cycle. no_link_obj: bool = false, @@ -1532,6 +1537,9 @@ fn getZigArgs(compile: *Compile, fuzz: bool) ![][]const u8 { if (compile.llvm_codegen_threads > 0) { try zig_args.append(b.fmt("--llvm-codegen-threads={d}", .{compile.llvm_codegen_threads})); } + if (compile.llvm_no_merge_shards) { + try zig_args.append("--llvm-no-merge-shards"); + } if (compile.no_link_obj) { try zig_args.append("--no-link"); } diff --git a/src/Air.zig b/src/Air.zig index 77080386384d..97dcc52c44b9 100644 --- a/src/Air.zig +++ b/src/Air.zig @@ -2154,6 +2154,7 @@ pub fn unwrapShuffleTwo(air: *const Air, zcu: *const Zcu, inst_index: Inst.Index } pub const typesFullyResolved = types_resolved.typesFullyResolved; +pub const resolveTypesFully = types_resolved.resolveTypesFully; pub const typeFullyResolved = types_resolved.checkType; pub const valFullyResolved = types_resolved.checkVal; pub const legalize = Legalize.legalize; diff --git a/src/Air/types_resolved.zig b/src/Air/types_resolved.zig index 44669b82df87..8c4c69fa9c63 100644 --- a/src/Air/types_resolved.zig +++ b/src/Air/types_resolved.zig @@ -10,6 +10,21 @@ pub fn typesFullyResolved(air: Air, zcu: *Zcu) bool { return checkBody(air, air.getMainBody(), zcu); } +/// Under parallel Sema, `resolve_type_fully` and `codegen_func` run +/// concurrently, so types may be mid-resolution rather than failed. Walk the +/// same AIR shape as `typesFullyResolved` but force-resolve each struct/union +/// (blocking on `claimOrWait`-gated resolution). Returns false only if +/// resolution itself errors. +pub fn resolveTypesFully(air: Air, pt: Zcu.PerThread) bool { + tls_resolve_pt = pt; + defer tls_resolve_pt = null; + return checkBody(air, air.getMainBody(), pt.zcu); +} + +/// `checkType` is reached via a long instruction walk; thread the optional +/// PerThread via tls instead of plumbing it through every switch arm. +threadlocal var tls_resolve_pt: ?Zcu.PerThread = null; + fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { const tags = air.instructions.items(.tag); const datas = air.instructions.items(.data); @@ -513,6 +528,10 @@ pub fn checkType(ty: Type, zcu: *Zcu) bool { }, .@"struct" => switch (ip.indexToKey(ty.toIntern())) { .struct_type => { + if (tls_resolve_pt) |pt| { + ty.resolveFully(pt) catch return false; + return true; + } const struct_obj = zcu.typeToStruct(ty).?; return switch (struct_obj.layout) { .@"packed" => struct_obj.backingIntTypeUnordered(ip) != .none, @@ -530,6 +549,12 @@ pub fn checkType(ty: Type, zcu: *Zcu) bool { }, else => unreachable, }, - .@"union" => return zcu.typeToUnion(ty).?.flagsUnordered(ip).status == .fully_resolved, + .@"union" => { + if (tls_resolve_pt) |pt| { + ty.resolveFully(pt) catch return false; + return true; + } + return zcu.typeToUnion(ty).?.flagsUnordered(ip).status == .fully_resolved; + }, }; } diff --git a/src/Compilation.zig b/src/Compilation.zig index 4f4362459c3b..b0519296e51b 100644 --- a/src/Compilation.zig +++ b/src/Compilation.zig @@ -43,6 +43,7 @@ const Zir = std.zig.Zir; const Air = @import("Air.zig"); const Builtin = @import("Builtin.zig"); const LlvmObject = @import("codegen/llvm.zig").Object; +const LlvmPartitionSet = @import("codegen/llvm.zig").PartitionSet; const dev = @import("dev.zig"); const DeprecatedLinearFifo = @import("deprecated.zig").LinearFifo; @@ -125,6 +126,8 @@ work_queues: [ break :len len; } ]DeprecatedLinearFifo(Job), +/// Protects `work_queues` when Sema runs on worker threads and calls `queueJob`. +work_queue_mutex: std.Thread.Mutex = .{}, /// These jobs are to invoke the Clang compiler to create an object file, which /// gets linked with the Compilation. @@ -265,7 +268,12 @@ link_prog_node: std.Progress.Node = std.Progress.Node.none, llvm_opt_bisect_limit: c_int, llvm_codegen_threads: u32, +llvm_shard_stats: bool, no_link_obj: bool, +/// When true, the N shard `.o` files emitted by partitioned LLVM codegen are +/// left as-is (no relocatable -r merge). They land at `{emit}.{i}.o` next to +/// the would-be merged output. The downstream linker consumes them directly. +no_merge_shards: bool, time_report: ?TimeReport, @@ -1729,7 +1737,9 @@ pub const CreateOptions = struct { linker_print_map: bool = false, llvm_opt_bisect_limit: i32 = -1, llvm_codegen_threads: u32 = 0, + llvm_shard_stats: bool = false, no_link_obj: bool = false, + llvm_no_merge_shards: bool = false, build_id: ?std.zig.BuildId = null, disable_c_depfile: bool = false, linker_z_nodelete: bool = false, @@ -2298,7 +2308,15 @@ pub fn create(gpa: Allocator, arena: Allocator, diag: *CreateDiagnostic, options .framework_dirs = options.framework_dirs, .llvm_opt_bisect_limit = options.llvm_opt_bisect_limit, .llvm_codegen_threads = options.llvm_codegen_threads, - .no_link_obj = options.no_link_obj, + .llvm_shard_stats = options.llvm_shard_stats, + // Partitioned LLVM output produces N objects which must be merged + // by the linker for a single-.o result, so the no-link shortcut + // does not apply unless `--llvm-no-merge-shards` is also set, in + // which case the N shard `.o` files are emitted directly to the + // final location and the relocatable merge is skipped entirely. + .no_link_obj = options.no_link_obj and + (options.llvm_codegen_threads <= 1 or options.llvm_no_merge_shards), + .no_merge_shards = options.llvm_no_merge_shards and options.llvm_codegen_threads > 1, .skip_linker_dependencies = options.skip_linker_dependencies, .queued_jobs = .{}, .function_sections = options.function_sections, @@ -2506,7 +2524,16 @@ pub fn create(gpa: Allocator, arena: Allocator, diag: *CreateDiagnostic, options if (use_llvm) { if (opt_zcu) |zcu| { - zcu.llvm_object = try LlvmObject.create(arena, comp); + // Multi-shard emission only supports producing N object files + // for the linker; IR/BC/asm requests for a single output would + // silently drop shards 1..N. Clamp to 1 in that case. + const single_artifact_only = options.emit_bin == .no and + (options.emit_llvm_ir != .no or options.emit_llvm_bc != .no or options.emit_asm != .no); + const n_shards: u32 = if (options.llvm_codegen_threads <= 1 or single_artifact_only) + 1 + else + options.llvm_codegen_threads; + zcu.llvm_object = try LlvmPartitionSet.create(arena, comp, n_shards); } } @@ -3129,7 +3156,13 @@ pub fn update(comp: *Compilation, main_progress_node: std.Progress.Node) UpdateE try pt.populateTestFunctions(); } + comp.phaseTimingC("update.processExports.start"); try pt.processExports(); + comp.phaseTimingC("update.processExports.done"); + } + + if (comp.llvm_shard_stats or std.process.hasNonEmptyEnvVarConstant("ZIG_JOB_STATS")) { + comp.dumpLlvmShardStats(zcu); } if (build_options.enable_debug_extensions and comp.verbose_intern_pool) { @@ -3267,6 +3300,65 @@ pub fn update(comp: *Compilation, main_progress_node: std.Progress.Node) UpdateE } } +fn dumpLlvmShardStats(comp: *Compilation, zcu: *Zcu) void { + const ip = &zcu.intern_pool; + const n: u32 = if (comp.llvm_codegen_threads > 1) comp.llvm_codegen_threads else 16; + var counts = [_]u32{0} ** 256; + var top_file = [_]?*Zcu.File{null} ** 256; + var top_file_count = [_]u32{0} ** 256; + + var per_file = std.AutoHashMap(*Zcu.File, u32).init(comp.gpa); + defer per_file.deinit(); + + const total_navs = ip.navCount(); + var skipped: u32 = 0; + var i: u32 = 0; + while (i < total_navs) : (i += 1) { + const nav_index = ip.navIndexFromOrdinal(i); + const nav = ip.getNav(nav_index); + if (nav.status == .unresolved) { + skipped += 1; + continue; + } + const fqn = nav.fqn.toSlice(ip); + const shard: u8 = @intCast(std.hash.Wyhash.hash(0, fqn) % n); + counts[shard] += 1; + const file = zcu.fileByIndex(nav.srcInst(ip).resolveFile(ip)); + const gop = per_file.getOrPut(file) catch continue; + if (!gop.found_existing) gop.value_ptr.* = 0; + gop.value_ptr.* += 1; + if (gop.value_ptr.* > top_file_count[shard]) { + top_file_count[shard] = gop.value_ptr.*; + top_file[shard] = file; + } + } + + var min: u32 = std.math.maxInt(u32); + var max: u32 = 0; + var nonempty: u32 = 0; + for (counts[0..n]) |c| { + if (c == 0) continue; + nonempty += 1; + min = @min(min, c); + max = @max(max, c); + } + std.debug.print("llvm-shard-stats for '{s}': n={d} navs={d} skipped={d} nonempty_shards={d}\n", .{ + comp.root_name, n, total_navs - skipped, skipped, nonempty, + }); + for (counts[0..n], 0..) |c, s| { + if (c == 0) continue; + var buf: [512]u8 = undefined; + const key = if (top_file[s]) |f| f.shardKey(&buf) else ""; + std.debug.print(" shard {d:>3}: {d:>6} navs (top file '{s}' = {d})\n", .{ + s, c, key, top_file_count[s], + }); + } + if (min != std.math.maxInt(u32)) { + const ratio = @as(f64, @floatFromInt(max)) / @as(f64, @floatFromInt(min)); + std.debug.print(" max/min ratio: {d:.2} (max={d}, min={d})\n", .{ ratio, max, min }); + } +} + pub fn appendFileSystemInput(comp: *Compilation, path: Compilation.Path) Allocator.Error!void { const gpa = comp.gpa; const fsi = comp.file_system_inputs orelse return; @@ -3336,6 +3428,7 @@ fn flush( arena: Allocator, tid: Zcu.PerThread.Id, ) Allocator.Error!void { + comp.phaseTimingC("flush.start"); if (comp.zcu) |zcu| { if (zcu.llvm_object) |llvm_object| { const pt: Zcu.PerThread = .activate(zcu, tid); @@ -3364,8 +3457,8 @@ fn flush( }; // Generate parallel codegen output filenames if enabled - const bin_path_list: ?[]const [*:0]const u8 = if (comp.llvm_codegen_threads > 1 and base_bin_path != null) blk: { - const num_threads = comp.llvm_codegen_threads; + const bin_path_list: ?[]const [*:0]const u8 = if (llvm_object.n > 1 and base_bin_path != null) blk: { + const num_threads = llvm_object.n; const list = try arena.alloc([*:0]const u8, num_threads); const base_path_slice = std.mem.sliceTo(base_bin_path.?, 0); @@ -3414,6 +3507,7 @@ fn flush( error.LinkFailure => {}, // Already reported. error.OutOfMemory => return error.OutOfMemory, }; + comp.phaseTimingC("flush.llvm_emit_done"); } } if (comp.bin_file) |lf| { @@ -3430,7 +3524,14 @@ fn flush( error.LinkFailure => {}, // Already reported. error.OutOfMemory => return error.OutOfMemory, }; + } else if (comp.no_merge_shards) { + // Shard objects went to `{emit}.{i}.o`; the 0-byte stub the linker + // created at `{emit}` during open() will never be flushed. Remove + // it so downstream build systems globbing `{emit}.*.o` aren't + // confused by an empty object alongside the real shards. + lf.emit.root_dir.handle.deleteFile(lf.emit.sub_path) catch {}; } + comp.phaseTimingC("flush.lf_flush_done"); } if (comp.zcu) |zcu| { try link.File.C.flushEmitH(zcu); @@ -4629,10 +4730,20 @@ pub fn unableToLoadZcuFile( }); } +pub fn phaseTiming(label: []const u8) void { + if (!std.process.hasNonEmptyEnvVarConstant("ZIG_PHASE_TIMING")) return; + std.debug.print("[PHASE] {d} - {s}\n", .{ std.time.milliTimestamp(), label }); +} +fn phaseTimingC(comp: *const Compilation, label: []const u8) void { + if (!std.process.hasNonEmptyEnvVarConstant("ZIG_PHASE_TIMING")) return; + std.debug.print("[PHASE] {d} {s} {s}\n", .{ std.time.milliTimestamp(), comp.root_name, label }); +} + fn performAllTheWork( comp: *Compilation, main_progress_node: std.Progress.Node, ) JobError!void { + comp.phaseTimingC("performAllTheWork.start"); // Regardless of errors, `comp.zcu` needs to update its generation number. defer if (comp.zcu) |zcu| { zcu.generation += 1; @@ -4657,8 +4768,10 @@ fn performAllTheWork( var work_queue_wait_group: WaitGroup = .{}; defer work_queue_wait_group.wait(); + defer comp.phaseTimingC("performAllTheWork.codegen_wait_done"); comp.link_task_wait_group.reset(); defer comp.link_task_wait_group.wait(); + defer comp.phaseTimingC("performAllTheWork.work_loop_done"); // Already-queued prelink tasks comp.link_prog_node.increaseEstimatedTotalItems(comp.link_task_queue.queued_prelink.items.len); @@ -5059,13 +5172,62 @@ fn performAllTheWork( // Start the timer for the "decls" part of the pipeline (Sema, CodeGen, link). decl_work_timer = comp.startTimer(); } + comp.phaseTimingC("performAllTheWork.work_loop_start"); + if (comp.zcu) |zcu| { + // Sub-compilations (compiler_rt, ubsan_rt, etc.) and the build runner + // are small and gain nothing from parallel Sema. For `zig build`, the + // runner is `root_mod` (main_mod is the user's build.zig). + const is_build_runner = std.mem.endsWith(u8, zcu.root_mod.root_src_path, "build_runner.zig"); + zcu.parallel_sema = comp.parent_whole_cache == null and + !is_build_runner and + std.process.hasNonEmptyEnvVarConstant("ZIG_PARALLEL_SEMA"); + } + + var job_ns: [@typeInfo(Job.Tag).@"enum".fields.len]u64 = @splat(0); + var job_ct: [@typeInfo(Job.Tag).@"enum".fields.len]u64 = @splat(0); + var export_func_pass: u8 = 0; work: while (true) { - for (&comp.work_queues) |*work_queue| if (work_queue.readItem()) |job| { + const maybe_job: ?Job = job: { + comp.work_queue_mutex.lock(); + defer comp.work_queue_mutex.unlock(); + for (&comp.work_queues) |*work_queue| if (work_queue.readItem()) |job| break :job job; + break :job null; + }; + if (maybe_job) |job| { + if (comp.zcu) |zcu| if (zcu.parallel_sema and job == .analyze_func) { + // Skip dispatch if a worker already holds this unit (or it has + // since been analyzed) — re-queues from the retry path can + // produce duplicate analyze_func jobs and N-1 workers then + // condvar-wait on the one analyzer. + const a = zcu.intern_pool.funcAnalysisUnordered(job.analyze_func); + if (a.is_analyzed) continue :work; + _ = zcu.sema_pending_jobs.rmw(.Add, 1, .acquire); + comp.thread_pool.spawnWgId(&comp.link_task_wait_group, workerAnalyzeFunc, .{ comp, job.analyze_func }); + continue :work; + }; + const t0 = if (comp.llvm_shard_stats or std.process.hasNonEmptyEnvVarConstant("ZIG_JOB_STATS")) std.time.nanoTimestamp() else 0; try processOneJob(@intFromEnum(Zcu.PerThread.Id.main), comp, job); + if (comp.llvm_shard_stats or std.process.hasNonEmptyEnvVarConstant("ZIG_JOB_STATS")) { + job_ns[@intFromEnum(@as(Job.Tag, job))] += @intCast(std.time.nanoTimestamp() - t0); + job_ct[@intFromEnum(@as(Job.Tag, job))] += 1; + } continue :work; - }; + } if (comp.zcu) |zcu| { + if (zcu.sema_pending_jobs.load(.acquire) > 0) { + std.Thread.yield() catch {}; + continue :work; + } + // A worker may have enqueued between our queue read and the + // counter dropping to zero; re-check the queues before exiting. + const drained = drained: { + comp.work_queue_mutex.lock(); + defer comp.work_queue_mutex.unlock(); + for (&comp.work_queues) |*q| if (q.count > 0) break :drained false; + break :drained true; + }; + if (!drained) continue :work; // If there's no work queued, check if there's anything outdated // which we need to work on, and queue it if so. if (try zcu.findOutdatedToAnalyze()) |outdated| { @@ -5080,16 +5242,43 @@ fn performAllTheWork( }); continue; } + // Final pass under parallel Sema: any exported function whose body + // analysis was dropped by a post-commit retry will not be in + // `nav_map` at processExports time. Re-queue here so the work loop + // drains it before we exit. + if (zcu.parallel_sema and export_func_pass < 3) { + export_func_pass += 1; + var any_queued = false; + for (zcu.single_exports.values()) |idx| { + any_queued = ensureExportFuncQueued(zcu, idx) or any_queued; + } + for (zcu.multi_exports.values()) |info| { + for (info.index..info.index + info.len) |i| { + any_queued = ensureExportFuncQueued(zcu, @enumFromInt(i)) or any_queued; + } + } + if (any_queued) continue; + } zcu.sema_prog_node.end(); zcu.sema_prog_node = .none; } break; } + if (comp.zcu) |zcu| zcu.parallel_sema = false; + if (comp.llvm_shard_stats or std.process.hasNonEmptyEnvVarConstant("ZIG_JOB_STATS")) { + std.debug.print("=== work loop job timings (main thread) ===\n", .{}); + inline for (@typeInfo(Job.Tag).@"enum".fields, 0..) |f, i| { + if (job_ct[i] != 0) + std.debug.print(" {s:>24}: {d:>6}ms ({d} jobs)\n", .{ f.name, job_ns[i] / 1_000_000, job_ct[i] }); + } + } } const JobError = Allocator.Error; pub fn queueJob(comp: *Compilation, job: Job) !void { + comp.work_queue_mutex.lock(); + defer comp.work_queue_mutex.unlock(); try comp.work_queues[Job.stage(job)].writeItem(job); } @@ -5108,7 +5297,20 @@ fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void { comp.link_prog_node.completeOne(); air.deinit(gpa); } - if (!air.typesFullyResolved(zcu)) { + // Under serial Sema, FIFO dispatch guarantees every + // `resolve_type_fully` queued before this body's analysis has + // completed, so `typesFullyResolved == false` means the type + // *failed*. Under parallel Sema both job kinds run concurrently — + // a struct or union may simply be mid-resolution. Dropping the + // body would leave a dangling cross-shard `__N