From 24c0804a19cd49f83c02fc90d8c263780d4a04a5 Mon Sep 17 00:00:00 2001
From: Alistair Smith <hi@alistair.sh>
Date: Thu, 9 Apr 2026 14:28:44 -0700
Subject: [PATCH 01/15] shard sema and codegen across N llvm modules

- ZIG_PARALLEL_SEMA: Sema runs concurrently across worker threads with
  per-unit claim/wait, retry-on-dependency-cycle, and per-map mutexes
  replacing the global sema_lock for the non-incremental fast path.
- InternPool: thread-safe writers (locked single-field setters, seqlock
  on getNav, sorted-shard prelocking for getFunc*Ies, 256 hash shards).
- llvm backend: PartitionSet emits N independent llvm modules in parallel;
  cross-shard refs are linkonce_odr; --llvm-codegen-threads=N partitions
  by file path; --llvm-no-merge-shards leaves shard .o files unmerged.
- link.MachO -r: handle N shard inputs; emit hidden defs as private-extern;
  convert tentatives so Apple ld_new accepts the merged object.
- link.Elf: handle N shard inputs; batch preads in writeRelocatable to
  avoid per-atom syscall storm under heavy COMDAT section counts.
- link.Lld: pass all shard paths to lld for elf/coff/wasm.
- std.Build.Step.Compile: llvm_codegen_threads, llvm_no_merge_shards.
---
 .gitignore                     |   1 +
 build.zig                      |  17 +-
 lib/std/Build/Step/Compile.zig |   8 +
 src/Air.zig                    |   1 +
 src/Air/types_resolved.zig     |  27 +-
 src/Compilation.zig            | 291 ++++++++++++++++-
 src/InternPool.zig             | 558 ++++++++++++++++++++++++++++-----
 src/Sema.zig                   | 424 ++++++++++++++++++++-----
 src/Type.zig                   | 208 ++++++++++--
 src/Zcu.zig                    | 374 +++++++++++++++++++++-
 src/Zcu/PerThread.zig          | 458 +++++++++++++++++++--------
 src/codegen/llvm.zig           | 479 +++++++++++++++++++++++++---
 src/link.zig                   |  24 ++
 src/link/Elf/AtomList.zig      |  72 ++++-
 src/link/Lld.zig               |  51 +--
 src/link/MachO.zig             |  56 ++--
 src/link/MachO/Object.zig      |   9 +-
 src/link/MachO/Symbol.zig      |   2 +-
 src/link/MachO/file.zig        |   7 +-
 src/link/MachO/relocatable.zig |  17 +-
 src/main.zig                   |  14 +
 src/target.zig                 |   6 +-
 src/zig_llvm.cpp               | 204 ++++++------
 23 files changed, 2735 insertions(+), 573 deletions(-)

diff --git a/.gitignore b/.gitignore
index 7e9e15820297..5fb4854a4a33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,4 @@ zig-out/
 # Although this was renamed to .zig-cache, let's leave it here for a few
 # releases to make it less annoying to work with multiple branches.
 zig-cache/
+bun-cache/
diff --git a/build.zig b/build.zig
index 745d8070f83a..ea743617405f 100644
--- a/build.zig
+++ b/build.zig
@@ -123,6 +123,11 @@ pub fn build(b: *std.Build) !void {
         "llvm-has-xtensa",
         "Whether LLVM has the experimental target xtensa enabled",
     ) orelse false;
+    const llvm_has_polly = b.option(
+        bool,
+        "llvm-has-polly",
+        "Whether LLVM was built with Polly and requires linking it",
+    ) orelse false;
     const enable_ios_sdk = b.option(bool, "enable-ios-sdk", "Run tests requiring presence of iOS SDK and frameworks") orelse false;
     const enable_macos_sdk = b.option(bool, "enable-macos-sdk", "Run tests requiring presence of macOS SDK and frameworks") orelse enable_ios_sdk;
     const enable_symlinks_windows = b.option(bool, "enable-symlinks-windows", "Run tests requiring presence of symlinks on Windows") orelse false;
@@ -332,6 +337,7 @@ pub fn build(b: *std.Build) !void {
                 .llvm_has_csky = llvm_has_csky,
                 .llvm_has_arc = llvm_has_arc,
                 .llvm_has_xtensa = llvm_has_xtensa,
+                .llvm_has_polly = llvm_has_polly,
             });
         }
         if (target.result.os.tag == .windows) {
@@ -739,7 +745,7 @@ fn addCompilerMod(b: *std.Build, options: AddCompilerModOptions) *std.Build.Modu
 fn addCompilerStep(b: *std.Build, options: AddCompilerModOptions) *std.Build.Step.Compile {
     const exe = b.addExecutable(.{
         .name = "zig",
-        .max_rss = 10_000_000_000,
+        .max_rss = 11_000_000_000,
         .root_module = addCompilerMod(b, options),
     });
     exe.stack_size = stack_size;
@@ -858,6 +864,7 @@ fn addStaticLlvmOptionsToModule(mod: *std.Build.Module, options: struct {
     llvm_has_csky: bool,
     llvm_has_arc: bool,
     llvm_has_xtensa: bool,
+    llvm_has_polly: bool,
 }) !void {
     // Adds the Zig C++ sources which both stage1 and stage2 need.
     //
@@ -898,6 +905,10 @@ fn addStaticLlvmOptionsToModule(mod: *std.Build.Module, options: struct {
         mod.linkSystemLibrary(lib_name, .{});
     };
 
+    if (options.llvm_has_polly) for (llvm_libs_polly) |lib_name| {
+        mod.linkSystemLibrary(lib_name, .{});
+    };
+
     mod.linkSystemLibrary("z", .{});
     mod.linkSystemLibrary("zstd", .{});
 
@@ -1419,6 +1430,10 @@ const llvm_libs_xtensa = [_][]const u8{
     "LLVMXtensaDesc",
     "LLVMXtensaInfo",
 };
+const llvm_libs_polly = [_][]const u8{
+    "Polly",
+    "PollyISL",
+};
 
 fn generateLangRef(b: *std.Build) std.Build.LazyPath {
     const doctest_exe = b.addExecutable(.{
diff --git a/lib/std/Build/Step/Compile.zig b/lib/std/Build/Step/Compile.zig
index fc23d2da389a..b9cc970264f4 100644
--- a/lib/std/Build/Step/Compile.zig
+++ b/lib/std/Build/Step/Compile.zig
@@ -162,6 +162,11 @@ dead_strip_dylibs: bool = false,
 /// When enabled, outputs multiple .o files: filename.0.o, filename.1.o, etc.
 llvm_codegen_threads: u32 = 0,
 
+/// Skip the relocatable -r merge of partitioned LLVM output. The shard
+/// objects are emitted directly to `{emit}.{i}.o` for the downstream linker
+/// to consume. Only meaningful when `llvm_codegen_threads > 1`.
+llvm_no_merge_shards: bool = false,
+
 /// Skip linker step for build-obj - outputs raw LLVM object file(s).
 /// Saves time by avoiding parse/resolve/write cycle.
 no_link_obj: bool = false,
@@ -1532,6 +1537,9 @@ fn getZigArgs(compile: *Compile, fuzz: bool) ![][]const u8 {
     if (compile.llvm_codegen_threads > 0) {
         try zig_args.append(b.fmt("--llvm-codegen-threads={d}", .{compile.llvm_codegen_threads}));
     }
+    if (compile.llvm_no_merge_shards) {
+        try zig_args.append("--llvm-no-merge-shards");
+    }
     if (compile.no_link_obj) {
         try zig_args.append("--no-link");
     }
diff --git a/src/Air.zig b/src/Air.zig
index 77080386384d..97dcc52c44b9 100644
--- a/src/Air.zig
+++ b/src/Air.zig
@@ -2154,6 +2154,7 @@ pub fn unwrapShuffleTwo(air: *const Air, zcu: *const Zcu, inst_index: Inst.Index
 }
 
 pub const typesFullyResolved = types_resolved.typesFullyResolved;
+pub const resolveTypesFully = types_resolved.resolveTypesFully;
 pub const typeFullyResolved = types_resolved.checkType;
 pub const valFullyResolved = types_resolved.checkVal;
 pub const legalize = Legalize.legalize;
diff --git a/src/Air/types_resolved.zig b/src/Air/types_resolved.zig
index 44669b82df87..8c4c69fa9c63 100644
--- a/src/Air/types_resolved.zig
+++ b/src/Air/types_resolved.zig
@@ -10,6 +10,21 @@ pub fn typesFullyResolved(air: Air, zcu: *Zcu) bool {
     return checkBody(air, air.getMainBody(), zcu);
 }
 
+/// Under parallel Sema, `resolve_type_fully` and `codegen_func` run
+/// concurrently, so types may be mid-resolution rather than failed. Walk the
+/// same AIR shape as `typesFullyResolved` but force-resolve each struct/union
+/// (blocking on `claimOrWait`-gated resolution). Any `resolveFully` error,
+/// including OOM, is reported as false. `tls_resolve_pt` is null again on return.
+pub fn resolveTypesFully(air: Air, pt: Zcu.PerThread) bool {
+    tls_resolve_pt = pt;
+    defer tls_resolve_pt = null;
+    return checkBody(air, air.getMainBody(), pt.zcu);
+}
+
+/// `checkType` is reached via a long instruction walk; thread the optional
+/// PerThread via tls instead of plumbing it through every switch arm.
+threadlocal var tls_resolve_pt: ?Zcu.PerThread = null;
+
 fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
     const tags = air.instructions.items(.tag);
     const datas = air.instructions.items(.data);
@@ -513,6 +528,10 @@ pub fn checkType(ty: Type, zcu: *Zcu) bool {
         },
         .@"struct" => switch (ip.indexToKey(ty.toIntern())) {
             .struct_type => {
+                if (tls_resolve_pt) |pt| {
+                    ty.resolveFully(pt) catch return false;
+                    return true;
+                }
                 const struct_obj = zcu.typeToStruct(ty).?;
                 return switch (struct_obj.layout) {
                     .@"packed" => struct_obj.backingIntTypeUnordered(ip) != .none,
@@ -530,6 +549,12 @@ pub fn checkType(ty: Type, zcu: *Zcu) bool {
             },
             else => unreachable,
         },
-        .@"union" => return zcu.typeToUnion(ty).?.flagsUnordered(ip).status == .fully_resolved,
+        .@"union" => {
+            if (tls_resolve_pt) |pt| {
+                ty.resolveFully(pt) catch return false;
+                return true;
+            }
+            return zcu.typeToUnion(ty).?.flagsUnordered(ip).status == .fully_resolved;
+        },
     };
 }
diff --git a/src/Compilation.zig b/src/Compilation.zig
index 4f4362459c3b..b0519296e51b 100644
--- a/src/Compilation.zig
+++ b/src/Compilation.zig
@@ -43,6 +43,7 @@ const Zir = std.zig.Zir;
 const Air = @import("Air.zig");
 const Builtin = @import("Builtin.zig");
 const LlvmObject = @import("codegen/llvm.zig").Object;
+const LlvmPartitionSet = @import("codegen/llvm.zig").PartitionSet;
 const dev = @import("dev.zig");
 
 const DeprecatedLinearFifo = @import("deprecated.zig").LinearFifo;
@@ -125,6 +126,8 @@ work_queues: [
         break :len len;
     }
 ]DeprecatedLinearFifo(Job),
+/// Protects `work_queues` when Sema runs on worker threads and calls `queueJob`.
+work_queue_mutex: std.Thread.Mutex = .{},
 
 /// These jobs are to invoke the Clang compiler to create an object file, which
 /// gets linked with the Compilation.
@@ -265,7 +268,12 @@ link_prog_node: std.Progress.Node = std.Progress.Node.none,
 
 llvm_opt_bisect_limit: c_int,
 llvm_codegen_threads: u32,
+llvm_shard_stats: bool,
 no_link_obj: bool,
+/// When true, the N shard `.o` files emitted by partitioned LLVM codegen are
+/// left as-is (no relocatable -r merge). They land at `{emit}.{i}.o` next to
+/// the would-be merged output. The downstream linker consumes them directly.
+no_merge_shards: bool,
 
 time_report: ?TimeReport,
 
@@ -1729,7 +1737,9 @@ pub const CreateOptions = struct {
     linker_print_map: bool = false,
     llvm_opt_bisect_limit: i32 = -1,
     llvm_codegen_threads: u32 = 0,
+    llvm_shard_stats: bool = false,
     no_link_obj: bool = false,
+    llvm_no_merge_shards: bool = false,
     build_id: ?std.zig.BuildId = null,
     disable_c_depfile: bool = false,
     linker_z_nodelete: bool = false,
@@ -2298,7 +2308,15 @@ pub fn create(gpa: Allocator, arena: Allocator, diag: *CreateDiagnostic, options
             .framework_dirs = options.framework_dirs,
             .llvm_opt_bisect_limit = options.llvm_opt_bisect_limit,
             .llvm_codegen_threads = options.llvm_codegen_threads,
-            .no_link_obj = options.no_link_obj,
+            .llvm_shard_stats = options.llvm_shard_stats,
+            // Partitioned LLVM output produces N objects which must be merged
+            // by the linker for a single-.o result, so the no-link shortcut
+            // does not apply unless `--llvm-no-merge-shards` is also set, in
+            // which case the N shard `.o` files are emitted directly to the
+            // final location and the relocatable merge is skipped entirely.
+            .no_link_obj = options.no_link_obj and
+                (options.llvm_codegen_threads <= 1 or options.llvm_no_merge_shards),
+            .no_merge_shards = options.llvm_no_merge_shards and options.llvm_codegen_threads > 1,
             .skip_linker_dependencies = options.skip_linker_dependencies,
             .queued_jobs = .{},
             .function_sections = options.function_sections,
@@ -2506,7 +2524,16 @@ pub fn create(gpa: Allocator, arena: Allocator, diag: *CreateDiagnostic, options
 
         if (use_llvm) {
             if (opt_zcu) |zcu| {
-                zcu.llvm_object = try LlvmObject.create(arena, comp);
+                // Multi-shard emission only supports producing N object files
+                // for the linker; IR/BC/asm requests for a single output would
+                // silently drop shards 1..N-1. Clamp to 1 in that case.
+                const single_artifact_only = options.emit_bin == .no and
+                    (options.emit_llvm_ir != .no or options.emit_llvm_bc != .no or options.emit_asm != .no);
+                const n_shards: u32 = if (options.llvm_codegen_threads <= 1 or single_artifact_only)
+                    1
+                else
+                    options.llvm_codegen_threads;
+                zcu.llvm_object = try LlvmPartitionSet.create(arena, comp, n_shards);
             }
         }
 
@@ -3129,7 +3156,13 @@ pub fn update(comp: *Compilation, main_progress_node: std.Progress.Node) UpdateE
                 try pt.populateTestFunctions();
             }
 
+            comp.phaseTimingC("update.processExports.start");
             try pt.processExports();
+            comp.phaseTimingC("update.processExports.done");
+        }
+
+        if (comp.llvm_shard_stats or std.process.hasNonEmptyEnvVarConstant("ZIG_JOB_STATS")) {
+            comp.dumpLlvmShardStats(zcu);
         }
 
         if (build_options.enable_debug_extensions and comp.verbose_intern_pool) {
@@ -3267,6 +3300,65 @@ pub fn update(comp: *Compilation, main_progress_node: std.Progress.Node) UpdateE
     }
 }
 
+/// Debug aid: prints to stderr how resolved navs spread over `n` buckets when
+/// hashed by fully-qualified name, plus a "top file" per non-empty bucket.
+fn dumpLlvmShardStats(comp: *Compilation, zcu: *Zcu) void {
+    const ip = &zcu.intern_pool;
+    const max_shards = 256;
+    // `llvm_codegen_threads` is an unbounded u32; clamp it to the table size.
+    const n: u32 = if (comp.llvm_codegen_threads > 1) @min(comp.llvm_codegen_threads, max_shards) else 16;
+    var counts = [_]u32{0} ** max_shards;
+    var top_file = [_]?*Zcu.File{null} ** max_shards;
+    var top_file_count = [_]u32{0} ** max_shards;
+
+    var per_file = std.AutoHashMap(*Zcu.File, u32).init(comp.gpa);
+    defer per_file.deinit();
+
+    const total_navs = ip.navCount();
+    var skipped: u32 = 0;
+    var i: u32 = 0;
+    while (i < total_navs) : (i += 1) {
+        const nav_index = ip.navIndexFromOrdinal(i);
+        const nav = ip.getNav(nav_index);
+        if (nav.status == .unresolved) {
+            skipped += 1;
+            continue;
+        }
+        const fqn = nav.fqn.toSlice(ip);
+        const shard: u8 = @intCast(std.hash.Wyhash.hash(0, fqn) % n);
+        counts[shard] += 1;
+        const file = zcu.fileByIndex(nav.srcInst(ip).resolveFile(ip));
+        const gop = per_file.getOrPut(file) catch continue;
+        if (!gop.found_existing) gop.value_ptr.* = 0;
+        gop.value_ptr.* += 1;
+        if (gop.value_ptr.* > top_file_count[shard]) {
+            top_file_count[shard] = gop.value_ptr.*;
+            top_file[shard] = file;
+        }
+    }
+
+    var min: u32 = std.math.maxInt(u32);
+    var max: u32 = 0;
+    var nonempty: u32 = 0;
+    for (counts[0..n]) |c| {
+        if (c == 0) continue;
+        nonempty += 1;
+        min = @min(min, c);
+        max = @max(max, c);
+    }
+    std.debug.print("llvm-shard-stats for '{s}': n={d} navs={d} skipped={d} nonempty_shards={d}\n", .{ comp.root_name, n, total_navs - skipped, skipped, nonempty });
+    for (counts[0..n], 0..) |c, s| {
+        if (c == 0) continue;
+        var buf: [512]u8 = undefined;
+        const key = if (top_file[s]) |f| f.shardKey(&buf) else "";
+        std.debug.print("  shard {d:>3}: {d:>6} navs  (top file '{s}' = {d})\n", .{ s, c, key, top_file_count[s] });
+    }
+    if (min != std.math.maxInt(u32)) {
+        const ratio = @as(f64, @floatFromInt(max)) / @as(f64, @floatFromInt(min));
+        std.debug.print("  max/min ratio: {d:.2} (max={d}, min={d})\n", .{ ratio, max, min });
+    }
+}
+
 pub fn appendFileSystemInput(comp: *Compilation, path: Compilation.Path) Allocator.Error!void {
     const gpa = comp.gpa;
     const fsi = comp.file_system_inputs orelse return;
@@ -3336,6 +3428,7 @@ fn flush(
     arena: Allocator,
     tid: Zcu.PerThread.Id,
 ) Allocator.Error!void {
+    comp.phaseTimingC("flush.start");
     if (comp.zcu) |zcu| {
         if (zcu.llvm_object) |llvm_object| {
             const pt: Zcu.PerThread = .activate(zcu, tid);
@@ -3364,8 +3457,8 @@ fn flush(
             };
 
             // Generate parallel codegen output filenames if enabled
-            const bin_path_list: ?[]const [*:0]const u8 = if (comp.llvm_codegen_threads > 1 and base_bin_path != null) blk: {
-                const num_threads = comp.llvm_codegen_threads;
+            const bin_path_list: ?[]const [*:0]const u8 = if (llvm_object.n > 1 and base_bin_path != null) blk: {
+                const num_threads = llvm_object.n;
                 const list = try arena.alloc([*:0]const u8, num_threads);
                 const base_path_slice = std.mem.sliceTo(base_bin_path.?, 0);
 
@@ -3414,6 +3507,7 @@ fn flush(
                 error.LinkFailure => {}, // Already reported.
                 error.OutOfMemory => return error.OutOfMemory,
             };
+            comp.phaseTimingC("flush.llvm_emit_done");
         }
     }
     if (comp.bin_file) |lf| {
@@ -3430,7 +3524,14 @@ fn flush(
                 error.LinkFailure => {}, // Already reported.
                 error.OutOfMemory => return error.OutOfMemory,
             };
+        } else if (comp.no_merge_shards) {
+            // Shard objects went to `{emit}.{i}.o`; the 0-byte stub the linker
+            // created at `{emit}` during open() will never be flushed. Remove
+            // it so downstream build systems globbing `{emit}.*.o` aren't
+            // confused by an empty object alongside the real shards.
+            lf.emit.root_dir.handle.deleteFile(lf.emit.sub_path) catch {};
         }
+        comp.phaseTimingC("flush.lf_flush_done");
     }
     if (comp.zcu) |zcu| {
         try link.File.C.flushEmitH(zcu);
@@ -4629,10 +4730,20 @@ pub fn unableToLoadZcuFile(
     });
 }
 
+/// Prints a `[PHASE]` timestamp line for `label` if `ZIG_PHASE_TIMING` is non-empty.
+pub fn phaseTiming(label: []const u8) void {
+    if (std.process.hasNonEmptyEnvVarConstant("ZIG_PHASE_TIMING")) std.debug.print("[PHASE] {d} - {s}\n", .{ std.time.milliTimestamp(), label });
+}
+/// Same gate as `phaseTiming`; the line carries `comp.root_name` ahead of the label.
+fn phaseTimingC(comp: *const Compilation, label: []const u8) void {
+    if (std.process.hasNonEmptyEnvVarConstant("ZIG_PHASE_TIMING")) std.debug.print("[PHASE] {d} {s} {s}\n", .{ std.time.milliTimestamp(), comp.root_name, label });
+}
+
 fn performAllTheWork(
     comp: *Compilation,
     main_progress_node: std.Progress.Node,
 ) JobError!void {
+    comp.phaseTimingC("performAllTheWork.start");
     // Regardless of errors, `comp.zcu` needs to update its generation number.
     defer if (comp.zcu) |zcu| {
         zcu.generation += 1;
@@ -4657,8 +4768,10 @@ fn performAllTheWork(
     var work_queue_wait_group: WaitGroup = .{};
     defer work_queue_wait_group.wait();
 
+    defer comp.phaseTimingC("performAllTheWork.codegen_wait_done");
     comp.link_task_wait_group.reset();
     defer comp.link_task_wait_group.wait();
+    defer comp.phaseTimingC("performAllTheWork.work_loop_done");
 
     // Already-queued prelink tasks
     comp.link_prog_node.increaseEstimatedTotalItems(comp.link_task_queue.queued_prelink.items.len);
@@ -5059,13 +5172,62 @@ fn performAllTheWork(
         // Start the timer for the "decls" part of the pipeline (Sema, CodeGen, link).
         decl_work_timer = comp.startTimer();
     }
+    comp.phaseTimingC("performAllTheWork.work_loop_start");
 
+    if (comp.zcu) |zcu| {
+        // Sub-compilations (compiler_rt, ubsan_rt, etc.) and the build runner
+        // are small and gain nothing from parallel Sema. For `zig build`, the
+        // runner is `root_mod` (main_mod is the user's build.zig).
+        const is_build_runner = std.mem.endsWith(u8, zcu.root_mod.root_src_path, "build_runner.zig");
+        zcu.parallel_sema = comp.parent_whole_cache == null and
+            !is_build_runner and
+            std.process.hasNonEmptyEnvVarConstant("ZIG_PARALLEL_SEMA");
+    }
+
+    var job_ns: [@typeInfo(Job.Tag).@"enum".fields.len]u64 = @splat(0);
+    var job_ct: [@typeInfo(Job.Tag).@"enum".fields.len]u64 = @splat(0);
+    var export_func_pass: u8 = 0;
     work: while (true) {
-        for (&comp.work_queues) |*work_queue| if (work_queue.readItem()) |job| {
+        const maybe_job: ?Job = job: {
+            comp.work_queue_mutex.lock();
+            defer comp.work_queue_mutex.unlock();
+            for (&comp.work_queues) |*work_queue| if (work_queue.readItem()) |job| break :job job;
+            break :job null;
+        };
+        if (maybe_job) |job| {
+            if (comp.zcu) |zcu| if (zcu.parallel_sema and job == .analyze_func) {
+                // Skip dispatch if a worker already holds this unit (or it has
+                // since been analyzed) — re-queues from the retry path can
+                // produce duplicate analyze_func jobs and N-1 workers then
+                // condvar-wait on the one analyzer.
+                const a = zcu.intern_pool.funcAnalysisUnordered(job.analyze_func);
+                if (a.is_analyzed) continue :work;
+                _ = zcu.sema_pending_jobs.rmw(.Add, 1, .acquire);
+                comp.thread_pool.spawnWgId(&comp.link_task_wait_group, workerAnalyzeFunc, .{ comp, job.analyze_func });
+                continue :work;
+            };
+            const t0 = if (comp.llvm_shard_stats or std.process.hasNonEmptyEnvVarConstant("ZIG_JOB_STATS")) std.time.nanoTimestamp() else 0;
             try processOneJob(@intFromEnum(Zcu.PerThread.Id.main), comp, job);
+            if (comp.llvm_shard_stats or std.process.hasNonEmptyEnvVarConstant("ZIG_JOB_STATS")) {
+                job_ns[@intFromEnum(@as(Job.Tag, job))] += @intCast(std.time.nanoTimestamp() - t0);
+                job_ct[@intFromEnum(@as(Job.Tag, job))] += 1;
+            }
             continue :work;
-        };
+        }
         if (comp.zcu) |zcu| {
+            if (zcu.sema_pending_jobs.load(.acquire) > 0) {
+                std.Thread.yield() catch {};
+                continue :work;
+            }
+            // A worker may have enqueued between our queue read and the
+            // counter dropping to zero; re-check the queues before exiting.
+            const drained = drained: {
+                comp.work_queue_mutex.lock();
+                defer comp.work_queue_mutex.unlock();
+                for (&comp.work_queues) |*q| if (q.count > 0) break :drained false;
+                break :drained true;
+            };
+            if (!drained) continue :work;
             // If there's no work queued, check if there's anything outdated
             // which we need to work on, and queue it if so.
             if (try zcu.findOutdatedToAnalyze()) |outdated| {
@@ -5080,16 +5242,43 @@ fn performAllTheWork(
                 });
                 continue;
             }
+            // Final pass under parallel Sema: any exported function whose body
+            // analysis was dropped by a post-commit retry will not be in
+            // `nav_map` at processExports time. Re-queue here so the work loop
+            // drains it before we exit.
+            if (zcu.parallel_sema and export_func_pass < 3) {
+                export_func_pass += 1;
+                var any_queued = false;
+                for (zcu.single_exports.values()) |idx| {
+                    any_queued = ensureExportFuncQueued(zcu, idx) or any_queued;
+                }
+                for (zcu.multi_exports.values()) |info| {
+                    for (info.index..info.index + info.len) |i| {
+                        any_queued = ensureExportFuncQueued(zcu, @enumFromInt(i)) or any_queued;
+                    }
+                }
+                if (any_queued) continue;
+            }
             zcu.sema_prog_node.end();
             zcu.sema_prog_node = .none;
         }
         break;
     }
+    if (comp.zcu) |zcu| zcu.parallel_sema = false;
+    if (comp.llvm_shard_stats or std.process.hasNonEmptyEnvVarConstant("ZIG_JOB_STATS")) {
+        std.debug.print("=== work loop job timings (main thread) ===\n", .{});
+        inline for (@typeInfo(Job.Tag).@"enum".fields, 0..) |f, i| {
+            if (job_ct[i] != 0)
+                std.debug.print("  {s:>24}: {d:>6}ms ({d} jobs)\n", .{ f.name, job_ns[i] / 1_000_000, job_ct[i] });
+        }
+    }
 }
 
 const JobError = Allocator.Error;
 
 pub fn queueJob(comp: *Compilation, job: Job) !void {
+    comp.work_queue_mutex.lock();
+    defer comp.work_queue_mutex.unlock();
     try comp.work_queues[Job.stage(job)].writeItem(job);
 }
 
@@ -5108,7 +5297,20 @@ fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void {
                 comp.link_prog_node.completeOne();
                 air.deinit(gpa);
             }
-            if (!air.typesFullyResolved(zcu)) {
+            // Under serial Sema, FIFO dispatch guarantees every
+            // `resolve_type_fully` queued before this body's analysis has
+            // completed, so `typesFullyResolved == false` means the type
+            // *failed*. Under parallel Sema both job kinds run concurrently —
+            // a struct or union may simply be mid-resolution. Dropping the
+            // body would leave a dangling cross-shard `__N<nav>` undef.
+            // Force-resolve via `resolveTypesFully`, which blocks on the
+            // claimOrWait-gated resolution; drop only if that errors.
+            const types_ok: bool = if (zcu.parallel_sema) ok: {
+                const pt: Zcu.PerThread = .activate(zcu, @enumFromInt(tid));
+                defer pt.deactivate();
+                break :ok air.resolveTypesFully(pt);
+            } else air.typesFullyResolved(zcu);
+            if (!types_ok) {
                 // Type resolution failed in a way which affects this function. This is a transitive
                 // failure, but it doesn't need recording, because this function semantically depends
                 // on the failed type, so when it is changed the function is updated.
@@ -5154,8 +5356,7 @@ fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void {
             const zcu = comp.zcu.?;
             const nav = zcu.intern_pool.getNav(nav_index);
             if (nav.analysis != null) {
-                const unit: InternPool.AnalUnit = .wrap(.{ .nav_val = nav_index });
-                if (zcu.failed_analysis.contains(unit) or zcu.transitive_failed_analysis.contains(unit)) {
+                if (zcu.anyAnalysisFailed(.wrap(.{ .nav_val = nav_index }))) {
                     comp.link_prog_node.completeOne();
                     return;
                 }
@@ -5192,9 +5393,16 @@ fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void {
             const pt: Zcu.PerThread = .activate(comp.zcu.?, @enumFromInt(tid));
             defer pt.deactivate();
 
+            Zcu.tls_retry_loop = null;
             pt.ensureFuncBodyUpToDate(func) catch |err| switch (err) {
                 error.OutOfMemory => |e| return e,
-                error.AnalysisFail => return,
+                error.AnalysisFail => {
+                    if (Zcu.tls_retry_loop != null) {
+                        Zcu.tls_retry_loop = null;
+                        try comp.queueJob(.{ .analyze_func = func });
+                    }
+                    return;
+                },
             };
         },
         .analyze_comptime_unit => |unit| {
@@ -5204,6 +5412,7 @@ fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void {
             const pt: Zcu.PerThread = .activate(comp.zcu.?, @enumFromInt(tid));
             defer pt.deactivate();
 
+            Zcu.tls_retry_loop = null;
             const maybe_err: Zcu.SemaError!void = switch (unit.unwrap()) {
                 .@"comptime" => |cu| pt.ensureComptimeUnitUpToDate(cu),
                 .nav_ty => |nav| pt.ensureNavTypeUpToDate(nav),
@@ -5214,7 +5423,13 @@ fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void {
             };
             maybe_err catch |err| switch (err) {
                 error.OutOfMemory => |e| return e,
-                error.AnalysisFail => return,
+                error.AnalysisFail => {
+                    if (Zcu.tls_retry_loop != null) {
+                        Zcu.tls_retry_loop = null;
+                        try comp.queueJob(.{ .analyze_comptime_unit = unit });
+                    }
+                    return;
+                },
             };
 
             queue_test_analysis: {
@@ -5242,9 +5457,16 @@ fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void {
 
             const pt: Zcu.PerThread = .activate(comp.zcu.?, @enumFromInt(tid));
             defer pt.deactivate();
+            Zcu.tls_retry_loop = null;
             Type.fromInterned(ty).resolveFully(pt) catch |err| switch (err) {
                 error.OutOfMemory => return error.OutOfMemory,
-                error.AnalysisFail => return,
+                error.AnalysisFail => {
+                    if (Zcu.tls_retry_loop != null) {
+                        Zcu.tls_retry_loop = null;
+                        try comp.queueJob(.{ .resolve_type_fully = ty });
+                    }
+                    return;
+                },
             };
         },
         .analyze_mod => |mod| {
@@ -5886,6 +6108,51 @@ pub const RtOptions = struct {
     allow_lto: bool = true,
 };
 
+/// Re-queue body analysis for the function exported by `export_idx`, unless it
+/// is already in its LLVM shard's `nav_map` (without LLVM: already analyzed).
+/// Returns true iff a job was queued; OOM is recorded via `setAllocFailure`.
+fn ensureExportFuncQueued(zcu: *Zcu, export_idx: Zcu.Export.Index) bool {
+    const ip = &zcu.intern_pool;
+    const exported = export_idx.ptr(zcu).exported;
+    const nav = if (exported == .nav) exported.nav else return false;
+    const v = switch (ip.getNav(nav).status) {
+        .fully_resolved => |r| r.val,
+        else => return false,
+    };
+    if (!ip.isFuncBody(v)) return false;
+    const func = ip.unwrapCoercedFunc(v);
+    // Check the LLVM nav_map: if the body landed there, codegen ran.
+    if (zcu.llvm_object) |llvm| {
+        if (llvm.objects[zcu.navShard(nav, llvm.n)].nav_map.contains(nav)) return false;
+    } else if (ip.funcAnalysisUnordered(func).is_analyzed) return false;
+    // Clear is_analyzed so re-analysis is not a no-op and re-queues codegen_func.
+    zcu.funcInfo(func).clearAnalyzed(ip);
+    zcu.comp.queueJob(.{ .analyze_func = func }) catch {
+        zcu.comp.setAllocFailure();
+        return false;
+    };
+    return true;
+}
+
+/// Thread-pool entry point that analyzes one function body under parallel
+/// Sema. Decrements `sema_pending_jobs` exactly once on exit.
+fn workerAnalyzeFunc(tid: usize, comp: *Compilation, func: InternPool.Index) void {
+    const zcu = comp.zcu.?;
+    const pt: Zcu.PerThread = .activate(zcu, @enumFromInt(tid));
+    defer pt.deactivate();
+    defer _ = zcu.sema_pending_jobs.rmw(.Sub, 1, .release);
+    Zcu.tls_retry_loop = null;
+    pt.ensureFuncBodyUpToDate(func) catch |err| switch (err) {
+        error.OutOfMemory => comp.setAllocFailure(),
+        // Order-dependent dependency loop: re-queue so another thread (or a later
+        // attempt) can retry once intermediates have been resolved independently.
+        error.AnalysisFail => if (Zcu.tls_retry_loop != null) {
+            Zcu.tls_retry_loop = null;
+            comp.queueJob(.{ .analyze_func = func }) catch comp.setAllocFailure();
+        },
+    };
+}
+
 fn workerZcuCodegen(
     tid: usize,
     comp: *Compilation,
diff --git a/src/InternPool.zig b/src/InternPool.zig
index 751c42a0f41d..c5fe978be5fc 100644
--- a/src/InternPool.zig
+++ b/src/InternPool.zig
@@ -737,7 +737,11 @@ pub const Nav = struct {
             @"addrspace": std.builtin.AddressSpace,
             /// Populated only if `bits.status == .type_resolved`.
             is_threadlocal: bool,
-            _: u1 = 0,
+            /// Seqlock write-in-progress flag. `resolveNavType`/`resolveNavValue`
+            /// set this before mutating `type_or_val` and clear it on the final
+            /// `bits` store; `getNav` spins while it is set so it never returns
+            /// the transient prelude state.
+            writing: bool = false,
         };
 
         fn unpack(repr: Repr) Nav {
@@ -1313,6 +1317,12 @@ const Local = struct {
                     var new_list: ListSelf = .{ .bytes = @ptrCast(buf[bytes_offset..].ptr) };
                     new_list.header().* = .{ .capacity = capacity };
                     const len = mutable.mutate.len;
+                    // Hold the per-list mutex across the copy and release so a
+                    // cross-tid writer that locked this mutex and acquired the
+                    // shared view cannot have its write land in the old buffer
+                    // after this copy has already snapshotted it.
+                    mutable.mutate.mutex.lock();
+                    defer mutable.mutate.mutex.unlock();
                     // this cold, quickly predictable, condition enables
                     // the `MultiArrayList` optimization in `view`
                     if (len > 0) {
@@ -1320,8 +1330,6 @@ const Local = struct {
                         const new_slice = new_list.view().slice();
                         inline for (fields) |field| @memcpy(new_slice.items(field)[0..len], old_slice.items(field)[0..len]);
                     }
-                    mutable.mutate.mutex.lock();
-                    defer mutable.mutate.mutex.unlock();
                     mutable.list.release(new_list);
                 }
 
@@ -1961,24 +1969,43 @@ pub const OptionalNullTerminatedString = enum(u32) {
 /// * comptime-known value (where we store the value)
 /// * `Nav` val (so that we can analyze the value lazily)
 /// * `Nav` ref (so that we can analyze the reference lazily)
+/// Re-encode a `tid_shift_32` index (`Nav.Index`) with `tid_shift_30` so that it fits
+/// `CaptureValue.idx`. NOTE(review): this sits between `CaptureValue` and its doc
+/// comment above, which therefore attaches to this function; consider relocating.
+fn repack32To30(ip: *const InternPool, raw32: u32) u30 {
+    if (single_threaded) return @intCast(raw32);
+    const tid = raw32 >> ip.tid_shift_32 & ip.getTidMask();
+    const idx = raw32 & ip.getIndexMask(u32);
+    assert(idx >> ip.tid_shift_30 == 0); // index bits must not alias the tid field
+    return @intCast(@shlExact(tid, ip.tid_shift_30) | idx);
+}
+fn repack30To32(ip: *const InternPool, raw30: u30) u32 {
+    if (single_threaded) return raw30;
+    const tid = raw30 >> ip.tid_shift_30 & ip.getTidMask();
+    return @shlExact(tid, ip.tid_shift_32) | (raw30 & ((@as(u32, 1) << ip.tid_shift_30) - 1));
+}
+
 pub const CaptureValue = packed struct(u32) {
     tag: enum(u2) { @"comptime", runtime, nav_val, nav_ref },
     idx: u30,
 
-    pub fn wrap(val: Unwrapped) CaptureValue {
+    pub fn wrap(ip: *const InternPool, val: Unwrapped) CaptureValue {
         return switch (val) {
+            // `Index` is already encoded with `tid_shift_30`, so it fits u30 directly.
             .@"comptime" => |i| .{ .tag = .@"comptime", .idx = @intCast(@intFromEnum(i)) },
             .runtime => |i| .{ .tag = .runtime, .idx = @intCast(@intFromEnum(i)) },
-            .nav_val => |i| .{ .tag = .nav_val, .idx = @intCast(@intFromEnum(i)) },
-            .nav_ref => |i| .{ .tag = .nav_ref, .idx = @intCast(@intFromEnum(i)) },
+            // `Nav.Index` is encoded with `tid_shift_32`; repack so the tid bits
+            // land inside u30.
+            .nav_val => |i| .{ .tag = .nav_val, .idx = repack32To30(ip, @intFromEnum(i)) },
+            .nav_ref => |i| .{ .tag = .nav_ref, .idx = repack32To30(ip, @intFromEnum(i)) },
         };
     }
-    pub fn unwrap(val: CaptureValue) Unwrapped {
+    pub fn unwrap(val: CaptureValue, ip: *const InternPool) Unwrapped {
         return switch (val.tag) {
             .@"comptime" => .{ .@"comptime" = @enumFromInt(val.idx) },
             .runtime => .{ .runtime = @enumFromInt(val.idx) },
-            .nav_val => .{ .nav_val = @enumFromInt(val.idx) },
-            .nav_ref => .{ .nav_ref = @enumFromInt(val.idx) },
+            .nav_val => .{ .nav_val = @enumFromInt(repack30To32(ip, val.idx)) },
+            .nav_ref => .{ .nav_ref = @enumFromInt(repack30To32(ip, val.idx)) },
         };
     }
 
@@ -2298,8 +2325,14 @@ pub const Key = union(enum) {
         /// Used for mutating that data.
         analysis_extra_index: u32,
         /// Index into extra array of the `zir_body_inst` corresponding to this function.
-        /// Used for mutating that data.
+        /// Used for mutating that data. For generic instances this index refers
+        /// to the generic owner's extra array, so it may live on a different
+        /// thread's local than `tid`.
         zir_body_inst_extra_index: u32,
+        /// Thread whose extra array `zir_body_inst_extra_index` refers to.
+        /// Equals `tid` for `func_decl`; equals the generic owner's tid for
+        /// `func_instance`.
+        zir_body_inst_tid: Zcu.PerThread.Id,
         /// Index into extra array of the resolved inferred error set for this function.
         /// Used for mutating that data.
         /// 0 when the function does not have an inferred error set.
@@ -2368,9 +2401,35 @@ pub const Key = union(enum) {
             @atomicStore(FuncAnalysis, analysis_ptr, analysis, .release);
         }
 
+        /// Resets the `is_analyzed` bit while holding the `extra` mutex of local `func.tid`.
+        pub fn clearAnalyzed(func: Func, ip: *InternPool) void {
+            const local = ip.getLocal(func.tid);
+            local.mutate.extra.mutex.lock();
+            defer local.mutate.extra.mutex.unlock();
+            const analysis_slot = func.analysisPtr(ip);
+            var updated = analysis_slot.*;
+            updated.is_analyzed = false;
+            @atomicStore(FuncAnalysis, analysis_slot, updated, .release);
+        }
+
+        /// Under the extra mutex: set `is_queued` and return true (caller should enqueue the
+        /// analyze_func job), or return false if `is_queued` or `is_analyzed` was already set.
+        pub fn trySetQueued(func: Func, ip: *InternPool) bool {
+            const extra_mutex = &ip.getLocal(func.tid).mutate.extra.mutex;
+            extra_mutex.lock();
+            defer extra_mutex.unlock();
+
+            const analysis_ptr = func.analysisPtr(ip);
+            var analysis = analysis_ptr.*;
+            if (analysis.is_queued or analysis.is_analyzed) return false;
+            analysis.is_queued = true;
+            @atomicStore(FuncAnalysis, analysis_ptr, analysis, .release);
+            return true;
+        }
+
         /// Returns a pointer that becomes invalid after any additions to the `InternPool`.
         fn zirBodyInstPtr(func: Func, ip: *const InternPool) *TrackedInst.Index {
-            const extra = ip.getLocalShared(func.tid).extra.acquire();
+            const extra = ip.getLocalShared(func.zir_body_inst_tid).extra.acquire();
             return @ptrCast(&extra.view().items(.@"0")[func.zir_body_inst_extra_index]);
         }
 
@@ -3388,7 +3447,9 @@ pub const LoadedUnionType = struct {
     }
 
     pub fn flagsUnordered(u: LoadedUnionType, ip: *const InternPool) Tag.TypeUnion.Flags {
-        return @atomicLoad(Tag.TypeUnion.Flags, u.flagsPtr(ip), .unordered);
+        // Despite the name, use .acquire so a true status-done flag synchronises
+        // with the prior release-store and makes preceding field writes visible.
+        return @atomicLoad(Tag.TypeUnion.Flags, u.flagsPtr(ip), .acquire);
     }
 
     pub fn setStatus(u: LoadedUnionType, ip: *InternPool, status: Status) void {
@@ -3546,15 +3607,35 @@ pub const LoadedUnionType = struct {
         ptr.* = new_zir_index;
     }
 
-    pub fn setFieldTypes(self: LoadedUnionType, ip: *const InternPool, types: []const Index) void {
+    pub fn setFieldTypes(self: LoadedUnionType, ip: *InternPool, types: []const Index) void {
+        const extra_mutex = &ip.getLocal(self.tid).mutate.extra.mutex;
+        extra_mutex.lock();
+        defer extra_mutex.unlock();
         @memcpy(self.field_types.get(ip), types);
     }
 
-    pub fn setFieldAligns(self: LoadedUnionType, ip: *const InternPool, aligns: []const Alignment) void {
+    pub fn setFieldAligns(self: LoadedUnionType, ip: *InternPool, aligns: []const Alignment) void {
         if (aligns.len == 0) return;
         assert(self.flagsUnordered(ip).any_aligned_fields);
+        const extra_mutex = &ip.getLocal(self.tid).mutate.extra.mutex;
+        extra_mutex.lock();
+        defer extra_mutex.unlock();
         @memcpy(self.field_aligns.get(ip), aligns);
     }
+
+    /// Stores `ty` as the type of field `i`, holding the `extra` mutex of local `self.tid`.
+    pub fn setFieldType(self: LoadedUnionType, ip: *InternPool, i: usize, ty: Index) void {
+        ip.getLocal(self.tid).mutate.extra.mutex.lock();
+        defer ip.getLocal(self.tid).mutate.extra.mutex.unlock();
+        self.field_types.get(ip)[i] = ty;
+    }
+
+    /// Stores `a` as the alignment of field `i`, holding the `extra` mutex of local `self.tid`.
+    pub fn setFieldAlign(self: LoadedUnionType, ip: *InternPool, i: usize, a: Alignment) void {
+        ip.getLocal(self.tid).mutate.extra.mutex.lock();
+        defer ip.getLocal(self.tid).mutate.extra.mutex.unlock();
+        self.field_aligns.get(ip)[i] = a;
+    }
 };
 
 pub fn loadUnionType(ip: *const InternPool, index: Index) LoadedUnionType {
@@ -3654,11 +3735,17 @@ pub const LoadedStructType = struct {
             return @as(u1, @truncate(this.get(ip)[i / 32] >> @intCast(i % 32))) != 0;
         }
 
-        pub fn setBit(this: ComptimeBits, ip: *const InternPool, i: usize) void {
+        pub fn setBit(this: ComptimeBits, ip: *InternPool, i: usize) void {
+            const extra_mutex = &ip.getLocal(this.tid).mutate.extra.mutex;
+            extra_mutex.lock();
+            defer extra_mutex.unlock();
             this.get(ip)[i / 32] |= @as(u32, 1) << @intCast(i % 32);
         }
 
-        pub fn clearBit(this: ComptimeBits, ip: *const InternPool, i: usize) void {
+        pub fn clearBit(this: ComptimeBits, ip: *InternPool, i: usize) void {
+            const extra_mutex = &ip.getLocal(this.tid).mutate.extra.mutex;
+            extra_mutex.lock();
+            defer extra_mutex.unlock();
             this.get(ip)[i / 32] &= ~(@as(u32, 1) << @intCast(i % 32));
         }
     };
@@ -3724,10 +3811,94 @@ pub const LoadedStructType = struct {
         ip: *InternPool,
         name: NullTerminatedString,
     ) ?u32 {
-        const extra = ip.getLocalShared(s.tid).extra.acquire();
+        const local = ip.getLocal(s.tid);
+        local.mutate.extra.mutex.lock();
+        defer local.mutate.extra.mutex.unlock();
+        local.mutate.maps.mutex.lock();
+        defer local.mutate.maps.mutex.unlock();
+        const extra = local.shared.extra.acquire();
         return ip.addFieldName(extra, s.names_map.unwrap().?, s.field_names.start, name);
     }
 
+    /// Stores `ty` as the type of field `i`, holding the `extra` mutex of local `s.tid`.
+    pub fn setFieldType(s: LoadedStructType, ip: *InternPool, i: usize, ty: Index) void {
+        ip.getLocal(s.tid).mutate.extra.mutex.lock();
+        defer ip.getLocal(s.tid).mutate.extra.mutex.unlock();
+        s.field_types.get(ip)[i] = ty;
+    }
+
+    /// Stores `init_val` as the init of field `i`, holding the `extra` mutex of local `s.tid`.
+    pub fn setFieldInit(s: LoadedStructType, ip: *InternPool, i: usize, init_val: Index) void {
+        ip.getLocal(s.tid).mutate.extra.mutex.lock();
+        defer ip.getLocal(s.tid).mutate.extra.mutex.unlock();
+        s.field_inits.get(ip)[i] = init_val;
+    }
+
+    /// Stores `a` as the alignment of field `i`, holding the `extra` mutex of local `s.tid`.
+    pub fn setFieldAlign(s: LoadedStructType, ip: *InternPool, i: usize, a: Alignment) void {
+        ip.getLocal(s.tid).mutate.extra.mutex.lock();
+        defer ip.getLocal(s.tid).mutate.extra.mutex.unlock();
+        s.field_aligns.get(ip)[i] = a;
+    }
+
+    /// Copies `types` over the whole field-type slice, holding the `extra` mutex of local `s.tid`.
+    pub fn setFieldTypesAll(s: LoadedStructType, ip: *InternPool, types: []const Index) void {
+        ip.getLocal(s.tid).mutate.extra.mutex.lock();
+        defer ip.getLocal(s.tid).mutate.extra.mutex.unlock();
+        @memcpy(s.field_types.get(ip), types);
+    }
+
+    /// Stores `off` as the offset of field `i`, holding the `extra` mutex of local `s.tid`.
+    pub fn setOffset(s: LoadedStructType, ip: *InternPool, i: usize, off: u32) void {
+        ip.getLocal(s.tid).mutate.extra.mutex.lock();
+        defer ip.getLocal(s.tid).mutate.extra.mutex.unlock();
+        s.offsets.get(ip)[i] = off;
+    }
+
+    /// Copies `offs` over the whole offsets slice, holding the `extra` mutex of local `s.tid`.
+    pub fn setOffsetsAll(s: LoadedStructType, ip: *InternPool, offs: []const u32) void {
+        ip.getLocal(s.tid).mutate.extra.mutex.lock();
+        defer ip.getLocal(s.tid).mutate.extra.mutex.unlock();
+        @memcpy(s.offsets.get(ip), offs);
+    }
+
+    /// Copies `ro` over the whole runtime-order slice, holding the `extra` mutex of local `s.tid`.
+    pub fn setRuntimeOrderAll(s: LoadedStructType, ip: *InternPool, ro: []const RuntimeOrder) void {
+        ip.getLocal(s.tid).mutate.extra.mutex.lock();
+        defer ip.getLocal(s.tid).mutate.extra.mutex.unlock();
+        @memcpy(s.runtime_order.get(ip), ro);
+    }
+
+    /// Copies `inits` over the whole field-init slice, holding the `extra` mutex of local `s.tid`.
+    pub fn setFieldInitsAll(s: LoadedStructType, ip: *InternPool, inits: []const Index) void {
+        ip.getLocal(s.tid).mutate.extra.mutex.lock();
+        defer ip.getLocal(s.tid).mutate.extra.mutex.unlock();
+        @memcpy(s.field_inits.get(ip), inits);
+    }
+
+    /// Copies `aligns` over the field-align slice under the `extra` mutex; no-op when that slice is empty.
+    pub fn setFieldAlignsAll(s: LoadedStructType, ip: *InternPool, aligns: []const Alignment) void {
+        if (s.field_aligns.len == 0) return;
+        ip.getLocal(s.tid).mutate.extra.mutex.lock();
+        defer ip.getLocal(s.tid).mutate.extra.mutex.unlock();
+        @memcpy(s.field_aligns.get(ip), aligns);
+    }
+
+    /// Writes all field types and, when `aligns` is non-null and the struct has
+    /// align storage, all field aligns, within a single `extra.mutex` hold.
+    pub fn setFieldTypesAlignsAll(
+        s: LoadedStructType,
+        ip: *InternPool,
+        types: []const Index,
+        aligns: ?[]const Alignment,
+    ) void {
+        ip.getLocal(s.tid).mutate.extra.mutex.lock();
+        defer ip.getLocal(s.tid).mutate.extra.mutex.unlock();
+        @memcpy(s.field_types.get(ip), types);
+        const new_aligns = aligns orelse return;
+        if (s.field_aligns.len != 0) @memcpy(s.field_aligns.get(ip), new_aligns);
+    }
+
     pub fn fieldAlign(s: LoadedStructType, ip: *const InternPool, i: usize) Alignment {
         if (s.field_aligns.len == 0) return .none;
         return s.field_aligns.get(ip)[i];
@@ -3742,7 +3913,8 @@ pub const LoadedStructType = struct {
     /// Returns `none` in the case the struct is a tuple.
     pub fn fieldName(s: LoadedStructType, ip: *const InternPool, i: usize) OptionalNullTerminatedString {
         if (s.field_names.len == 0) return .none;
-        return s.field_names.get(ip)[i].toOptional();
+        const names = s.field_names.get(ip);
+        return @atomicLoad(NullTerminatedString, &names[i], .acquire).toOptional();
     }
 
     pub fn fieldIsComptime(s: LoadedStructType, ip: *const InternPool, i: usize) bool {
@@ -3763,7 +3935,9 @@ pub const LoadedStructType = struct {
     }
 
     pub fn flagsUnordered(s: LoadedStructType, ip: *const InternPool) Tag.TypeStruct.Flags {
-        return @atomicLoad(Tag.TypeStruct.Flags, s.flagsPtr(ip), .unordered);
+        // Despite the name, use .acquire so a true status-done flag synchronises
+        // with the prior release-store and makes preceding field writes visible.
+        return @atomicLoad(Tag.TypeStruct.Flags, s.flagsPtr(ip), .acquire);
     }
 
     /// The returned pointer expires with any addition to the `InternPool`.
@@ -3776,7 +3950,7 @@ pub const LoadedStructType = struct {
     }
 
     pub fn packedFlagsUnordered(s: LoadedStructType, ip: *const InternPool) Tag.TypeStructPacked.Flags {
-        return @atomicLoad(Tag.TypeStructPacked.Flags, s.packedFlagsPtr(ip), .unordered);
+        return @atomicLoad(Tag.TypeStructPacked.Flags, s.packedFlagsPtr(ip), .acquire);
     }
 
     /// Reads the non-opv flag calculated during AstGen. Used to short-circuit more
@@ -4069,7 +4243,10 @@ pub const LoadedStructType = struct {
 
     pub fn haveFieldTypes(s: LoadedStructType, ip: *const InternPool) bool {
         const types = s.field_types.get(ip);
-        return types.len == 0 or types[types.len - 1] != .none;
+        if (types.len == 0) return true;
+        // Paired with release-store at the end of `structFields` so a true
+        // result here makes the preceding name/type slot writes visible.
+        return @atomicLoad(Index, &types[types.len - 1], .acquire) != .none;
     }
 
     pub fn haveFieldInits(s: LoadedStructType, ip: *const InternPool) bool {
@@ -6232,8 +6409,11 @@ pub const FuncAnalysis = packed struct(u32) {
     inferred_error_set: bool,
     disable_instrumentation: bool,
     disable_intrinsics: bool,
+    /// Under parallel Sema, set atomically by `ensureFuncBodyAnalysisQueued`
+    /// to dedupe work-queue dispatch without taking the global lock.
+    is_queued: bool = false,
 
-    _: u23 = 0,
+    _: u22 = 0,
 };
 
 pub const Bytes = struct {
@@ -6839,7 +7019,8 @@ pub fn init(ip: *InternPool, gpa: Allocator, available_threads: usize) !void {
     ip.tid_shift_30 = if (single_threaded) 0 else 30 - ip.tid_width;
     ip.tid_shift_31 = if (single_threaded) 0 else 31 - ip.tid_width;
     ip.tid_shift_32 = if (single_threaded) 0 else ip.tid_shift_31 +| 1;
-    ip.shards = try gpa.alloc(Shard, @as(usize, 1) << ip.tid_width);
+    const shard_count: usize = @max(@as(usize, 1) << ip.tid_width, 256);
+    ip.shards = try gpa.alloc(Shard, shard_count);
     @memset(ip.shards, .{
         .shared = .{
             .map = Shard.Map(Index).empty,
@@ -7534,6 +7715,7 @@ fn extraFuncDecl(tid: Zcu.PerThread.Id, extra: Local.Extra, extra_index: u32) Ke
         .uncoerced_ty = func_decl.data.ty,
         .analysis_extra_index = extra_index + std.meta.fieldIndex(P, "analysis").?,
         .zir_body_inst_extra_index = extra_index + std.meta.fieldIndex(P, "zir_body_inst").?,
+        .zir_body_inst_tid = tid,
         .resolved_error_set_extra_index = if (func_decl.data.analysis.inferred_error_set) func_decl.end else 0,
         .branch_quota_extra_index = 0,
         .owner_nav = func_decl.data.owner_nav,
@@ -7562,6 +7744,7 @@ fn extraFuncInstance(ip: *const InternPool, tid: Zcu.PerThread.Id, extra: Local.
         .uncoerced_ty = ty,
         .analysis_extra_index = analysis_extra_index,
         .zir_body_inst_extra_index = func_decl.zir_body_inst_extra_index,
+        .zir_body_inst_tid = func_decl.tid,
         .resolved_error_set_extra_index = if (analysis.inferred_error_set) end_extra_index else 0,
         .branch_quota_extra_index = extra_index + std.meta.fieldIndex(Tag.FuncInstance, "branch_quota").?,
         .owner_nav = owner_nav,
@@ -7618,6 +7801,9 @@ const GetOrPutKey = union(enum) {
         tid: Zcu.PerThread.Id,
         shard: *Shard,
         map_index: u32,
+        /// When true, the caller already holds `shard.mutate.map.mutex`
+        /// (via `lockShardsSorted`) and `putFinal`/`cancel` must not unlock.
+        prelocked: bool = false,
     },
 
     fn put(gop: *GetOrPutKey) Index {
@@ -7650,7 +7836,7 @@ const GetOrPutKey = union(enum) {
             .new => |info| {
                 assert(info.shard.shared.map.entries[info.map_index].value == index);
                 info.shard.mutate.map.len += 1;
-                info.shard.mutate.map.mutex.unlock();
+                if (!info.prelocked) info.shard.mutate.map.mutex.unlock();
                 gop.* = .{ .existing = index };
             },
         }
@@ -7659,7 +7845,7 @@ const GetOrPutKey = union(enum) {
     fn cancel(gop: *GetOrPutKey) void {
         switch (gop.*) {
             .existing => {},
-            .new => |info| info.shard.mutate.map.mutex.unlock(),
+            .new => |info| if (!info.prelocked) info.shard.mutate.map.mutex.unlock(),
         }
         gop.* = .{ .existing = undefined };
     }
@@ -7673,13 +7859,53 @@ const GetOrPutKey = union(enum) {
         gop.* = undefined;
     }
 };
+
+/// Sentinel stored in a namespace-type's `namespace` extra slot between
+/// `getStructType`/`getUnionType`/etc. publishing the index and the caller
+/// invoking `WipNamespaceType.finish`. Single-threaded Sema never observes
+/// it; with parallel Sema another thread that dedups to `.existing` may, and
+/// must spin via `awaitNamespaceTypeFinished` before reading namespace/name.
+pub const wip_namespace_sentinel: u32 = std.math.maxInt(u32);
+/// Written by `WipNamespaceType.cancel`/`WipEnumType.cancel` so a thread spinning
+/// in `awaitNamespaceTypeFinished` exits without mistaking the slot for a real
+/// `NamespaceIndex` (0 is a valid index).
+pub const cancelled_namespace_sentinel: u32 = std.math.maxInt(u32) - 1;
+
+/// Spins while `ty`'s namespace slot holds `wip_namespace_sentinel`. Returns at
+/// once when `namespaceTypeNamespaceExtraIndex` yields null for `ty`.
+pub fn awaitNamespaceTypeFinished(ip: *const InternPool, ty: Index) void {
+    const slot_index = ip.namespaceTypeNamespaceExtraIndex(ty) orelse return;
+    const owner_tid = ty.unwrap(ip).tid;
+    // The shared view is fetched afresh on every pass: the owning tid may
+    // realloc its extra array between `getStructType` and `finish`, which
+    // would leave a cached slot pointer dangling at the old buffer.
+    while (true) : (std.atomic.spinLoopHint()) {
+        const extra_view = ip.getLocalShared(owner_tid).extra.acquire().view();
+        const slot: *const u32 = &extra_view.items(.@"0")[slot_index];
+        if (@atomicLoad(u32, slot, .acquire) != wip_namespace_sentinel) return;
+    }
+}
+
+/// Extra index of `ty`'s `namespace` field, or null when its tag is not one of those listed below.
+fn namespaceTypeNamespaceExtraIndex(ip: *const InternPool, ty: Index) ?u32 {
+    const item = ty.unwrap(ip).getItem(ip);
+    return item.data + @as(u32, switch (item.tag) {
+        .type_struct => std.meta.fieldIndex(Tag.TypeStruct, "namespace").?,
+        .type_struct_packed, .type_struct_packed_inits => std.meta.fieldIndex(Tag.TypeStructPacked, "namespace").?,
+        .type_union => std.meta.fieldIndex(Tag.TypeUnion, "namespace").?,
+        .type_opaque => std.meta.fieldIndex(Tag.TypeOpaque, "namespace").?,
+        .type_enum_auto => std.meta.fieldIndex(EnumAuto, "namespace").?,
+        .type_enum_explicit, .type_enum_nonexhaustive => std.meta.fieldIndex(EnumExplicit, "namespace").?,
+        else => return null,
+    });
+}
 fn getOrPutKey(
     ip: *InternPool,
     gpa: Allocator,
     tid: Zcu.PerThread.Id,
     key: Key,
 ) Allocator.Error!GetOrPutKey {
-    return ip.getOrPutKeyEnsuringAdditionalCapacity(gpa, tid, key, 0);
+    return ip.getOrPutKeyInner(gpa, tid, key, 0, false);
 }
 fn getOrPutKeyEnsuringAdditionalCapacity(
     ip: *InternPool,
@@ -7687,6 +7913,28 @@ fn getOrPutKeyEnsuringAdditionalCapacity(
     tid: Zcu.PerThread.Id,
     key: Key,
     additional_capacity: u32,
+) Allocator.Error!GetOrPutKey {
+    return ip.getOrPutKeyInner(gpa, tid, key, additional_capacity, false);
+}
+/// Like `getOrPutKeyEnsuringAdditionalCapacity` but assumes the caller already
+/// holds `shard.mutate.map.mutex` for this key's shard (via `lockShardsSorted`).
+/// The returned `.new` will not unlock on `putFinal`/`cancel`/`deinit`.
+fn getOrPutKeyPrelocked(
+    ip: *InternPool,
+    gpa: Allocator,
+    tid: Zcu.PerThread.Id,
+    key: Key,
+    additional_capacity: u32,
+) Allocator.Error!GetOrPutKey {
+    return ip.getOrPutKeyInner(gpa, tid, key, additional_capacity, true);
+}
+fn getOrPutKeyInner(
+    ip: *InternPool,
+    gpa: Allocator,
+    tid: Zcu.PerThread.Id,
+    key: Key,
+    additional_capacity: u32,
+    prelocked: bool,
 ) Allocator.Error!GetOrPutKey {
     const full_hash = key.hash64(ip);
     const hash: u32 = @truncate(full_hash >> 32);
@@ -7695,17 +7943,19 @@ fn getOrPutKeyEnsuringAdditionalCapacity(
     const Map = @TypeOf(map);
     var map_mask = map.header().mask();
     var map_index = hash;
-    while (true) : (map_index += 1) {
-        map_index &= map_mask;
-        const entry = &map.entries[map_index];
-        const index = entry.acquire();
-        if (index == .none) break;
-        if (entry.hash != hash) continue;
-        if (index.unwrap(ip).getTag(ip) == .removed) continue;
-        if (ip.indexToKey(index).eql(key, ip)) return .{ .existing = index };
+    if (!prelocked) {
+        while (true) : (map_index += 1) {
+            map_index &= map_mask;
+            const entry = &map.entries[map_index];
+            const index = entry.acquire();
+            if (index == .none) break;
+            if (entry.hash != hash) continue;
+            if (index.unwrap(ip).getTag(ip) == .removed) continue;
+            if (ip.indexToKey(index).eql(key, ip)) return .{ .existing = index };
+        }
+        shard.mutate.map.mutex.lock();
     }
-    shard.mutate.map.mutex.lock();
-    errdefer shard.mutate.map.mutex.unlock();
+    errdefer if (!prelocked) shard.mutate.map.mutex.unlock();
     if (map.entries != shard.shared.map.entries) {
         map = shard.shared.map;
         map_mask = map.header().mask();
@@ -7718,7 +7968,7 @@ fn getOrPutKeyEnsuringAdditionalCapacity(
         if (index == .none) break;
         if (entry.hash != hash) continue;
         if (ip.indexToKey(index).eql(key, ip)) {
-            defer shard.mutate.map.mutex.unlock();
+            if (!prelocked) shard.mutate.map.mutex.unlock();
             return .{ .existing = index };
         }
     }
@@ -7774,8 +8024,40 @@ fn getOrPutKeyEnsuringAdditionalCapacity(
         .tid = tid,
         .shard = shard,
         .map_index = map_index,
+        .prelocked = prelocked,
     } };
 }
+
+/// Shard index for `key`: `hash64` masked by `ip.shards.len - 1`; intended to match `getOrPutKeyInner`.
+fn keyShardIndex(ip: *const InternPool, key: Key) u32 {
+    return @intCast(key.hash64(ip) & (ip.shards.len - 1));
+}
+
+/// Lock the shard mutexes for the given keys in ascending shard-index order
+/// so concurrent multi-key inserters cannot ABBA-deadlock. The distinct shard
+/// indices are written to `out` and their count is returned. Caller must unlock
+/// each via `ip.shards[out[i]].mutate.map.mutex.unlock()` in any order.
+fn lockShardsSorted(
+    ip: *InternPool,
+    keys: []const Key,
+    out: []u32,
+) usize {
+    assert(out.len >= keys.len);
+    var locked_count: usize = 0;
+    // Record each key's shard index once; several keys may share a shard,
+    // and that shard's mutex must only be locked a single time.
+    next_key: for (keys) |key| {
+        const shard_index = ip.keyShardIndex(key);
+        for (out[0..locked_count]) |seen| if (seen == shard_index) continue :next_key;
+        out[locked_count] = shard_index;
+        locked_count += 1;
+    }
+    // Sort before locking so every caller acquires the mutexes in the same order.
+    const shard_indices = out[0..locked_count];
+    std.mem.sort(u32, shard_indices, {}, std.sort.asc(u32));
+    for (shard_indices) |shard_index| ip.shards[shard_index].mutate.map.mutex.lock();
+    return locked_count;
+}
 /// Like `getOrPutKey`, but asserts that the key already exists, and prepares to replace
 /// its shard entry with a new `Index` anyway. After finalizing this, the old index remains
 /// valid (in that `indexToKey` and similar queries will behave as before), but it will
@@ -7843,6 +8125,7 @@ pub fn get(ip: *InternPool, gpa: Allocator, tid: Zcu.PerThread.Id, key: Key) All
                 new_key.ptr_type.flags.size = .many;
                 const ptr_type_index = try ip.get(gpa, tid, new_key);
                 gop = try ip.getOrPutKey(gpa, tid, key);
+                if (gop == .existing) return gop.existing;
 
                 try items.ensureUnusedCapacity(1);
                 items.appendAssumeCapacity(.{
@@ -8099,6 +8382,7 @@ pub fn get(ip: *InternPool, gpa: Allocator, tid: Zcu.PerThread.Id, key: Key) All
                         .storage = .{ .u64 = base_index.index },
                     } });
                     gop = try ip.getOrPutKey(gpa, tid, key);
+                    if (gop == .existing) return gop.existing;
                     try items.ensureUnusedCapacity(1);
                     items.appendAssumeCapacity(.{
                         .tag = switch (ptr.base_addr) {
@@ -8522,6 +8806,7 @@ pub fn get(ip: *InternPool, gpa: Allocator, tid: Zcu.PerThread.Id, key: Key) All
                             .storage = .{ .u64 = bytes.at(0, ip) },
                         } });
                         gop = try ip.getOrPutKey(gpa, tid, key);
+                        if (gop == .existing) return gop.existing;
                         try items.ensureUnusedCapacity(1);
                         break :elem elem;
                     },
@@ -8746,7 +9031,7 @@ pub fn getUnionType(
         .padding = std.math.maxInt(u32),
         .name = undefined, // set by `finish`
         .name_nav = undefined, // set by `finish`
-        .namespace = undefined, // set by `finish`
+        .namespace = @enumFromInt(wip_namespace_sentinel), // set by `finish`
         .tag_ty = ini.enum_tag_ty,
         .zir_index = switch (ini.key) {
             inline else => |x| x.zir_index,
@@ -8831,13 +9116,27 @@ pub const WipNamespaceType = struct {
         const extra = ip.getLocalShared(wip.tid).extra.acquire();
         const extra_items = extra.view().items(.@"0");
 
-        extra_items[wip.namespace_extra_index] = @intFromEnum(namespace);
+        // Release-store so a concurrent reader spinning in
+        // `awaitNamespaceTypeFinished` sees this and the prior `setName`
+        // writes once the sentinel is replaced.
+        @atomicStore(u32, &extra_items[wip.namespace_extra_index], @intFromEnum(namespace), .release);
 
         return wip.index;
     }
 
     pub fn cancel(wip: WipNamespaceType, ip: *InternPool, tid: Zcu.PerThread.Id) void {
+        const extra = ip.getLocalShared(wip.tid).extra.acquire();
+        const extra_items = extra.view().items(.@"0");
+        // If `finish` was already called the index is published; another
+        // thread may be using it, so removal is unsafe. Leave it.
+        if (@atomicLoad(u32, &extra_items[wip.namespace_extra_index], .acquire) != wip_namespace_sentinel) return;
+        // Tombstone the item first so a concurrent thread that already holds
+        // `.existing` and is about to read this index sees `.removed` instead
+        // of a half-dead entry, then publish `cancelled_namespace_sentinel`
+        // (NOT 0, which is a real `NamespaceIndex`) so spinners in
+        // `awaitNamespaceTypeFinished` exit without dereferencing namespace 0.
         ip.remove(tid, wip.index);
+        @atomicStore(u32, &extra_items[wip.namespace_extra_index], cancelled_namespace_sentinel, .release);
     }
 
     pub const Result = union(enum) {
@@ -8932,7 +9231,7 @@ pub fn getStructType(
                 .name_nav = undefined, // set by `finish`
                 .zir_index = zir_index,
                 .fields_len = ini.fields_len,
-                .namespace = undefined, // set by `finish`
+                .namespace = @enumFromInt(wip_namespace_sentinel), // set by `finish`
                 .backing_int_ty = .none,
                 .names_map = names_map,
                 .flags = .{
@@ -8999,7 +9298,7 @@ pub fn getStructType(
         .name = undefined, // set by `finish`
         .name_nav = undefined, // set by `finish`
         .zir_index = zir_index,
-        .namespace = undefined, // set by `finish`
+        .namespace = @enumFromInt(wip_namespace_sentinel), // set by `finish`
         .fields_len = ini.fields_len,
         .size = std.math.maxInt(u32),
         .flags = .{
@@ -9459,9 +9758,19 @@ pub fn getFuncDeclIes(
         extra.mutate.len = prev_extra_len;
     }
 
-    var func_gop = try ip.getOrPutKeyEnsuringAdditionalCapacity(gpa, tid, .{
-        .func = extraFuncDecl(tid, extra.list.*, func_decl_extra_index),
-    }, 3);
+    const func_key: Key = .{ .func = extraFuncDecl(tid, extra.list.*, func_decl_extra_index) };
+    const eu_key: Key = .{ .error_union_type = .{
+        .error_set_type = error_set_type,
+        .payload_type = key.bare_return_type,
+    } };
+    const es_key: Key = .{ .inferred_error_set_type = func_index };
+    const fty_key: Key = .{ .func_type = extraFuncType(tid, extra.list.*, func_type_extra_index) };
+
+    var locked_shards: [4]u32 = undefined;
+    const n_locked = ip.lockShardsSorted(&.{ func_key, eu_key, es_key, fty_key }, &locked_shards);
+    defer for (locked_shards[0..n_locked]) |s| ip.shards[s].mutate.map.mutex.unlock();
+
+    var func_gop = try ip.getOrPutKeyPrelocked(gpa, tid, func_key, 3);
     defer func_gop.deinit();
     if (func_gop == .existing) {
         // An existing function type was found; undo the additions to our two arrays.
@@ -9480,20 +9789,13 @@ pub fn getFuncDeclIes(
         return func_gop.existing;
     }
     func_gop.putTentative(func_index);
-    var error_union_type_gop = try ip.getOrPutKeyEnsuringAdditionalCapacity(gpa, tid, .{ .error_union_type = .{
-        .error_set_type = error_set_type,
-        .payload_type = key.bare_return_type,
-    } }, 2);
+    var error_union_type_gop = try ip.getOrPutKeyPrelocked(gpa, tid, eu_key, 2);
     defer error_union_type_gop.deinit();
     error_union_type_gop.putTentative(error_union_type);
-    var error_set_type_gop = try ip.getOrPutKeyEnsuringAdditionalCapacity(gpa, tid, .{
-        .inferred_error_set_type = func_index,
-    }, 1);
+    var error_set_type_gop = try ip.getOrPutKeyPrelocked(gpa, tid, es_key, 1);
     defer error_set_type_gop.deinit();
     error_set_type_gop.putTentative(error_set_type);
-    var func_ty_gop = try ip.getOrPutKey(gpa, tid, .{
-        .func_type = extraFuncType(tid, extra.list.*, func_type_extra_index),
-    });
+    var func_ty_gop = try ip.getOrPutKeyPrelocked(gpa, tid, fty_key, 0);
     defer func_ty_gop.deinit();
     func_ty_gop.putTentative(func_ty);
 
@@ -9755,9 +10057,21 @@ pub fn getFuncInstanceIes(
         extra.mutate.len = prev_extra_len;
     }
 
-    var func_gop = try ip.getOrPutKeyEnsuringAdditionalCapacity(gpa, tid, .{
-        .func = ip.extraFuncInstance(tid, extra.list.*, func_extra_index),
-    }, 3);
+    const func_key: Key = .{ .func = ip.extraFuncInstance(tid, extra.list.*, func_extra_index) };
+    const eu_key: Key = .{ .error_union_type = .{
+        .error_set_type = error_set_type,
+        .payload_type = arg.bare_return_type,
+    } };
+    const es_key: Key = .{ .inferred_error_set_type = func_index };
+    const fty_key: Key = .{ .func_type = extraFuncType(tid, extra.list.*, func_type_extra_index) };
+
+    // Four shard mutexes are held simultaneously below; lock in sorted order
+    // so concurrent callers cannot ABBA-deadlock.
+    var locked_shards: [4]u32 = undefined;
+    const n_locked = ip.lockShardsSorted(&.{ func_key, eu_key, es_key, fty_key }, &locked_shards);
+    defer for (locked_shards[0..n_locked]) |s| ip.shards[s].mutate.map.mutex.unlock();
+
+    var func_gop = try ip.getOrPutKeyPrelocked(gpa, tid, func_key, 3);
     defer func_gop.deinit();
     if (func_gop == .existing) {
         // Hot path: undo the additions to our two arrays.
@@ -9766,20 +10080,13 @@ pub fn getFuncInstanceIes(
         return func_gop.existing;
     }
     func_gop.putTentative(func_index);
-    var error_union_type_gop = try ip.getOrPutKeyEnsuringAdditionalCapacity(gpa, tid, .{ .error_union_type = .{
-        .error_set_type = error_set_type,
-        .payload_type = arg.bare_return_type,
-    } }, 2);
+    var error_union_type_gop = try ip.getOrPutKeyPrelocked(gpa, tid, eu_key, 2);
     defer error_union_type_gop.deinit();
     error_union_type_gop.putTentative(error_union_type);
-    var error_set_type_gop = try ip.getOrPutKeyEnsuringAdditionalCapacity(gpa, tid, .{
-        .inferred_error_set_type = func_index,
-    }, 1);
+    var error_set_type_gop = try ip.getOrPutKeyPrelocked(gpa, tid, es_key, 1);
     defer error_set_type_gop.deinit();
     error_set_type_gop.putTentative(error_set_type);
-    var func_ty_gop = try ip.getOrPutKey(gpa, tid, .{
-        .func_type = extraFuncType(tid, extra.list.*, func_type_extra_index),
-    });
+    var func_ty_gop = try ip.getOrPutKeyPrelocked(gpa, tid, fty_key, 0);
     defer func_ty_gop.deinit();
     func_ty_gop.putTentative(func_ty);
     try finishFuncInstance(
@@ -9815,14 +10122,26 @@ fn finishFuncInstance(
     const nav_name = try ip.getOrPutStringFmt(gpa, tid, "{f}__anon_{d}", .{
         fn_owner_nav.name.fmt(ip), @intFromEnum(func_index),
     }, .no_embedded_nulls);
+    // The generic owner's nav is normally `.fully_resolved` by the time we
+    // reach instantiation, but under parallel Sema another thread may hold the
+    // generic's func index (via an alias nav) before the owner nav itself
+    // commits. The modifiers we need are present from `.type_resolved`
+    // onward; switch instead of unconditionally accessing `.fully_resolved`.
+    const owner_mods = switch (fn_owner_nav.status) {
+        .fully_resolved => |r| .{ r.is_const, r.alignment, r.@"linksection", r.@"addrspace" },
+        .type_resolved => |r| .{ r.is_const, r.alignment, r.@"linksection", r.@"addrspace" },
+        // The `getNav` seqlock guarantees we never see the transient prelude
+        // state; a genuine `.unresolved` cannot reach instantiation.
+        .unresolved => unreachable,
+    };
     const nav_index = try ip.createNav(gpa, tid, .{
         .name = nav_name,
         .fqn = try ip.namespacePtr(fn_namespace).internFullyQualifiedName(ip, gpa, tid, nav_name),
         .val = func_index,
-        .is_const = fn_owner_nav.status.fully_resolved.is_const,
-        .alignment = fn_owner_nav.status.fully_resolved.alignment,
-        .@"linksection" = fn_owner_nav.status.fully_resolved.@"linksection",
-        .@"addrspace" = fn_owner_nav.status.fully_resolved.@"addrspace",
+        .is_const = owner_mods[0],
+        .alignment = owner_mods[1],
+        .@"linksection" = owner_mods[2],
+        .@"addrspace" = owner_mods[3],
     });
 
     // Populate the owner_nav field which was left undefined until now.
@@ -9884,7 +10203,7 @@ pub const WipEnumType = struct {
         const extra = ip.getLocalShared(wip.tid).extra.acquire();
         const extra_items = extra.view().items(.@"0");
 
-        extra_items[wip.namespace_extra_index] = @intFromEnum(namespace);
+        @atomicStore(u32, &extra_items[wip.namespace_extra_index], @intFromEnum(namespace), .release);
     }
 
     pub fn setTagTy(wip: WipEnumType, ip: *InternPool, tag_ty: Index) void {
@@ -9926,7 +10245,12 @@ pub const WipEnumType = struct {
     }
 
     pub fn cancel(wip: WipEnumType, ip: *InternPool, tid: Zcu.PerThread.Id) void {
+        const extra = ip.getLocalShared(wip.tid).extra.acquire();
+        const extra_items = extra.view().items(.@"0");
+        if (@atomicLoad(u32, &extra_items[wip.namespace_extra_index], .acquire) != wip_namespace_sentinel) return;
+        // Slot still holds the WIP sentinel: remove the item, then mark it cancelled. See `WipNamespaceType.cancel`.
         ip.remove(tid, wip.index);
+        @atomicStore(u32, &extra_items[wip.namespace_extra_index], cancelled_namespace_sentinel, .release);
     }
 
     pub const Result = union(enum) {
@@ -9993,7 +10317,7 @@ pub fn getEnumType(
                     inline .declared, .declared_owned_captures => |d| @intCast(d.captures.len),
                     .reified => std.math.maxInt(u32),
                 },
-                .namespace = undefined, // set by `prepare`
+                .namespace = @enumFromInt(wip_namespace_sentinel), // set by `prepare`
                 .int_tag_type = .none, // set by `prepare`
                 .fields_len = ini.fields_len,
                 .names_map = names_map,
@@ -10052,7 +10376,7 @@ pub fn getEnumType(
                     inline .declared, .declared_owned_captures => |d| @intCast(d.captures.len),
                     .reified => std.math.maxInt(u32),
                 },
-                .namespace = undefined, // set by `prepare`
+                .namespace = @enumFromInt(wip_namespace_sentinel), // set by `prepare`
                 .int_tag_type = .none, // set by `prepare`
                 .fields_len = ini.fields_len,
                 .names_map = names_map,
@@ -10263,7 +10587,7 @@ pub fn getOpaqueType(
     const extra_index = addExtraAssumeCapacity(extra, Tag.TypeOpaque{
         .name = undefined, // set by `finish`
         .name_nav = undefined, // set by `finish`
-        .namespace = undefined, // set by `finish`
+        .namespace = @enumFromInt(wip_namespace_sentinel), // set by `finish`
         .zir_index = switch (ini.key) {
             inline else => |x| x.zir_index,
         },
@@ -10353,7 +10677,11 @@ pub fn remove(ip: *InternPool, tid: Zcu.PerThread.Id, index: Index) void {
     if (unwrapped_index.tid == tid) {
         const items_len = &ip.getLocal(unwrapped_index.tid).mutate.items.len;
         if (unwrapped_index.index == items_len.* - 1) {
-            // Happy case - we can just drop the item without affecting any other indices.
+            // Tombstone first so a stale shard-map entry that still points
+            // here is skipped by the `.removed` check in the lockless probe
+            // even after this slot is reused by the next append.
+            const items = ip.getLocalShared(unwrapped_index.tid).items.acquire().view();
+            @atomicStore(Tag, &items.items(.tag)[unwrapped_index.index], .removed, .release);
             items_len.* -= 1;
             return;
         }
@@ -11488,7 +11816,49 @@ pub fn dumpGenericInstancesFallible(ip: *const InternPool, allocator: Allocator)
 pub fn getNav(ip: *const InternPool, index: Nav.Index) Nav {
     const unwrapped = index.unwrap(ip);
     const navs = ip.getLocalShared(unwrapped.tid).navs.acquire();
-    return navs.view().get(unwrapped.index).unpack();
+    const view = navs.view();
+    const bits_ptr = &view.items(.bits)[unwrapped.index];
+    const tov_ptr = &view.items(.type_or_val)[unwrapped.index];
+    const ls_ptr = &view.items(.@"linksection")[unwrapped.index];
+    var repr = view.get(unwrapped.index);
+    // Seqlock-style read: `bits` is release-stored last by `resolveNavType`
+    // and `resolveNavValue`, and the .type_resolved -> .fully_resolved step
+    // rewrites `type_or_val` from a type to a value. Re-read `bits` after the
+    // payload and retry until stable. The payload loads are `.acquire` so the
+    // `b2` load cannot be reordered ahead of them; otherwise an old status
+    // could still be paired with a new payload (or vice versa).
+    while (true) {
+        const b1 = @atomicLoad(Nav.Repr.Bits, bits_ptr, .acquire);
+        repr.type_or_val = @atomicLoad(InternPool.Index, tov_ptr, .acquire);
+        repr.@"linksection" = @atomicLoad(OptionalNullTerminatedString, ls_ptr, .acquire);
+        const b2 = @atomicLoad(Nav.Repr.Bits, bits_ptr, .acquire);
+        if (!b1.writing and @as(u16, @bitCast(b1)) == @as(u16, @bitCast(b2))) {
+            repr.bits = b2;
+            return repr.unpack();
+        }
+        std.atomic.spinLoopHint();
+    }
+}
+
+/// Sums the Nav count of every per-thread local. Intended for diagnostics.
+pub fn navCount(ip: *const InternPool) u32 {
+    var nav_total: u32 = 0;
+    for (ip.locals) |*per_thread| nav_total += per_thread.mutate.navs.len;
+    return nav_total;
+}
+
+/// Maps a flat ordinal in [0, navCount()) to its Nav.Index. Diagnostics only; not stable across updates.
+pub fn navIndexFromOrdinal(ip: *const InternPool, ordinal: u32) Nav.Index {
+    var remaining = ordinal;
+    for (ip.locals, 0..) |*local, tid| {
+        const navs_len = local.mutate.navs.len;
+        if (remaining >= navs_len) {
+            remaining -= navs_len;
+            continue;
+        }
+        return @enumFromInt(@shlExact(@as(u32, @intCast(tid)), ip.tid_shift_32) | remaining);
+    }
+    unreachable;
 }
 
 pub fn namespacePtr(ip: *InternPool, namespace_index: NamespaceIndex) *Zcu.Namespace {
@@ -11611,10 +11981,10 @@ pub fn resolveNavType(
     const unwrapped = nav.unwrap(ip);
 
     const local = ip.getLocal(unwrapped.tid);
-    local.mutate.extra.mutex.lock();
-    defer local.mutate.extra.mutex.unlock();
+    local.mutate.navs.mutex.lock();
+    defer local.mutate.navs.mutex.unlock();
 
-    const navs = local.shared.navs.view();
+    const navs = local.shared.navs.acquire().view();
 
     const nav_analysis_namespace = navs.items(.analysis_namespace);
     const nav_analysis_zir_index = navs.items(.analysis_zir_index);
@@ -11653,10 +12023,10 @@ pub fn resolveNavValue(
     const unwrapped = nav.unwrap(ip);
 
     const local = ip.getLocal(unwrapped.tid);
-    local.mutate.extra.mutex.lock();
-    defer local.mutate.extra.mutex.unlock();
+    local.mutate.navs.mutex.lock();
+    defer local.mutate.navs.mutex.unlock();
 
-    const navs = local.shared.navs.view();
+    const navs = local.shared.navs.acquire().view();
 
     const nav_analysis_namespace = navs.items(.analysis_namespace);
     const nav_analysis_zir_index = navs.items(.analysis_zir_index);
@@ -11667,10 +12037,18 @@ pub fn resolveNavValue(
     assert(nav_analysis_namespace[unwrapped.index] != .none);
     assert(nav_analysis_zir_index[unwrapped.index] != .none);
 
+    // Seqlock-style write paired with the loop in `getNav`: invalidate `bits`
+    // before mutating `type_or_val` so a concurrent reader cannot pair the old
+    // `.type_resolved` status with the new value (it will see b1 != b2 and
+    // retry). The other `bits` fields are unchanged by this prelude store.
+    var bits = nav_bits[unwrapped.index];
+    bits.writing = true;
+    @atomicStore(Nav.Repr.Bits, &nav_bits[unwrapped.index], bits, .release);
+
     @atomicStore(InternPool.Index, &nav_vals[unwrapped.index], resolved.val, .release);
     @atomicStore(OptionalNullTerminatedString, &nav_linksections[unwrapped.index], resolved.@"linksection", .release);
 
-    var bits = nav_bits[unwrapped.index];
+    bits.writing = false;
     bits.status = .fully_resolved;
     bits.is_const = resolved.is_const;
     bits.alignment = resolved.alignment;
@@ -12740,8 +13118,16 @@ pub fn addFieldName(
     const strings = extra_items[names_start..][0..field_index];
     const adapter: NullTerminatedString.Adapter = .{ .strings = @ptrCast(strings) };
     const gop = map.getOrPutAssumeCapacityAdapted(name, adapter);
-    if (gop.found_existing) return @intCast(gop.index);
-    extra_items[names_start + field_index] = @intFromEnum(name);
+    if (gop.found_existing) {
+        // Re-store the slot so a re-run of `structFields` after a retry has
+        // its own happens-before edge to this slot via the subsequent
+        // flags release-store; otherwise a reader synchronising on the
+        // re-run's flags may not transitively see the original writer's
+        // slot write under weak memory.
+        @atomicStore(u32, &extra_items[names_start + gop.index], @intFromEnum(name), .release);
+        return @intCast(gop.index);
+    }
+    @atomicStore(u32, &extra_items[names_start + field_index], @intFromEnum(name), .release);
     return null;
 }
 
diff --git a/src/Sema.zig b/src/Sema.zig
index db99141202b9..fc8bfe47deee 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -117,11 +117,14 @@ comptime_allocs: std.ArrayListUnmanaged(ComptimeAlloc) = .empty,
 /// these are flushed to `Zcu.single_exports` or `Zcu.multi_exports`.
 exports: std.ArrayListUnmanaged(Zcu.Export) = .empty,
 
-/// All references registered so far by this `Sema`. This is a temporary duplicate
-/// of data stored in `Zcu.all_references`. It exists to avoid adding references to
-/// a given `AnalUnit` multiple times.
-references: std.AutoArrayHashMapUnmanaged(AnalUnit, void) = .empty,
-type_references: std.AutoArrayHashMapUnmanaged(InternPool.Index, void) = .empty,
+/// All references registered so far by this `Sema`. Buffered locally and
+/// flushed to `Zcu.all_references` in `flushExports` so reference recording
+/// is lock-free during the body.
+references: std.AutoArrayHashMapUnmanaged(AnalUnit, struct {
+    src: LazySrcLoc,
+    inline_frame: Zcu.InlineReferenceFrame.Index.Optional,
+}) = .empty,
+type_references: std.AutoArrayHashMapUnmanaged(InternPool.Index, LazySrcLoc) = .empty,
 
 /// All dependencies registered so far by this `Sema`. This is a temporary duplicate
 /// of the main dependency data. It exists to avoid adding dependencies to a given
@@ -2649,14 +2652,12 @@ pub fn failWithOwnedErrorMsg(sema: *Sema, block: ?*Block, err_msg: *Zcu.ErrorMsg
 
     err_msg.reference_trace_root = sema.owner.toOptional();
 
-    const gop = try zcu.failed_analysis.getOrPut(gpa, sema.owner);
-    if (gop.found_existing) {
+    if (try zcu.failedAnalysisGetOrPut(sema.owner, err_msg)) {
         // If there are multiple errors for the same Decl, prefer the first one added.
         sema.err = null;
         err_msg.destroy(gpa);
     } else {
         sema.err = err_msg;
-        gop.value_ptr.* = err_msg;
     }
 
     return error.AnalysisFail;
@@ -2881,7 +2882,7 @@ fn getCaptures(sema: *Sema, block: *Block, type_src: LazySrcLoc, extra_index: us
         const zir_name_slice = sema.code.nullTerminatedString(zir_name);
         capture.* = switch (zir_capture.unwrap()) {
             .nested => |parent_idx| parent_captures.get(ip)[parent_idx],
-            .instruction_load => |ptr_inst| InternPool.CaptureValue.wrap(capture: {
+            .instruction_load => |ptr_inst| InternPool.CaptureValue.wrap(ip, capture: {
                 const ptr_ref = try sema.resolveInst(ptr_inst.toRef());
                 const ptr_val = try sema.resolveValue(ptr_ref) orelse {
                     break :capture .{ .runtime = sema.typeOf(ptr_ref).childType(zcu).toIntern() };
@@ -2897,7 +2898,7 @@ fn getCaptures(sema: *Sema, block: *Block, type_src: LazySrcLoc, extra_index: us
                 }
                 break :capture .{ .@"comptime" = loaded_val.toIntern() };
             }),
-            .instruction => |inst| InternPool.CaptureValue.wrap(capture: {
+            .instruction => |inst| InternPool.CaptureValue.wrap(ip, capture: {
                 const air_ref = try sema.resolveInst(inst.toRef());
                 if (try sema.resolveValueResolveLazy(air_ref)) |val| {
                     if (val.canMutateComptimeVarState(zcu)) {
@@ -2916,7 +2917,7 @@ fn getCaptures(sema: *Sema, block: *Block, type_src: LazySrcLoc, extra_index: us
                     .no_embedded_nulls,
                 );
                 const nav = try sema.lookupIdentifier(block, decl_name);
-                break :capture InternPool.CaptureValue.wrap(.{ .nav_val = nav });
+                break :capture InternPool.CaptureValue.wrap(ip, .{ .nav_val = nav });
             },
             .decl_ref => |str| capture: {
                 const decl_name = try ip.getOrPutString(
@@ -2926,7 +2927,7 @@ fn getCaptures(sema: *Sema, block: *Block, type_src: LazySrcLoc, extra_index: us
                     .no_embedded_nulls,
                 );
                 const nav = try sema.lookupIdentifier(block, decl_name);
-                break :capture InternPool.CaptureValue.wrap(.{ .nav_ref = nav });
+                break :capture InternPool.CaptureValue.wrap(ip, .{ .nav_ref = nav });
             },
         };
     }
@@ -3000,6 +3001,7 @@ fn zirStructDecl(
     };
     const wip_ty = switch (try ip.getStructType(gpa, pt.tid, struct_init, false)) {
         .existing => |ty| {
+            zcu.awaitNamespaceTypeFinished(ty);
             const new_ty = try pt.ensureTypeUpToDate(ty);
 
             // Make sure we update the namespace if the declaration is re-analyzed, to pick
@@ -3012,7 +3014,8 @@ fn zirStructDecl(
         },
         .wip => |wip| wip,
     };
-    errdefer wip_ty.cancel(ip, pt.tid);
+    var published = false;
+    errdefer if (!published) wip_ty.cancel(ip, pt.tid);
 
     const type_name = try sema.createTypeName(
         block,
@@ -3027,9 +3030,11 @@ fn zirStructDecl(
         .parent = block.namespace.toOptional(),
         .owner_type = wip_ty.index,
         .file_scope = block.getFileScopeIndex(zcu),
-        .generation = zcu.generation,
+        .generation = zcu.generation -% 1,
     });
-    errdefer pt.destroyNamespace(new_namespace_index);
+    errdefer if (!published) pt.destroyNamespace(new_namespace_index);
+    _ = wip_ty.finish(ip, new_namespace_index);
+    published = true;
 
     if (pt.zcu.comp.incremental) {
         try pt.addDependency(.wrap(.{ .type = wip_ty.index }), .{ .src_hash = tracked_inst });
@@ -3049,7 +3054,7 @@ fn zirStructDecl(
     try sema.declareDependency(.{ .interned = wip_ty.index });
     try sema.addTypeReferenceEntry(src, wip_ty.index);
     if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip_ty.index);
-    return Air.internedToRef(wip_ty.finish(ip, new_namespace_index));
+    return Air.internedToRef(wip_ty.index);
 }
 
 pub fn createTypeName(
@@ -3241,6 +3246,7 @@ fn zirEnumDecl(
     };
     const wip_ty = switch (try ip.getEnumType(gpa, pt.tid, enum_init, false)) {
         .existing => |ty| {
+            zcu.awaitNamespaceTypeFinished(ty);
             const new_ty = try pt.ensureTypeUpToDate(ty);
 
             // Make sure we update the namespace if the declaration is re-analyzed, to pick
@@ -3254,7 +3260,7 @@ fn zirEnumDecl(
             // `ensureTypeUpToDate` has resolved the new type if necessary.
             // We just need to check for resolution failures.
             const ty_unit: AnalUnit = .wrap(.{ .type = new_ty });
-            if (zcu.failed_analysis.contains(ty_unit) or zcu.transitive_failed_analysis.contains(ty_unit)) {
+            if (zcu.anyAnalysisFailed(ty_unit)) {
                 return error.AnalysisFail;
             }
 
@@ -3282,7 +3288,7 @@ fn zirEnumDecl(
         .parent = block.namespace.toOptional(),
         .owner_type = wip_ty.index,
         .file_scope = block.getFileScopeIndex(zcu),
-        .generation = zcu.generation,
+        .generation = zcu.generation -% 1,
     });
     errdefer if (!done) pt.destroyNamespace(new_namespace_index);
 
@@ -3294,10 +3300,18 @@ fn zirEnumDecl(
     // We've finished the initial construction of this type, and are about to perform analysis.
     // Set the namespace appropriately, and don't destroy anything on failure.
     if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip_ty.index);
-    wip_ty.prepare(ip, new_namespace_index);
     done = true;
 
+    // Defer clearing the namespace sentinel until after field names are
+    // populated so a parallel `enumFieldIndex` reader that awaits on it sees
+    // the full names_map. Same-thread recursion (a field value referencing an
+    // earlier field) is unblocked via `tls_wip_types`. On error past
+    // `done=true`, prepare() still publishes the namespace so readers don't
+    // spin forever; the partial enum is reported via `failed_analysis`.
+    zcu.wipTypeEnter(wip_ty.index) catch {};
     {
+        defer zcu.wipTypeExit(wip_ty.index);
+        defer wip_ty.prepare(ip, new_namespace_index);
         const tracked_unit = zcu.trackUnitSema(type_name.name.toSlice(ip), null);
         defer tracked_unit.end(zcu);
         try Sema.resolveDeclaredEnum(
@@ -3398,6 +3412,7 @@ fn zirUnionDecl(
     };
     const wip_ty = switch (try ip.getUnionType(gpa, pt.tid, union_init, false)) {
         .existing => |ty| {
+            zcu.awaitNamespaceTypeFinished(ty);
             const new_ty = try pt.ensureTypeUpToDate(ty);
 
             // Make sure we update the namespace if the declaration is re-analyzed, to pick
@@ -3410,7 +3425,8 @@ fn zirUnionDecl(
         },
         .wip => |wip| wip,
     };
-    errdefer wip_ty.cancel(ip, pt.tid);
+    var published = false;
+    errdefer if (!published) wip_ty.cancel(ip, pt.tid);
 
     const type_name = try sema.createTypeName(
         block,
@@ -3425,9 +3441,11 @@ fn zirUnionDecl(
         .parent = block.namespace.toOptional(),
         .owner_type = wip_ty.index,
         .file_scope = block.getFileScopeIndex(zcu),
-        .generation = zcu.generation,
+        .generation = zcu.generation -% 1,
     });
-    errdefer pt.destroyNamespace(new_namespace_index);
+    errdefer if (!published) pt.destroyNamespace(new_namespace_index);
+    _ = wip_ty.finish(ip, new_namespace_index);
+    published = true;
 
     if (pt.zcu.comp.incremental) {
         try pt.addDependency(.wrap(.{ .type = wip_ty.index }), .{ .src_hash = tracked_inst });
@@ -3447,7 +3465,7 @@ fn zirUnionDecl(
     try sema.declareDependency(.{ .interned = wip_ty.index });
     try sema.addTypeReferenceEntry(src, wip_ty.index);
     if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip_ty.index);
-    return Air.internedToRef(wip_ty.finish(ip, new_namespace_index));
+    return Air.internedToRef(wip_ty.index);
 }
 
 fn zirOpaqueDecl(
@@ -3494,6 +3512,7 @@ fn zirOpaqueDecl(
     };
     const wip_ty = switch (try ip.getOpaqueType(gpa, pt.tid, opaque_init)) {
         .existing => |ty| {
+            zcu.awaitNamespaceTypeFinished(ty);
             // Make sure we update the namespace if the declaration is re-analyzed, to pick
             // up on e.g. changed comptime decls.
             try pt.ensureNamespaceUpToDate(Type.fromInterned(ty).getNamespaceIndex(zcu));
@@ -3504,7 +3523,8 @@ fn zirOpaqueDecl(
         },
         .wip => |wip| wip,
     };
-    errdefer wip_ty.cancel(ip, pt.tid);
+    var published = false;
+    errdefer if (!published) wip_ty.cancel(ip, pt.tid);
 
     const type_name = try sema.createTypeName(
         block,
@@ -3519,9 +3539,11 @@ fn zirOpaqueDecl(
         .parent = block.namespace.toOptional(),
         .owner_type = wip_ty.index,
         .file_scope = block.getFileScopeIndex(zcu),
-        .generation = zcu.generation,
+        .generation = zcu.generation -% 1,
     });
-    errdefer pt.destroyNamespace(new_namespace_index);
+    errdefer if (!published) pt.destroyNamespace(new_namespace_index);
+    _ = wip_ty.finish(ip, new_namespace_index);
+    published = true;
 
     const decls = sema.code.bodySlice(extra_index, decls_len);
     try pt.scanNamespace(new_namespace_index, decls);
@@ -3535,7 +3557,7 @@ fn zirOpaqueDecl(
     }
     try sema.addTypeReferenceEntry(src, wip_ty.index);
     if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip_ty.index);
-    return Air.internedToRef(wip_ty.finish(ip, new_namespace_index));
+    return Air.internedToRef(wip_ty.index);
 }
 
 fn zirErrorSetDecl(
@@ -5532,6 +5554,8 @@ fn zirCompileLog(
 
     const line_data = try zcu.intern_pool.getOrPutString(gpa, pt.tid, aw.written(), .no_embedded_nulls);
 
+    zcu.compile_log_mutex.lock();
+    defer zcu.compile_log_mutex.unlock();
     const line_idx: Zcu.CompileLogLine.Index = if (zcu.free_compile_log_lines.pop()) |idx| idx: {
         zcu.compile_log_lines.items[@intFromEnum(idx)] = .{
             .next = .none,
@@ -5752,6 +5776,8 @@ fn zirCImport(sema: *Sema, parent_block: *Block, inst: Zir.Inst.Index) CompileEr
             if (!comp.config.link_libc)
                 try sema.errNote(src, msg, "libc headers not available; compilation does not link against libc", .{});
 
+            zcu.cimport_errors_mutex.lock();
+            defer zcu.cimport_errors_mutex.unlock();
             const gop = try zcu.cimport_errors.getOrPut(gpa, sema.owner);
             if (!gop.found_existing) {
                 gop.value_ptr.* = c_import_res.errors;
@@ -7546,6 +7572,8 @@ fn analyzeCall(
             } else resolved_ret_ty;
 
             // We now need to actually create the function instance.
+            // `getFuncInstanceIes` takes its 4 shard mutexes in sorted order
+            // via `lockShardsSorted`, so concurrent calls cannot ABBA-deadlock.
             const func_instance = try ip.getFuncInstance(gpa, pt.tid, .{
                 .param_types = runtime_param_tys.items,
                 .noalias_bits = noalias_bits,
@@ -7889,12 +7917,25 @@ fn analyzeCall(
     if (block.isComptime()) {
         const result_val = (try sema.resolveValue(maybe_opv)).?;
         if (want_memoize and sema.allow_memoize and !result_val.canMutateComptimeVarState(zcu)) {
-            _ = try pt.intern(.{ .memoized_call = .{
+            const memo_idx = try pt.intern(.{ .memoized_call = .{
                 .func = func_val.?.toIntern(),
                 .arg_values = memoized_arg_values,
                 .result = result_val.toIntern(),
                 .branch_count = sema.branch_count - old_branch_count,
             } });
+            // Under parallel Sema two threads can compute the same comptime
+            // call and observe different intermediate state (e.g. partial
+            // `@typeInfo` fields). The intern key ignores `result`, so the
+            // first thread to publish wins and the loser would otherwise
+            // silently adopt a wrong cached result. Detect the divergence and
+            // retry instead of returning the cached value.
+            if (zcu.parallel_sema) {
+                const cached = ip.indexToKey(memo_idx).memoized_call.result;
+                if (cached != result_val.toIntern()) {
+                    Zcu.tls_retry_loop = sema.owner;
+                    return error.AnalysisFail;
+                }
+            }
         }
     }
 
@@ -13839,7 +13880,9 @@ fn zirEmbedFile(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A
     };
     try sema.declareDependency(.{ .embed_file = ef_idx });
 
-    const result = ef_idx.get(zcu);
+    zcu.semaLock();
+    const result = ef_idx.get(zcu).*;
+    zcu.semaUnlock();
     if (result.val == .none) {
         return sema.fail(block, operand_src, "unable to open '{s}': {s}", .{ name, @errorName(result.err.?) });
     }
@@ -16824,7 +16867,7 @@ fn zirThis(
             // `ensureTypeUpToDate` has resolved the new type if necessary.
             // We just need to check for resolution failures.
             const ty_unit: AnalUnit = .wrap(.{ .type = new_ty });
-            if (zcu.failed_analysis.contains(ty_unit) or zcu.transitive_failed_analysis.contains(ty_unit)) {
+            if (zcu.anyAnalysisFailed(ty_unit)) {
                 return error.AnalysisFail;
             }
             return Air.internedToRef(new_ty);
@@ -16842,7 +16885,7 @@ fn zirClosureGet(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.InstDat
     const src_node: std.zig.Ast.Node.Offset = @enumFromInt(@as(i32, @bitCast(extended.operand)));
     const src = block.nodeOffset(src_node);
 
-    const capture_ty = switch (captures.get(ip)[extended.small].unwrap()) {
+    const capture_ty = switch (captures.get(ip)[extended.small].unwrap(ip)) {
         .@"comptime" => |index| return Air.internedToRef(index),
         .runtime => |index| index,
         .nav_val => |nav| return sema.analyzeNavVal(block, src, nav),
@@ -17673,6 +17716,50 @@ fn zirTypeInfo(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Ai
 
                 try ty.resolveStructFieldInits(pt);
 
+                // Under parallel Sema another thread may have left names/types
+                // partially populated when our `haveFieldInits`/`haveFieldTypes`
+                // fast-path observed a stale "done" flag. A wrong `@typeInfo`
+                // result here is permanently memoized via the comptime-call
+                // cache, so guard: if any slot is `.none`, retry the outer job.
+                if (zcu.parallel_sema) for (0..struct_type.field_types.len) |gi| {
+                    if (struct_type.fieldName(ip, gi) == .none or
+                        struct_type.field_types.get(ip)[gi] == .none)
+                    {
+                        Zcu.tls_retry_loop = .wrap(.{ .type = ty.toIntern() });
+                        return error.AnalysisFail;
+                    }
+                };
+
+                if (zcu.parallel_sema and std.process.hasNonEmptyEnvVarConstant("ZIG_TRACE_TYPEINFO")) diag: {
+                    var any_none_name: bool = false;
+                    var any_none_ty: bool = false;
+                    const n = struct_type.field_types.len;
+                    var idx: u32 = 0;
+                    while (idx < n) : (idx += 1) {
+                        if (struct_type.fieldName(ip, idx) == .none) any_none_name = true;
+                        if (struct_type.field_types.get(ip)[idx] == .none) any_none_ty = true;
+                    }
+                    if (!any_none_name and !any_none_ty) break :diag;
+                    std.debug.print(
+                        "TYPEINFO-RACE struct={f} fields_len={d} haveFieldTypes={} haveFieldInits={} names_map_count={d}\n",
+                        .{
+                            struct_type.name.fmt(ip),
+                            n,
+                            struct_type.haveFieldTypes(ip),
+                            struct_type.haveFieldInits(ip),
+                            if (struct_type.names_map.unwrap()) |m| m.get(ip).count() else 0,
+                        },
+                    );
+                    idx = 0;
+                    while (idx < n) : (idx += 1) {
+                        const nm = struct_type.fieldName(ip, idx);
+                        const ft = struct_type.field_types.get(ip)[idx];
+                        if (nm == .none or ft == .none)
+                            std.debug.print("  field[{d}] name={} type={}\n", .{ idx, nm, ft });
+                    }
+                    std.debug.dumpCurrentStackTrace(null);
+                }
+
                 for (struct_field_vals, 0..) |*field_val, field_index| {
                     const field_name = if (struct_type.fieldName(ip, field_index).unwrap()) |field_name|
                         field_name
@@ -19784,9 +19871,9 @@ fn structInitAnon(
                 if (init_val != .none) struct_type.setFieldComptime(ip, field_idx);
             }
 
-            @memcpy(struct_type.field_types.get(ip), types);
+            struct_type.setFieldTypesAll(ip, types);
             if (any_values) {
-                @memcpy(struct_type.field_inits.get(ip), values);
+                struct_type.setFieldInitsAll(ip, values);
             }
 
             const new_namespace_index = try pt.createNamespace(.{
@@ -19795,6 +19882,7 @@ fn structInitAnon(
                 .file_scope = block.getFileScopeIndex(zcu),
                 .generation = zcu.generation,
             });
+            _ = wip.finish(ip, new_namespace_index);
             try zcu.comp.queueJob(.{ .resolve_type_fully = wip.index });
             codegen_type: {
                 if (zcu.comp.config.use_llvm) break :codegen_type;
@@ -19803,10 +19891,14 @@ fn structInitAnon(
                 try zcu.comp.queueJob(.{ .link_type = wip.index });
             }
             if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip.index);
-            break :ty wip.finish(ip, new_namespace_index);
+            break :ty wip.index;
         },
         .existing => |ty| ty,
     };
+    // Under parallel Sema, `.existing` may dedup to a type whose `.wip` owner
+    // has not yet run `setFieldTypesAll`/`finish`; spin so `aggregateValue`'s
+    // canonicalization sees populated field_types.
+    zcu.awaitNamespaceTypeFinished(struct_ty);
     try sema.declareDependency(.{ .interned = struct_ty });
     try sema.addTypeReferenceEntry(src, struct_ty);
 
@@ -20805,7 +20897,8 @@ fn zirReify(
                 },
                 .wip => |wip| wip,
             };
-            errdefer wip_ty.cancel(ip, pt.tid);
+            var published = false;
+            errdefer if (!published) wip_ty.cancel(ip, pt.tid);
 
             const type_name = try sema.createTypeName(
                 block,
@@ -20822,10 +20915,12 @@ fn zirReify(
                 .file_scope = block.getFileScopeIndex(zcu),
                 .generation = zcu.generation,
             });
+            _ = wip_ty.finish(ip, new_namespace_index);
+            published = true;
 
             try sema.addTypeReferenceEntry(src, wip_ty.index);
             if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip_ty.index);
-            return Air.internedToRef(wip_ty.finish(ip, new_namespace_index));
+            return Air.internedToRef(wip_ty.index);
         },
         .@"union" => {
             const struct_type = ip.loadStructType(ip.typeOf(union_val.val));
@@ -21042,10 +21137,13 @@ fn reifyEnum(
     try sema.declareDependency(.{ .interned = wip_ty.index });
     try sema.addTypeReferenceEntry(src, wip_ty.index);
     if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip_ty.index);
-    wip_ty.prepare(ip, new_namespace_index);
     wip_ty.setTagTy(ip, tag_ty.toIntern());
     done = true;
 
+    zcu.wipTypeEnter(wip_ty.index) catch {};
+    defer zcu.wipTypeExit(wip_ty.index);
+    defer wip_ty.prepare(ip, new_namespace_index);
+
     for (0..fields_len) |field_idx| {
         const field_info = try fields_val.elemValue(pt, field_idx);
 
@@ -21180,7 +21278,8 @@ fn reifyUnion(
             return Air.internedToRef(ty);
         },
     };
-    errdefer wip_ty.cancel(ip, pt.tid);
+    var published = false;
+    errdefer if (!published) wip_ty.cancel(ip, pt.tid);
 
     const type_name = try sema.createTypeName(
         block,
@@ -21226,12 +21325,12 @@ fn reifyUnion(
             }
             seen_tags.set(enum_index);
 
-            loaded_union.field_types.get(ip)[field_idx] = field_type_val.toIntern();
+            loaded_union.setFieldType(ip, field_idx, field_type_val.toIntern());
             const byte_align = try field_alignment_val.toUnsignedIntSema(pt);
             if (layout == .@"packed") {
                 if (byte_align != 0) return sema.fail(block, src, "alignment of a packed union field must be set to 0", .{});
             } else {
-                loaded_union.field_aligns.get(ip)[field_idx] = try sema.validateAlign(block, src, byte_align);
+                loaded_union.setFieldAlign(ip, field_idx, try sema.validateAlign(block, src, byte_align));
             }
         }
 
@@ -21270,12 +21369,12 @@ fn reifyUnion(
                 return sema.fail(block, src, "duplicate union field {f}", .{field_name.fmt(ip)});
             }
 
-            loaded_union.field_types.get(ip)[field_idx] = field_type_val.toIntern();
+            loaded_union.setFieldType(ip, field_idx, field_type_val.toIntern());
             const byte_align = try field_alignment_val.toUnsignedIntSema(pt);
             if (layout == .@"packed") {
                 if (byte_align != 0) return sema.fail(block, src, "alignment of a packed union field must be set to 0", .{});
             } else {
-                loaded_union.field_aligns.get(ip)[field_idx] = try sema.validateAlign(block, src, byte_align);
+                loaded_union.setFieldAlign(ip, field_idx, try sema.validateAlign(block, src, byte_align));
             }
         }
 
@@ -21327,6 +21426,8 @@ fn reifyUnion(
         .file_scope = block.getFileScopeIndex(zcu),
         .generation = zcu.generation,
     });
+    _ = wip_ty.finish(ip, new_namespace_index);
+    published = true;
 
     try zcu.comp.queueJob(.{ .resolve_type_fully = wip_ty.index });
     codegen_type: {
@@ -21339,7 +21440,7 @@ fn reifyUnion(
     try sema.declareDependency(.{ .interned = wip_ty.index });
     try sema.addTypeReferenceEntry(src, wip_ty.index);
     if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip_ty.index);
-    return Air.internedToRef(wip_ty.finish(ip, new_namespace_index));
+    return Air.internedToRef(wip_ty.index);
 }
 
 fn reifyTuple(
@@ -21529,7 +21630,8 @@ fn reifyStruct(
             return Air.internedToRef(ty);
         },
     };
-    errdefer wip_ty.cancel(ip, pt.tid);
+    var published = false;
+    errdefer if (!published) wip_ty.cancel(ip, pt.tid);
 
     const type_name = try sema.createTypeName(
         block,
@@ -21566,7 +21668,7 @@ fn reifyStruct(
         if (layout == .@"packed") {
             if (byte_align != 0) return sema.fail(block, src, "alignment of a packed struct field must be set to 0", .{});
         } else {
-            struct_type.field_aligns.get(ip)[field_idx] = try sema.validateAlign(block, src, byte_align);
+            struct_type.setFieldAlign(ip, field_idx, try sema.validateAlign(block, src, byte_align));
         }
 
         const field_is_comptime = field_is_comptime_val.toBool();
@@ -21593,9 +21695,9 @@ fn reifyStruct(
             return sema.fail(block, src, "comptime field without default initialization value", .{});
         }
 
-        struct_type.field_types.get(ip)[field_idx] = field_type_val.toIntern();
+        struct_type.setFieldType(ip, field_idx, field_type_val.toIntern());
         if (field_default != .none) {
-            struct_type.field_inits.get(ip)[field_idx] = field_default;
+            struct_type.setFieldInit(ip, field_idx, field_default);
         }
 
         if (field_ty.zigTypeTag(zcu) == .@"opaque") {
@@ -21670,6 +21772,8 @@ fn reifyStruct(
         .file_scope = block.getFileScopeIndex(zcu),
         .generation = zcu.generation,
     });
+    _ = wip_ty.finish(ip, new_namespace_index);
+    published = true;
 
     try zcu.comp.queueJob(.{ .resolve_type_fully = wip_ty.index });
     codegen_type: {
@@ -21682,7 +21786,7 @@ fn reifyStruct(
     try sema.declareDependency(.{ .interned = wip_ty.index });
     try sema.addTypeReferenceEntry(src, wip_ty.index);
     if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip_ty.index);
-    return Air.internedToRef(wip_ty.finish(ip, new_namespace_index));
+    return Air.internedToRef(wip_ty.index);
 }
 
 fn resolveVaListRef(sema: *Sema, block: *Block, src: LazySrcLoc, zir_ref: Zir.Inst.Ref) CompileError!Air.Inst.Ref {
@@ -31219,12 +31323,15 @@ fn addReferenceEntry(
     if (!zcu.comp.incremental and zcu.comp.reference_trace == 0) return;
     const gop = try sema.references.getOrPut(sema.gpa, referenced_unit);
     if (gop.found_existing) return;
-    try zcu.addUnitReference(sema.owner, referenced_unit, src, inline_frame: {
-        const block = opt_block orelse break :inline_frame .none;
-        const inlining = block.inlining orelse break :inline_frame .none;
-        const frame = try inlining.refFrame(zcu);
-        break :inline_frame frame.toOptional();
-    });
+    gop.value_ptr.* = .{
+        .src = src,
+        .inline_frame = inline_frame: {
+            const block = opt_block orelse break :inline_frame .none;
+            const inlining = block.inlining orelse break :inline_frame .none;
+            const frame = try inlining.refFrame(zcu);
+            break :inline_frame frame.toOptional();
+        },
+    };
 }
 
 pub fn addTypeReferenceEntry(
@@ -31236,7 +31343,7 @@ pub fn addTypeReferenceEntry(
     if (!zcu.comp.incremental and zcu.comp.reference_trace == 0) return;
     const gop = try sema.type_references.getOrPut(sema.gpa, referenced_type);
     if (gop.found_existing) return;
-    try zcu.addTypeReference(sema.owner, referenced_type, src);
+    gop.value_ptr.* = src;
 }
 
 fn ensureMemoizedStateResolved(sema: *Sema, src: LazySrcLoc, stage: InternPool.MemoizedStateStage) SemaError!void {
@@ -31246,12 +31353,34 @@ fn ensureMemoizedStateResolved(sema: *Sema, src: LazySrcLoc, stage: InternPool.M
     try sema.addReferenceEntry(null, src, unit);
     try sema.declareDependency(.{ .memoized_state = stage });
 
-    if (pt.zcu.analysis_in_progress.contains(unit)) {
+    if (pt.zcu.semaAipContains(unit)) {
         return sema.failWithOwnedErrorMsg(null, try sema.errMsg(src, "dependency loop detected", .{}));
     }
     try pt.ensureMemoizedStateUpToDate(stage);
 }
 
+/// Under parallel Sema, a wip-flag self-dependency may be order-dependent
+/// (an intermediate nav would have short-circuited the chain in serial).
+/// Bumps this type's retry count; while under the cap it sets `tls_retry_loop`
+/// itself and returns true, and the caller must then propagate AnalysisFail.
+fn maybeRetryTypeLoop(sema: *Sema, ty: Type) Allocator.Error!bool {
+    const zcu = sema.pt.zcu;
+    const unit: AnalUnit = .wrap(.{ .type = ty.toIntern() });
+    zcu.sema_retry_mutex.lock(); // guards `sema_retry_counts` for the rest of this call
+    defer zcu.sema_retry_mutex.unlock();
+    const gop = try zcu.sema_retry_counts.getOrPut(zcu.gpa, unit);
+    if (!gop.found_existing) gop.value_ptr.* = 0; // first sighting: start from zero
+    gop.value_ptr.* +|= 1; // saturating add, so the count cannot wrap back under the cap
+    // Fixed cap of 128 (a literal, not computed from the thread count; the same
+    // literal is used by `ensureNavResolved`): with the ensure*UpToDate sema_lock
+    // gate dropped, more workers bump this counter; 8 was tuned for the serial path.
+    if (gop.value_ptr.* < 128) {
+        Zcu.tls_retry_loop = unit;
+        return true;
+    }
+    return false; // cap reached: caller falls through to its "depends on itself" error
+}
+
 pub fn ensureNavResolved(sema: *Sema, block: *Block, src: LazySrcLoc, nav_index: InternPool.Nav.Index, kind: enum { type, fully }) CompileError!void {
     const pt = sema.pt;
     const zcu = pt.zcu;
@@ -31277,7 +31406,24 @@ pub fn ensureNavResolved(sema: *Sema, block: *Block, src: LazySrcLoc, nav_index:
     });
     try sema.addReferenceEntry(block, src, anal_unit);
 
-    if (zcu.analysis_in_progress.contains(anal_unit)) {
+    if (zcu.semaAipContains(anal_unit)) {
+        if (zcu.parallel_sema) {
+            // The loop may be order-dependent: another thread could resolve
+            // an intermediate nav and break the chain. Signal the outermost
+            // analyze_func to release-and-requeue instead of marking failed.
+            zcu.sema_retry_mutex.lock();
+            const tries: u8 = blk: {
+                const gop = zcu.sema_retry_counts.getOrPut(zcu.gpa, anal_unit) catch break :blk 255;
+                if (!gop.found_existing) gop.value_ptr.* = 0;
+                gop.value_ptr.* +|= 1;
+                break :blk gop.value_ptr.*;
+            };
+            zcu.sema_retry_mutex.unlock();
+            if (tries < 128) {
+                Zcu.tls_retry_loop = anal_unit;
+                return error.AnalysisFail;
+            }
+        }
         return sema.failWithOwnedErrorMsg(null, try sema.errMsg(.{
             .base_node_inst = nav.analysis.?.zir_index,
             .offset = LazySrcLoc.Offset.nodeOffset(.zero),
@@ -31317,7 +31463,12 @@ fn analyzeNavRefInner(sema: *Sema, block: *Block, src: LazySrcLoc, orig_nav_inde
     const zcu = pt.zcu;
     const ip = &zcu.intern_pool;
 
-    try sema.ensureNavResolved(block, src, orig_nav_index, if (is_ref) .type else .fully);
+    // Under parallel Sema another thread may transition the nav from
+    // .type_resolved → .fully_resolved between our ensureNavResolved and
+    // the getNav below, leaving a torn read in `isExternOrFn`. Fully
+    // resolving here serialises via claimOrWait so the subsequent getNav
+    // observes a stable status.
+    try sema.ensureNavResolved(block, src, orig_nav_index, if (is_ref and !zcu.parallel_sema) .type else .fully);
 
     const nav_index = nav: {
         if (ip.getNav(orig_nav_index).isExternOrFn(ip)) {
@@ -31396,7 +31547,21 @@ fn maybeQueueFuncBodyAnalysis(sema: *Sema, block: *Block, src: LazySrcLoc, nav_i
     try sema.ensureNavResolved(block, src, nav_index, .type);
     const nav_ty: Type = .fromInterned(ip.getNav(nav_index).typeOf(ip));
     if (nav_ty.zigTypeTag(zcu) != .@"fn") return;
-    if (!try nav_ty.fnHasRuntimeBitsSema(pt)) return;
+    // `fnHasRuntimeBitsSema` may trigger a yield-and-requeue retry; if so,
+    // the caller has already committed (export registered / pointer taken),
+    // so swallow it and conservatively queue analysis rather than dropping
+    // the func body and crashing in `processExports`.
+    const has_rt_bits = nav_ty.fnHasRuntimeBitsSema(pt) catch |err| switch (err) {
+        error.OutOfMemory => return error.OutOfMemory,
+        error.AnalysisFail => blk: {
+            if (Zcu.tls_retry_loop != null) {
+                Zcu.tls_retry_loop = null;
+                break :blk true;
+            }
+            return error.AnalysisFail;
+        },
+    };
+    if (!has_rt_bits) return;
 
     try sema.ensureNavResolved(block, src, nav_index, .fully);
     const nav_val = zcu.navValue(nav_index);
@@ -34315,7 +34480,9 @@ pub fn resolveStructAlignment(
     assert(sema.owner.unwrap().type == ty);
 
     assert(struct_type.layout != .@"packed");
-    assert(struct_type.flagsUnordered(ip).alignment == .none);
+    // The unlocked caller-side check is a TOCTOU under parallel Sema; another
+    // thread may have completed this resolution while we waited on sema_lock.
+    if (struct_type.flagsUnordered(ip).alignment != .none) return;
 
     const ptr_align = Alignment.fromByteUnits(@divExact(target.ptrBitWidth(), 8));
 
@@ -34377,6 +34544,7 @@ pub fn resolveStructLayout(sema: *Sema, ty: Type) SemaError!void {
     }
 
     if (struct_type.setLayoutWip(ip)) {
+        if (zcu.parallel_sema) if (try sema.maybeRetryTypeLoop(ty)) return error.AnalysisFail;
         const msg = try sema.errMsg(
             ty.srcLoc(zcu),
             "struct '{f}' depends on itself",
@@ -34394,7 +34562,7 @@ pub fn resolveStructLayout(sema: *Sema, ty: Type) SemaError!void {
     for (aligns, sizes, 0..) |*field_align, *field_size, i| {
         const field_ty: Type = .fromInterned(struct_type.field_types.get(ip)[i]);
         if (struct_type.fieldIsComptime(ip, i) or try field_ty.comptimeOnlySema(pt)) {
-            struct_type.offsets.get(ip)[i] = 0;
+            struct_type.setOffset(ip, @intCast(i), 0);
             field_size.* = 0;
             field_align.* = .none;
             continue;
@@ -34437,7 +34605,10 @@ pub fn resolveStructLayout(sema: *Sema, ty: Type) SemaError!void {
     }
 
     if (struct_type.hasReorderedFields()) {
-        const runtime_order = struct_type.runtime_order.get(ip);
+        const RuntimeOrder = InternPool.LoadedStructType.RuntimeOrder;
+        // Compute into a temp to avoid cross-tid writes to InternPool extra
+        // without `extra.mutex` while calling back into Sema.
+        const runtime_order = try sema.arena.alloc(RuntimeOrder, struct_type.field_types.len);
 
         for (runtime_order, 0..) |*ro, i| {
             const field_ty: Type = .fromInterned(struct_type.field_types.get(ip)[i]);
@@ -34448,8 +34619,6 @@ pub fn resolveStructLayout(sema: *Sema, ty: Type) SemaError!void {
             }
         }
 
-        const RuntimeOrder = InternPool.LoadedStructType.RuntimeOrder;
-
         const AlignSortContext = struct {
             aligns: []const Alignment,
 
@@ -34485,16 +34654,20 @@ pub fn resolveStructLayout(sema: *Sema, ty: Type) SemaError!void {
                 .aligns = aligns,
             }, AlignSortContext.lessThan);
         }
+        struct_type.setRuntimeOrderAll(ip, runtime_order);
     }
 
-    // Calculate size, alignment, and field offsets.
-    const offsets = struct_type.offsets.get(ip);
+    // Calculate size, alignment, and field offsets in a temp, then publish
+    // under `extra.mutex` so a foreign-tid realloc cannot lose our writes.
+    const offsets = try sema.arena.alloc(u32, struct_type.field_types.len);
+    @memcpy(offsets, struct_type.offsets.get(ip));
     var it = struct_type.iterateRuntimeOrder(ip);
     var offset: u64 = 0;
     while (it.next()) |i| {
         offsets[i] = @intCast(aligns[i].forward(offset));
         offset = offsets[i] + sizes[i];
     }
+    struct_type.setOffsetsAll(ip, offsets);
     const size = std.math.cast(u32, big_align.forward(offset)) orelse {
         const msg = try sema.errMsg(
             ty.srcLoc(zcu),
@@ -34504,7 +34677,15 @@ pub fn resolveStructLayout(sema: *Sema, ty: Type) SemaError!void {
         return sema.failWithOwnedErrorMsg(null, msg);
     };
     struct_type.setLayoutResolved(ip, size, big_align);
-    _ = try ty.comptimeOnlySema(pt);
+    _ = ty.comptimeOnlySema(pt) catch |err| switch (err) {
+        error.OutOfMemory => return error.OutOfMemory,
+        error.AnalysisFail => {
+            if (Zcu.tls_retry_loop != null) {
+                // layout already committed; re-queue would early-return.
+                Zcu.tls_retry_loop = null;
+            } else return error.AnalysisFail;
+        },
+    };
 }
 
 fn backingIntType(
@@ -34660,7 +34841,7 @@ pub fn resolveUnionAlignment(
 
     assert(sema.owner.unwrap().type == ty.toIntern());
 
-    assert(!union_type.haveLayout(ip));
+    if (union_type.haveLayout(ip)) return;
 
     const ptr_align = Alignment.fromByteUnits(@divExact(target.ptrBitWidth(), 8));
 
@@ -34707,6 +34888,7 @@ pub fn resolveUnionLayout(sema: *Sema, ty: Type) SemaError!void {
     switch (old_flags.status) {
         .none, .have_field_types => {},
         .field_types_wip, .layout_wip => {
+            if (pt.zcu.parallel_sema) if (try sema.maybeRetryTypeLoop(ty)) return error.AnalysisFail;
             const msg = try sema.errMsg(
                 ty.srcLoc(pt.zcu),
                 "union '{f}' depends on itself",
@@ -34802,7 +34984,17 @@ pub fn resolveUnionLayout(sema: *Sema, ty: Type) SemaError!void {
     };
     union_type.setHaveLayout(ip, casted_size, padding, alignment);
 
-    if (union_type.flagsUnordered(ip).assumed_runtime_bits and !(try ty.hasRuntimeBitsSema(pt))) {
+    const has_rt_bits = ty.hasRuntimeBitsSema(pt) catch |err| switch (err) {
+        error.OutOfMemory => return error.OutOfMemory,
+        error.AnalysisFail => blk: {
+            if (Zcu.tls_retry_loop != null) {
+                Zcu.tls_retry_loop = null;
+                break :blk true;
+            }
+            return error.AnalysisFail;
+        },
+    };
+    if (union_type.flagsUnordered(ip).assumed_runtime_bits and !has_rt_bits) {
         const msg = try sema.errMsg(
             ty.srcLoc(pt.zcu),
             "union layout depends on it having runtime bits",
@@ -34821,7 +35013,14 @@ pub fn resolveUnionLayout(sema: *Sema, ty: Type) SemaError!void {
         );
         return sema.failWithOwnedErrorMsg(null, msg);
     }
-    _ = try ty.comptimeOnlySema(pt);
+    _ = ty.comptimeOnlySema(pt) catch |err| switch (err) {
+        error.OutOfMemory => return error.OutOfMemory,
+        error.AnalysisFail => {
+            if (Zcu.tls_retry_loop != null) {
+                Zcu.tls_retry_loop = null;
+            } else return error.AnalysisFail;
+        },
+    };
 }
 
 /// Returns `error.AnalysisFail` if any of the types (recursively) failed to
@@ -34904,6 +35103,7 @@ pub fn resolveStructFieldTypes(
     if (struct_type.haveFieldTypes(ip)) return;
 
     if (struct_type.setFieldTypesWip(ip)) {
+        if (zcu.parallel_sema) if (try sema.maybeRetryTypeLoop(Type.fromInterned(ty))) return error.AnalysisFail;
         const msg = try sema.errMsg(
             Type.fromInterned(ty).srcLoc(zcu),
             "struct '{f}' depends on itself",
@@ -34937,6 +35137,7 @@ pub fn resolveStructFieldInits(sema: *Sema, ty: Type) SemaError!void {
     try sema.resolveStructLayout(ty);
 
     if (struct_type.setInitsWip(ip)) {
+        if (zcu.parallel_sema) if (try sema.maybeRetryTypeLoop(ty)) return error.AnalysisFail;
         const msg = try sema.errMsg(
             ty.srcLoc(zcu),
             "struct '{f}' depends on itself",
@@ -34967,6 +35168,7 @@ pub fn resolveUnionFieldTypes(sema: *Sema, ty: Type, union_type: InternPool.Load
     switch (union_type.flagsUnordered(ip).status) {
         .none => {},
         .field_types_wip => {
+            if (zcu.parallel_sema) if (try sema.maybeRetryTypeLoop(ty)) return error.AnalysisFail;
             const msg = try sema.errMsg(ty.srcLoc(zcu), "union '{f}' depends on itself", .{ty.fmt(pt)});
             return sema.failWithOwnedErrorMsg(null, msg);
         },
@@ -35011,7 +35213,7 @@ fn resolveInferredErrorSet(
     const resolved_ty = func.resolvedErrorSetUnordered(ip);
     if (resolved_ty != .none) return resolved_ty;
 
-    if (zcu.analysis_in_progress.contains(.wrap(.{ .func = func_index }))) {
+    if (zcu.semaAipContains(.wrap(.{ .func = func_index }))) {
         return sema.fail(block, src, "unable to resolve inferred error set", .{});
     }
 
@@ -35281,7 +35483,8 @@ fn structFields(
 
             // This string needs to outlive the ZIR code.
             const field_name = try ip.getOrPutString(gpa, pt.tid, field_name_zir, .no_embedded_nulls);
-            assert(struct_type.addFieldName(ip, field_name) == null);
+            // A previous yield-and-requeue may have populated some names already.
+            if (struct_type.addFieldName(ip, field_name)) |existing| assert(existing == field_i);
 
             if (has_align) {
                 fields[field_i].align_body_len = zir.extra[extra_index];
@@ -35299,6 +35502,13 @@ fn structFields(
     // Next we do only types and alignments, saving the inits for a second pass,
     // so that init values may depend on type layout.
 
+    // Collect into arena temps and batch-publish under one `extra.mutex` hold
+    // at the end so each field doesn't take the per-tid mutex separately.
+    const tmp_types = try sema.arena.alloc(InternPool.Index, fields.len);
+    @memset(tmp_types, .none);
+    const tmp_aligns = try sema.arena.alloc(InternPool.Alignment, fields.len);
+    @memset(tmp_aligns, .none);
+
     for (fields, 0..) |zir_field, field_i| {
         const ty_src: LazySrcLoc = .{
             .base_node_inst = struct_type.zir_index,
@@ -35315,7 +35525,7 @@ fn structFields(
             break :ty try sema.analyzeAsType(&block_scope, ty_src, ty_ref);
         };
 
-        struct_type.field_types.get(ip)[field_i] = field_ty.toIntern();
+        tmp_types[field_i] = field_ty.toIntern();
 
         if (field_ty.zigTypeTag(zcu) == .@"opaque") {
             const msg = msg: {
@@ -35374,12 +35584,21 @@ fn structFields(
                 .offset = .{ .container_field_align = @intCast(field_i) },
             };
             const field_align = try sema.analyzeAsAlign(&block_scope, align_src, align_ref);
-            struct_type.field_aligns.get(ip)[field_i] = field_align;
+            tmp_aligns[field_i] = field_align;
         }
 
         extra_index += zir_field.init_body_len;
     }
 
+    struct_type.setFieldTypesAlignsAll(ip, tmp_types, if (any_aligned) tmp_aligns else null);
+
+    // Re-store the last field type with release so the unlocked
+    // `haveFieldTypes` fast-path acquire sees all preceding name/type
+    // slot writes from this loop.
+    if (struct_type.field_types.len > 0) {
+        const types = struct_type.field_types.get(ip);
+        @atomicStore(InternPool.Index, &types[types.len - 1], types[types.len - 1], .release);
+    }
     struct_type.clearFieldTypesWip(ip);
     if (!any_inits) struct_type.setHaveFieldInits(ip);
 
@@ -35395,7 +35614,7 @@ fn structFieldInits(
     const zcu = pt.zcu;
     const ip = &zcu.intern_pool;
 
-    assert(!struct_type.haveFieldInits(ip));
+    if (struct_type.haveFieldInits(ip)) return;
 
     const namespace_index = struct_type.namespace;
     const zir = zcu.namespacePtr(namespace_index).fileScope(zcu).zir.?;
@@ -35464,6 +35683,8 @@ fn structFieldInits(
     }
 
     if (any_inits) {
+        const tmp_inits = try sema.arena.alloc(InternPool.Index, fields.len);
+        @memset(tmp_inits, .none);
         for (fields, 0..) |zir_field, field_i| {
             extra_index += zir_field.type_body_len;
             extra_index += zir_field.align_body_len;
@@ -35498,8 +35719,9 @@ fn structFieldInits(
                 const field_name = struct_type.fieldName(ip, field_i).unwrap().?;
                 return sema.failWithContainsReferenceToComptimeVar(&block_scope, init_src, field_name, "field default value", default_val);
             }
-            struct_type.field_inits.get(ip)[field_i] = default_val.toIntern();
+            tmp_inits[field_i] = default_val.toIntern();
         }
+        struct_type.setFieldInitsAll(ip, tmp_inits);
     }
 
     try sema.flushExports();
@@ -37204,22 +37426,54 @@ fn analyzeUnreachable(sema: *Sema, block: *Block, src: LazySrcLoc, safety_check:
 /// It takes the exports stored in `sema.export` and flushes them to the `Zcu`
 /// to be processed by the linker after the update.
 pub fn flushExports(sema: *Sema) !void {
-    if (sema.exports.items.len == 0) return;
-
     const zcu = sema.pt.zcu;
     const gpa = zcu.gpa;
 
+    if (sema.exports.items.len == 0 and
+        sema.references.count() == 0 and
+        sema.type_references.count() == 0) return;
+
+    {
+        var it = sema.references.iterator();
+        while (it.next()) |e|
+            try zcu.addUnitReference(sema.owner, e.key_ptr.*, e.value_ptr.src, e.value_ptr.inline_frame);
+        sema.references.clearRetainingCapacity();
+    }
+    {
+        var it = sema.type_references.iterator();
+        while (it.next()) |e|
+            try zcu.addTypeReference(sema.owner, e.key_ptr.*, e.value_ptr.*);
+        sema.type_references.clearRetainingCapacity();
+    }
+
+    if (sema.exports.items.len == 0) return;
+
+    zcu.exports_mutex.lock();
+    defer zcu.exports_mutex.unlock();
+
     // There may be existing exports. For instance, a struct may export
     // things during both field type resolution and field default resolution.
     //
     // So, pick up and delete any existing exports. This strategy performs
     // redundant work, but that's okay, because this case is exceedingly rare.
+    // Skip any existing entry that is identical to one already in
+    // `sema.exports` (re-analysis under parallel-Sema retry can re-flush the
+    // same export, which would otherwise duplicate it and trip the symbol
+    // collision check).
+    const new_len = sema.exports.items.len;
     if (zcu.single_exports.get(sema.owner)) |export_idx| {
-        try sema.exports.append(gpa, export_idx.ptr(zcu).*);
+        const e = export_idx.ptr(zcu).*;
+        for (sema.exports.items[0..new_len]) |n| {
+            if (std.meta.eql(n.exported, e.exported) and n.opts.name == e.opts.name) break;
+        } else try sema.exports.append(gpa, e);
     } else if (zcu.multi_exports.get(sema.owner)) |info| {
-        try sema.exports.appendSlice(gpa, zcu.all_exports.items[info.index..][0..info.len]);
+        for (zcu.all_exports.items[info.index..][0..info.len]) |e| {
+            for (sema.exports.items[0..new_len]) |n| {
+                if (std.meta.eql(n.exported, e.exported) and n.opts.name == e.opts.name) break;
+            } else try sema.exports.append(gpa, e);
+        }
     }
-    zcu.deleteUnitExports(sema.owner);
+    zcu.deleteUnitExportsAssumeLocked(sema.owner);
 
     // `sema.exports` is completed; store the data into the `Zcu`.
     if (sema.exports.items.len == 1) {
@@ -37239,6 +37493,7 @@ pub fn flushExports(sema: *Sema) !void {
             .len = @intCast(sema.exports.items.len),
         });
     }
+    sema.exports.clearRetainingCapacity();
 }
 
 /// Called as soon as a `declared` enum type is created.
@@ -37324,6 +37579,8 @@ pub fn resolveDeclaredEnum(
         error.ComptimeReturn => unreachable,
         error.OutOfMemory => |e| return e,
         error.AnalysisFail => {
+            zcu.failed_analysis_mutex.lock();
+            defer zcu.failed_analysis_mutex.unlock();
             if (!zcu.failed_analysis.contains(sema.owner)) {
                 try zcu.transitive_failed_analysis.put(gpa, sema.owner, {});
             }
@@ -37610,7 +37867,10 @@ pub fn analyzeMemoizedState(sema: *Sema, block: *Block, simple_src: LazySrcLoc,
 
             const prev = zcu.builtin_decl_values.get(builtin_decl);
             if (val.toIntern() != prev) {
-                zcu.builtin_decl_values.set(builtin_decl, val.toIntern());
+                // Release-store so that once the final entry of a stage becomes
+                // visible to the acquire fast-path in `PerThread.ensureMemoizedStateUpToDate`,
+                // all prior entries for that stage are visible too.
+                @atomicStore(InternPool.Index, zcu.builtin_decl_values.getPtr(builtin_decl), val.toIntern(), .release);
                 any_changed = true;
             }
         }
diff --git a/src/Type.zig b/src/Type.zig
index 3ef95b285c72..05386f95f885 100644
--- a/src/Type.zig
+++ b/src/Type.zig
@@ -443,7 +443,10 @@ pub fn hasRuntimeBits(ty: Type, zcu: *const Zcu) bool {
 
 pub fn hasRuntimeBitsSema(ty: Type, pt: Zcu.PerThread) SemaError!bool {
     return hasRuntimeBitsInner(ty, false, .sema, pt.zcu, pt.tid) catch |err| switch (err) {
-        error.NeedLazy => unreachable, // this would require a resolve strat of lazy
+        // The .sema strat should not yield NeedLazy, but under parallel-Sema races
+        // a partially-populated field type can confuse the inner switches; report
+        // AnalysisFail there, not UB. NOTE(review): `tls_retry_loop` is not set here.
+        error.NeedLazy => if (pt.zcu.parallel_sema) error.AnalysisFail else unreachable,
         else => |e| return e,
     };
 }
@@ -454,7 +457,7 @@ pub fn hasRuntimeBitsIgnoreComptime(ty: Type, zcu: *const Zcu) bool {
 
 pub fn hasRuntimeBitsIgnoreComptimeSema(ty: Type, pt: Zcu.PerThread) SemaError!bool {
     return hasRuntimeBitsInner(ty, true, .sema, pt.zcu, pt.tid) catch |err| switch (err) {
-        error.NeedLazy => unreachable, // this would require a resolve strat of lazy
+        error.NeedLazy => if (pt.zcu.parallel_sema) error.AnalysisFail else unreachable, // NeedLazy is unexpected for the .sema strat: parallel Sema reports AnalysisFail, serial keeps `unreachable`
         else => |e| return e,
     };
 }
@@ -1188,7 +1191,7 @@ fn abiAlignmentInnerErrorUnion(
                         .ty = .comptime_int_type,
                         .storage = .{ .lazy_align = ty.toIntern() },
                     } })) };
-                } else unreachable,
+                } else return error.AnalysisFail,
                 else => |e| return e,
             })) {
                 return .{ .scalar = code_align };
@@ -1236,7 +1239,7 @@ fn abiAlignmentInnerOptional(
                         .ty = .comptime_int_type,
                         .storage = .{ .lazy_align = ty.toIntern() },
                     } })) };
-                } else unreachable,
+                } else return error.AnalysisFail,
                 else => |e| return e,
             })) {
                 return .{ .scalar = .@"1" };
@@ -1793,6 +1796,37 @@ pub fn layoutIsResolved(ty: Type, zcu: *const Zcu) bool {
     };
 }
 
+/// True iff the `.normal`-strategy queries (`abiSize`, `comptimeOnly`,
+/// `hasRuntimeBits`) can be answered without hitting `unreachable`.
+/// Stronger than `layoutIsResolved`: also requires `requires_comptime` to
+/// have been decided. Recurses through optional/array/error-union/vector.
+pub fn eagerResolved(ty: Type, zcu: *const Zcu) bool {
+    const ip = &zcu.intern_pool;
+    return switch (ip.indexToKey(ty.toIntern())) {
+        .struct_type => b: {
+            const s = ip.loadStructType(ty.toIntern());
+            if (!s.haveLayout(ip)) break :b false;
+            if (s.layout == .@"packed") break :b true; // packed: layout alone decides, `requires_comptime` is not consulted
+            break :b switch (s.requiresComptime(ip)) {
+                .unknown, .wip => false, // not decided yet
+                .yes, .no => true,
+            };
+        },
+        .union_type => b: {
+            const u = ip.loadUnionType(ty.toIntern());
+            break :b u.haveLayout(ip) and switch (u.requiresComptime(ip)) {
+                .unknown, .wip => false, // not decided yet
+                .yes, .no => true,
+            };
+        },
+        .array_type => |a| if (a.lenIncludingSentinel() == 0) true else Type.fromInterned(a.child).eagerResolved(zcu),
+        .vector_type => |v| Type.fromInterned(v.child).eagerResolved(zcu),
+        .opt_type => |c| Type.fromInterned(c).eagerResolved(zcu),
+        .error_union_type => |k| Type.fromInterned(k.payload_type).eagerResolved(zcu),
+        else => true, // every other key is reported as resolved without further checks
+    };
+}
+
 pub fn isSinglePointer(ty: Type, zcu: *const Zcu) bool {
     return switch (zcu.intern_pool.indexToKey(ty.toIntern())) {
         .ptr_type => |ptr_info| ptr_info.flags.size == .one,
@@ -2679,6 +2713,18 @@ pub fn onePossibleValue(starting_type: Type, pt: Zcu.PerThread) !?Value {
 
 /// During semantic analysis, instead call `ty.comptimeOnlySema` which
 /// resolves field types rather than asserting they are already resolved.
+/// Recurse `comptimeOnlyInner` without tripping `fromInterned`/`toIntern`
+/// asserts when `child` is an unpublished `.none` slot under parallel Sema.
+fn childComptimeOnly(
+    child: InternPool.Index,
+    comptime strat: ResolveStrat,
+    zcu: strat.ZcuPtr(),
+    tid: strat.Tid(),
+) SemaError!bool {
+    if (child == .none) return false;
+    return (Type{ .ip_index = child }).comptimeOnlyInner(strat, zcu, tid);
+}
+
 pub fn comptimeOnly(ty: Type, zcu: *const Zcu) bool {
     return ty.comptimeOnlyInner(.normal, zcu, {}) catch unreachable;
 }
@@ -2696,12 +2742,18 @@ pub fn comptimeOnlyInner(
     tid: strat.Tid(),
 ) SemaError!bool {
     const ip = &zcu.intern_pool;
+    // Under parallel Sema an unpublished struct/union field-type slot can
+    // surface here (often via tail-recursion through .opt_type/.ptr_type
+    // child); the documented contract above allows a false negative.
+    // Checked on `ip_index` directly because `toIntern()` asserts != .none.
+    if (ty.ip_index == .none) return false;
     return switch (ty.toIntern()) {
         .empty_tuple_type => false,
 
         else => switch (ip.indexToKey(ty.toIntern())) {
             .int_type => false,
             .ptr_type => |ptr_type| {
+                if (ptr_type.child == .none) return false;
                 const child_ty = Type.fromInterned(ptr_type.child);
                 switch (child_ty.zigTypeTag(zcu)) {
                     .@"fn" => return !try child_ty.fnHasRuntimeBitsInner(strat, zcu, tid),
@@ -2713,10 +2765,10 @@ pub fn comptimeOnlyInner(
                 if (child == .none) return false;
                 return Type.fromInterned(child).comptimeOnlyInner(strat, zcu, tid);
             },
-            .array_type => |array_type| return Type.fromInterned(array_type.child).comptimeOnlyInner(strat, zcu, tid),
-            .vector_type => |vector_type| return Type.fromInterned(vector_type.child).comptimeOnlyInner(strat, zcu, tid),
-            .opt_type => |child| return Type.fromInterned(child).comptimeOnlyInner(strat, zcu, tid),
-            .error_union_type => |error_union_type| return Type.fromInterned(error_union_type.payload_type).comptimeOnlyInner(strat, zcu, tid),
+            .array_type => |array_type| return childComptimeOnly(array_type.child, strat, zcu, tid),
+            .vector_type => |vector_type| return childComptimeOnly(vector_type.child, strat, zcu, tid),
+            .opt_type => |child| return childComptimeOnly(child, strat, zcu, tid),
+            .error_union_type => |error_union_type| return childComptimeOnly(error_union_type.payload_type, strat, zcu, tid),
 
             .error_set_type,
             .inferred_error_set_type,
@@ -2792,7 +2844,16 @@ pub fn comptimeOnlyInner(
                                 const i: u32 = @intCast(i_usize);
                                 if (struct_type.fieldIsComptime(ip, i)) continue;
                                 const field_ty = struct_type.field_types.get(ip)[i];
-                                if (try Type.fromInterned(field_ty).comptimeOnlyInner(strat, zcu, tid)) {
+                                // Under parallel Sema, `resolveFields` may have
+                                // returned before every slot is published; an
+                                // unpublished slot is still being resolved and
+                                // we treat the answer as unknown (false-neg ok
+                                // per the contract above).
+                                if (zcu.parallel_sema and field_ty == .none) {
+                                    struct_type.setRequiresComptime(ip, .unknown);
+                                    return false;
+                                }
+                                if (try childComptimeOnly(field_ty, strat, zcu, tid)) {
                                     // Note that this does not cause the layout to
                                     // be considered resolved. Comptime-only types
                                     // still maintain a layout of their
@@ -2812,7 +2873,7 @@ pub fn comptimeOnlyInner(
             .tuple_type => |tuple| {
                 for (tuple.types.get(ip), tuple.values.get(ip)) |field_ty, val| {
                     const have_comptime_val = val != .none;
-                    if (!have_comptime_val and try Type.fromInterned(field_ty).comptimeOnlyInner(strat, zcu, tid)) return true;
+                    if (!have_comptime_val and try childComptimeOnly(field_ty, strat, zcu, tid)) return true;
                 }
                 return false;
             },
@@ -2842,7 +2903,11 @@ pub fn comptimeOnlyInner(
 
                             for (0..union_type.field_types.len) |field_idx| {
                                 const field_ty = union_type.field_types.get(ip)[field_idx];
-                                if (try Type.fromInterned(field_ty).comptimeOnlyInner(strat, zcu, tid)) {
+                                if (zcu.parallel_sema and field_ty == .none) {
+                                    union_type.setRequiresComptime(ip, .unknown);
+                                    return false;
+                                }
+                                if (try childComptimeOnly(field_ty, strat, zcu, tid)) {
                                     union_type.setRequiresComptime(ip, .yes);
                                     return true;
                                 }
@@ -2857,7 +2922,7 @@ pub fn comptimeOnlyInner(
 
             .opaque_type => false,
 
-            .enum_type => return Type.fromInterned(ip.loadEnumType(ty.toIntern()).tag_ty).comptimeOnlyInner(strat, zcu, tid),
+            .enum_type => return childComptimeOnly(ip.loadEnumType(ty.toIntern()).tag_ty, strat, zcu, tid),
 
             // values, not types
             .undef,
@@ -2943,6 +3008,7 @@ pub fn getNamespaceIndex(ty: Type, zcu: *Zcu) InternPool.NamespaceIndex {
 /// Returns null if the type has no namespace.
 pub fn getNamespace(ty: Type, zcu: *Zcu) InternPool.OptionalNamespaceIndex {
     const ip = &zcu.intern_pool;
+    zcu.awaitNamespaceTypeFinished(ty.toIntern());
     return switch (ip.indexToKey(ty.toIntern())) {
         .opaque_type => ip.loadOpaqueType(ty.toIntern()).namespace.toOptional(),
         .struct_type => ip.loadStructType(ty.toIntern()).namespace.toOptional(),
@@ -3074,6 +3140,9 @@ pub fn enumFieldName(ty: Type, field_index: usize, zcu: *const Zcu) InternPool.N
 
 pub fn enumFieldIndex(ty: Type, field_name: InternPool.NullTerminatedString, zcu: *const Zcu) ?u32 {
     const ip = &zcu.intern_pool;
+    // The `.existing` dedup may return an enum whose `WipEnumType` owner is
+    // still populating names; spin until prepare() so the lookup sees them.
+    Zcu.awaitNamespaceTypeFinishedConst(zcu, ty.toIntern());
     const enum_type = ip.loadEnumType(ty.toIntern());
     return enum_type.nameIndex(ip, field_name);
 }
@@ -3083,6 +3152,7 @@ pub fn enumFieldIndex(ty: Type, field_name: InternPool.NullTerminatedString, zcu
 /// declaration order, or `null` if `enum_tag` does not match any field.
 pub fn enumTagFieldIndex(ty: Type, enum_tag: Value, zcu: *const Zcu) ?u32 {
     const ip = &zcu.intern_pool;
+    Zcu.awaitNamespaceTypeFinishedConst(zcu, ty.toIntern());
     const enum_type = ip.loadEnumType(ty.toIntern());
     const int_tag = switch (ip.indexToKey(enum_tag.toIntern())) {
         .int => enum_tag.toIntern(),
@@ -3585,10 +3655,16 @@ pub fn resolveLayout(ty: Type, pt: Zcu.PerThread) SemaError!void {
                 const field_ty = Type.fromInterned(tuple_type.types.get(ip)[i]);
                 try field_ty.resolveLayout(pt);
             },
-            .struct_type => return ty.resolveStructInner(pt, .layout),
+            .struct_type => {
+                if (ip.loadStructType(ty.toIntern()).haveLayout(ip)) return;
+                return ty.resolveStructInner(pt, .layout);
+            },
             else => unreachable,
         },
-        .@"union" => return ty.resolveUnionInner(pt, .layout),
+        .@"union" => {
+            if (ip.loadUnionType(ty.toIntern()).haveLayout(ip)) return;
+            return ty.resolveUnionInner(pt, .layout);
+        },
         .array => {
             if (ty.arrayLenIncludingSentinel(zcu) == 0) return;
             const elem_ty = ty.childType(zcu);
@@ -3706,9 +3782,15 @@ pub fn resolveFields(ty: Type, pt: Zcu.PerThread) SemaError!void {
             .type_struct,
             .type_struct_packed,
             .type_struct_packed_inits,
-            => return ty.resolveStructInner(pt, .fields),
+            => {
+                if (ip.loadStructType(ty_ip).haveFieldTypes(ip)) return;
+                return ty.resolveStructInner(pt, .fields);
+            },
 
-            .type_union => return ty.resolveUnionInner(pt, .fields),
+            .type_union => {
+                if (ip.loadUnionType(ty_ip).haveFieldTypes(ip)) return;
+                return ty.resolveUnionInner(pt, .fields);
+            },
 
             else => {},
         },
@@ -3758,24 +3840,38 @@ pub fn resolveFully(ty: Type, pt: Zcu.PerThread) SemaError!void {
                 const field_ty = Type.fromInterned(tuple_type.types.get(ip)[i]);
                 try field_ty.resolveFully(pt);
             },
-            .struct_type => return ty.resolveStructInner(pt, .full),
+            .struct_type => {
+                const s = ip.loadStructType(ty.toIntern());
+                if (s.layout != .@"packed" and s.flagsUnordered(ip).fully_resolved) return;
+                if (s.layout == .@"packed" and s.haveLayout(ip)) return;
+                return ty.resolveStructInner(pt, .full);
+            },
             else => unreachable,
         },
-        .@"union" => return ty.resolveUnionInner(pt, .full),
+        .@"union" => {
+            if (ip.loadUnionType(ty.toIntern()).flagsUnordered(ip).status == .fully_resolved) return;
+            return ty.resolveUnionInner(pt, .full);
+        },
     }
 }
 
 pub fn resolveStructFieldInits(ty: Type, pt: Zcu.PerThread) SemaError!void {
-    // TODO: stop calling this for tuples!
-    _ = pt.zcu.typeToStruct(ty) orelse return;
+    const ip = &pt.zcu.intern_pool;
+    const s = pt.zcu.typeToStruct(ty) orelse return;
+    if (s.haveFieldInits(ip)) return;
     return ty.resolveStructInner(pt, .inits);
 }
 
 pub fn resolveStructAlignment(ty: Type, pt: Zcu.PerThread) SemaError!void {
+    const ip = &pt.zcu.intern_pool;
+    const s = ip.loadStructType(ty.toIntern());
+    if (s.layout != .@"packed" and s.flagsUnordered(ip).alignment != .none) return;
     return ty.resolveStructInner(pt, .alignment);
 }
 
 pub fn resolveUnionAlignment(ty: Type, pt: Zcu.PerThread) SemaError!void {
+    const ip = &pt.zcu.intern_pool;
+    if (ip.loadUnionType(ty.toIntern()).flagsUnordered(ip).alignment != .none) return;
     return ty.resolveUnionInner(pt, .alignment);
 }
 
@@ -3788,13 +3884,46 @@ fn resolveStructInner(
     const zcu = pt.zcu;
     const gpa = zcu.gpa;
 
+    zcu.awaitNamespaceTypeFinished(ty.toIntern());
+
+    const ip = &zcu.intern_pool;
     const struct_obj = zcu.typeToStruct(ty).?;
     const owner: InternPool.AnalUnit = .wrap(.{ .type = ty.toIntern() });
 
-    if (zcu.failed_analysis.contains(owner) or zcu.transitive_failed_analysis.contains(owner)) {
-        return error.AnalysisFail;
+    // Under parallel Sema, gate per-type so only one thread runs this body for
+    // a given type at a time. Cross-thread waits use the claimOrWait condvar
+    // (not retry-requeue), and the wip-flags inside `Sema.resolveStruct*`
+    // revert to their original role of detecting same-thread recursion.
+    var owns_claim = false;
+    claim: while (true) {
+        // Fast-path before locking: most calls hit an already-resolved stage.
+        if (switch (resolution) {
+            .fields => struct_obj.haveFieldTypes(ip),
+            .inits => struct_obj.haveFieldInits(ip),
+            .alignment => struct_obj.layout != .@"packed" and struct_obj.flagsUnordered(ip).alignment != .none,
+            .layout => struct_obj.haveLayout(ip),
+            .full => switch (struct_obj.layout) {
+                .@"packed" => struct_obj.haveLayout(ip),
+                .auto, .@"extern" => struct_obj.flagsUnordered(ip).fully_resolved,
+            },
+        }) return;
+        switch (try zcu.claimOrWait(owner)) {
+            .claimed => {
+                owns_claim = true;
+                break :claim;
+            },
+            .recursed => break :claim,
+            .done => {
+                if (zcu.anyAnalysisFailed(owner)) return error.AnalysisFail;
+                continue :claim;
+            },
+        }
     }
+    defer if (owns_claim) zcu.releaseClaim(owner);
 
+    zcu.semaLock();
+    defer zcu.semaUnlock();
+    if (zcu.anyAnalysisFailed(owner)) return error.AnalysisFail;
     if (zcu.comp.debugIncremental()) {
         const info = try zcu.incremental_debug_state.getUnitInfo(gpa, owner);
         info.last_update_gen = zcu.generation;
@@ -3829,6 +3958,9 @@ fn resolveStructInner(
         .full => sema.resolveStructFully(ty),
     }) catch |err| switch (err) {
         error.AnalysisFail => {
+            if (Zcu.tls_retry_loop != null) return error.AnalysisFail;
+            zcu.failed_analysis_mutex.lock();
+            defer zcu.failed_analysis_mutex.unlock();
             if (!zcu.failed_analysis.contains(owner)) {
                 try zcu.transitive_failed_analysis.put(gpa, owner, {});
             }
@@ -3847,13 +3979,38 @@ fn resolveUnionInner(
     const zcu = pt.zcu;
     const gpa = zcu.gpa;
 
+    zcu.awaitNamespaceTypeFinished(ty.toIntern());
+
+    const ip = &zcu.intern_pool;
     const union_obj = zcu.typeToUnion(ty).?;
     const owner: InternPool.AnalUnit = .wrap(.{ .type = ty.toIntern() });
 
-    if (zcu.failed_analysis.contains(owner) or zcu.transitive_failed_analysis.contains(owner)) {
-        return error.AnalysisFail;
+    var owns_claim = false;
+    claim: while (true) {
+        const flags = union_obj.flagsUnordered(ip);
+        if (switch (resolution) {
+            .fields => flags.status.haveFieldTypes(),
+            .alignment => flags.alignment != .none,
+            .layout => flags.status.haveLayout(),
+            .full => flags.status == .fully_resolved,
+        }) return;
+        switch (try zcu.claimOrWait(owner)) {
+            .claimed => {
+                owns_claim = true;
+                break :claim;
+            },
+            .recursed => break :claim,
+            .done => {
+                if (zcu.anyAnalysisFailed(owner)) return error.AnalysisFail;
+                continue :claim;
+            },
+        }
     }
+    defer if (owns_claim) zcu.releaseClaim(owner);
 
+    zcu.semaLock();
+    defer zcu.semaUnlock();
+    if (zcu.anyAnalysisFailed(owner)) return error.AnalysisFail;
     if (zcu.comp.debugIncremental()) {
         const info = try zcu.incremental_debug_state.getUnitInfo(gpa, owner);
         info.last_update_gen = zcu.generation;
@@ -3887,6 +4044,9 @@ fn resolveUnionInner(
         .full => sema.resolveUnionFully(ty),
     }) catch |err| switch (err) {
         error.AnalysisFail => {
+            if (Zcu.tls_retry_loop != null) return error.AnalysisFail;
+            zcu.failed_analysis_mutex.lock();
+            defer zcu.failed_analysis_mutex.unlock();
             if (!zcu.failed_analysis.contains(owner)) {
                 try zcu.transitive_failed_analysis.put(gpa, owner, {});
             }
diff --git a/src/Zcu.zig b/src/Zcu.zig
index d26417c8f70b..c000e5000df6 100644
--- a/src/Zcu.zig
+++ b/src/Zcu.zig
@@ -38,6 +38,7 @@ const Alignment = InternPool.Alignment;
 const AnalUnit = InternPool.AnalUnit;
 const BuiltinFn = std.zig.BuiltinFn;
 const LlvmObject = @import("codegen/llvm.zig").Object;
+const LlvmPartitionSet = @import("codegen/llvm.zig").PartitionSet;
 const dev = @import("dev.zig");
 const Zoir = std.zig.Zoir;
 const ZonGen = std.zig.ZonGen;
@@ -57,9 +58,9 @@ comptime {
 /// General-purpose allocator. Used for both temporary and long-term storage.
 gpa: Allocator,
 comp: *Compilation,
-/// If the ZCU is emitting an LLVM object (i.e. we are using the LLVM backend), then this is the
-/// `LlvmObject` we are emitting to.
-llvm_object: ?LlvmObject.Ptr,
+/// If the ZCU is emitting via the LLVM backend, this is the set of partitioned LLVM `Object`
+/// builders we are emitting to. Phase 1: always a single-element set.
+llvm_object: ?LlvmPartitionSet.Ptr,
 
 /// Pointer to externally managed resource.
 root_mod: *Package.Module,
@@ -69,6 +70,61 @@ main_mod: *Package.Module,
 std_mod: *Package.Module,
 sema_prog_node: std.Progress.Node = .none,
 codegen_prog_node: std.Progress.Node = .none,
+/// Protects all non-InternPool Zcu maps that Sema reads/writes (failed_analysis,
+/// analysis_in_progress, exports, outdated, etc.) when analyze_func runs on
+/// worker threads. Recursive so an ensure* call can lock at entry, recurse into
+/// other ensure* calls (no-op re-lock), and unlock at exit; the only carve-out
+/// is the heavy AIR generation in `analyzeFnBody`, around which the owning
+/// worker explicitly fully releases via `semaRelease`/`semaReacquire`.
+sema_lock: std.Thread.Mutex = .{},
+sema_lock_owner: std.atomic.Value(std.Thread.Id) = .init(no_sema_owner),
+sema_lock_depth: u32 = 0,
+/// Signalled whenever a claim in `unit_claims` is released.
+sema_claim_cond: std.Thread.Condition = .{},
+/// AnalUnits currently being analysed by some worker; value is the owning OS thread id.
+/// Guarded by `unit_claims_mutex`. A worker that finds an entry here for a unit
+/// it needs waits on `sema_claim_cond` until the entry is removed.
+unit_claims: std.AutoHashMapUnmanaged(AnalUnit, std.Thread.Id) = .empty,
+/// Tracks which unit each thread is currently waiting on, for deadlock
+/// detection in `claimOrWait`. Guarded by `unit_claims_mutex`.
+claim_waits: std.AutoHashMapUnmanaged(std.Thread.Id, AnalUnit) = .empty,
+/// Per-unit retry count for order-dependent dependency loops, to avoid
+/// livelock on a true source-level cycle. Guarded by `sema_retry_mutex`.
+sema_retry_counts: std.AutoHashMapUnmanaged(AnalUnit, u8) = .empty,
+sema_pending_jobs: std.atomic.Value(u32) = .init(0),
+/// Guards `inline_reference_frames` / `free_inline_reference_frames` so that
+/// the very hot `Inlining.refFrame` path does not contend on `sema_lock`.
+inline_ref_mutex: std.Thread.Mutex = .{},
+/// Guards `unit_claims` / `claim_waits` so `claimOrWait` does not contend on
+/// `sema_lock` (the entry-lock at ensureFuncBodyUpToDate was the hottest
+/// contention site at 12.4 s × 15 684 stalls).
+unit_claims_mutex: std.Thread.Mutex = .{},
+/// Guards `failed_analysis` + `transitive_failed_analysis`.
+failed_analysis_mutex: std.Thread.Mutex = .{},
+/// Guards `reference_table` / `all_references` / `free_references` and the
+/// `type_reference_table` / `all_type_references` / `free_type_references`.
+references_mutex: std.Thread.Mutex = .{},
+/// Guards `single_exports` / `multi_exports` / `all_exports` / `free_exports`
+/// / `failed_exports`.
+exports_mutex: std.Thread.Mutex = .{},
+/// Guards `nav_val_analysis_queued` so `ensureNavValAnalysisQueued` does not
+/// contend on `sema_lock`.
+nav_queued_mutex: std.Thread.Mutex = .{},
+/// Guards `outdated` / `potentially_outdated` / `outdated_ready`. Under
+/// non-incremental these are only touched for comptime units (scanDecl marks
+/// fresh ones, ensureComptimeUnitUpToDate consumes), so this mutex is rarely
+/// contended.
+outdated_mutex: std.Thread.Mutex = .{},
+/// Guards `test_functions`.
+test_functions_mutex: std.Thread.Mutex = .{},
+/// Guards `cimport_errors`.
+cimport_errors_mutex: std.Thread.Mutex = .{},
+/// Guards `sema_retry_counts`.
+sema_retry_mutex: std.Thread.Mutex = .{},
+/// Guards `compile_logs` + `compile_log_lines` + `free_compile_log_lines`.
+compile_log_mutex: std.Thread.Mutex = .{},
+/// True while parallel Sema is enabled for this update.
+parallel_sema: bool = false,
 /// The number of codegen jobs which are pending or in-progress. Whichever thread drops this value
 /// to 0 is responsible for ending `codegen_prog_node`. While semantic analysis is happening, this
 /// value bottoms out at 1 instead of 0, to ensure that it can only drop to 0 after analysis is
@@ -1113,6 +1169,34 @@ pub const File = struct {
         };
     }
 
+    /// Returns a stable key string used to assign this file to an LLVM codegen
+    /// shard: the owning module's fully-qualified name, a '/', then `sub_file_path`
+    /// with '\\' rewritten to '/'. The key is written into `buf` and silently
+    /// truncated to `buf.len`; it is empty when the file has no owning module.
+    pub fn shardKey(file: File, buf: []u8) []const u8 {
+        const mod = file.mod orelse return buf[0..0];
+        const mod_name = mod.fully_qualified_name;
+        var w: usize = @min(mod_name.len, buf.len);
+        @memcpy(buf[0..w], mod_name[0..w]);
+        if (w < buf.len) {
+            buf[w] = '/';
+            w += 1;
+        }
+        for (file.sub_file_path) |c| {
+            if (w >= buf.len) break;
+            buf[w] = if (c == '\\') '/' else c;
+            w += 1;
+        }
+        return buf[0..w];
+    }
+
+    pub fn computeShard(file: File, n: u32) u8 { // shard index for this file, in [0, min(n, 256))
+        if (n <= 1) return 0;
+        var buf: [512]u8 = undefined; // longer keys are truncated by `shardKey`
+        const key = file.shardKey(&buf);
+        return @intCast(std.hash.Wyhash.hash(0, key) % @min(n, 256)); // cap the modulus so the result always fits `u8`
+    }
+
     pub fn internFullyQualifiedName(file: File, pt: Zcu.PerThread) !InternPool.NullTerminatedString {
         const gpa = pt.zcu.gpa;
         const ip = &pt.zcu.intern_pool;
@@ -2775,6 +2859,9 @@ pub fn deinit(zcu: *Zcu) void {
         for (zcu.failed_codegen.values()) |value| value.destroy(gpa);
         for (zcu.failed_types.values()) |value| value.destroy(gpa);
         zcu.analysis_in_progress.deinit(gpa);
+        zcu.unit_claims.deinit(gpa);
+        zcu.claim_waits.deinit(gpa);
+        zcu.sema_retry_counts.deinit(gpa);
         zcu.failed_analysis.deinit(gpa);
         zcu.transitive_failed_analysis.deinit(gpa);
         zcu.failed_codegen.deinit(gpa);
@@ -3456,6 +3543,19 @@ pub fn ensureFuncBodyAnalysisQueued(zcu: *Zcu, func_index: InternPool.Index) !vo
 
     assert(func.ty == func.uncoerced_ty); // analyze the body of the original function, not a coerced one
 
+    if (zcu.parallel_sema and !zcu.comp.incremental) {
+        // Lock-free dedup via the per-func atomic `is_queued` bit instead of
+        // contending on `sema_lock` for the global `func_body_analysis_queued`
+        // set (this site is hit ~40k× and was the second-hottest contention
+        // point in the profile).
+        if (!func.trySetQueued(ip)) return;
+        try zcu.comp.queueJob(.{ .analyze_func = func_index });
+        return;
+    }
+
+    zcu.semaLock();
+    defer zcu.semaUnlock();
+
     if (zcu.func_body_analysis_queued.contains(func_index)) return;
 
     if (func.analysisUnordered(ip).is_analyzed) {
@@ -3475,6 +3575,20 @@ pub fn ensureFuncBodyAnalysisQueued(zcu: *Zcu, func_index: InternPool.Index) !vo
 pub fn ensureNavValAnalysisQueued(zcu: *Zcu, nav_id: InternPool.Nav.Index) !void {
     const ip = &zcu.intern_pool;
 
+    if (zcu.parallel_sema and !zcu.comp.incremental) {
+        if (ip.getNav(nav_id).status == .fully_resolved) return;
+        zcu.nav_queued_mutex.lock();
+        defer zcu.nav_queued_mutex.unlock();
+        if (zcu.nav_val_analysis_queued.contains(nav_id)) return;
+        try zcu.nav_val_analysis_queued.ensureUnusedCapacity(zcu.gpa, 1);
+        try zcu.comp.queueJob(.{ .analyze_comptime_unit = .wrap(.{ .nav_val = nav_id }) });
+        zcu.nav_val_analysis_queued.putAssumeCapacityNoClobber(nav_id, {});
+        return;
+    }
+
+    zcu.semaLock();
+    defer zcu.semaUnlock();
+
     if (zcu.nav_val_analysis_queued.contains(nav_id)) return;
 
     if (ip.getNav(nav_id).status == .fully_resolved) {
@@ -3507,9 +3621,173 @@ pub const ImportResult = struct {
     module: ?*Package.Module,
 };
 
+pub const no_sema_owner: std.Thread.Id = std.math.maxInt(std.Thread.Id);
+
+/// Recursive acquire of `sema_lock` if parallel Sema is active. No-op otherwise.
+pub fn semaLock(zcu: *Zcu) void {
+    if (!zcu.parallel_sema) return;
+    const me = std.Thread.getCurrentId();
+    if (zcu.sema_lock_owner.load(.acquire) == me) {
+        zcu.sema_lock_depth += 1; // already ours: just count the nested acquire
+        return;
+    }
+    zcu.sema_lock.lock();
+    zcu.sema_lock_owner.store(me, .release); // owner is recorded only once the mutex is held
+    zcu.sema_lock_depth = 1;
+}
+pub fn semaUnlock(zcu: *Zcu) void { // undoes one `semaLock`; no-op when parallel Sema is off
+    if (!zcu.parallel_sema) return;
+    zcu.sema_lock_depth -= 1;
+    if (zcu.sema_lock_depth == 0) { // outermost unlock: clear the owner, then drop the mutex
+        zcu.sema_lock_owner.store(no_sema_owner, .release);
+        zcu.sema_lock.unlock();
+    }
+}
+/// Fully release the recursive lock (returning the saved depth) so other
+/// workers can proceed during long unlocked sections. Returns 0 if not held.
+pub fn semaRelease(zcu: *Zcu) u32 {
+    if (!zcu.parallel_sema) return 0;
+    const me = std.Thread.getCurrentId();
+    if (zcu.sema_lock_owner.load(.acquire) != me) return 0; // not held by this thread: nothing to release
+    const d = zcu.sema_lock_depth; // saved so `semaReacquire` can restore the nesting level
+    zcu.sema_lock_depth = 0;
+    zcu.sema_lock_owner.store(no_sema_owner, .release);
+    zcu.sema_lock.unlock();
+    return d;
+}
+pub fn semaReacquire(zcu: *Zcu, depth: u32) void { // re-takes `sema_lock` at the depth returned by `semaRelease`
+    if (!zcu.parallel_sema or depth == 0) return; // depth 0 means `semaRelease` released nothing
+    const me = std.Thread.getCurrentId();
+    zcu.sema_lock.lock();
+    zcu.sema_lock_owner.store(me, .release);
+    zcu.sema_lock_depth = depth;
+}
+
+/// Types this thread is currently in the wip-populate phase for. The
+/// namespace sentinel for these is intentionally still set; same-thread
+/// recursion (e.g. an enum field value referencing an earlier field) must
+/// not spin on it.
+threadlocal var tls_wip_types: std.AutoArrayHashMapUnmanaged(InternPool.Index, void) = .empty;
+
+pub fn wipTypeEnter(zcu: *Zcu, ty: InternPool.Index) Allocator.Error!void {
+    if (!zcu.parallel_sema) return;
+    try tls_wip_types.put(zcu.gpa, ty, {});
+}
+pub fn wipTypeExit(zcu: *Zcu, ty: InternPool.Index) void {
+    if (!zcu.parallel_sema) return;
+    _ = tls_wip_types.swapRemove(ty);
+}
+
+pub fn awaitNamespaceTypeFinished(zcu: *Zcu, ty: InternPool.Index) void {
+    awaitNamespaceTypeFinishedConst(zcu, ty);
+}
+pub fn awaitNamespaceTypeFinishedConst(zcu: *const Zcu, ty: InternPool.Index) void {
+    if (!zcu.parallel_sema) return;
+    if (tls_wip_types.contains(ty)) return;
+    zcu.intern_pool.awaitNamespaceTypeFinished(ty);
+}
+
+/// Try to claim `unit` for analysis on behalf of the calling OS thread. Returns:
+///  - `.claimed` if the caller now owns analysis of this unit and must call
+///    `releaseClaim` when done (always the result when parallel Sema is off).
+///  - `.recursed` if this thread already owns it (dependency-loop detection is
+///    left to the caller), or if waiting would close a cycle of blocked threads.
+///  - `.done` if the claim had been released by the time we woke up; caller
+///    should re-read the unit's resolved status.
+/// Uses its own `unit_claims_mutex`; fully releases any held `sema_lock` while
+/// waiting on the shared `sema_claim_cond` and reacquires it afterwards.
+pub fn claimOrWait(zcu: *Zcu, unit: AnalUnit) Allocator.Error!enum { claimed, recursed, done } {
+    if (!zcu.parallel_sema) return .claimed;
+    const me = std.Thread.getCurrentId();
+    zcu.unit_claims_mutex.lock();
+    defer zcu.unit_claims_mutex.unlock();
+    while (true) {
+        const gop = try zcu.unit_claims.getOrPut(zcu.gpa, unit);
+        if (!gop.found_existing) {
+            gop.value_ptr.* = me;
+            return .claimed;
+        }
+        if (gop.value_ptr.* == me) return .recursed;
+        var chain_unit = unit; // follow holder -> unit it waits on -> that unit's holder ...
+        var hops: u32 = 0;
+        while (hops < 64) : (hops += 1) { // bounded walk of the wait chain
+            const holder = zcu.unit_claims.get(chain_unit) orelse break;
+            if (holder == me) return .recursed; // the chain leads back to us: waiting would deadlock
+            chain_unit = zcu.claim_waits.get(holder) orelse break;
+        }
+        // Another thread holds the claim; record our wait, fully release any
+        // held sema_lock, then sleep on the dedicated claims condvar.
+        try zcu.claim_waits.put(zcu.gpa, me, unit);
+        zcu.unit_claims_mutex.unlock();
+        const d = zcu.semaRelease();
+        zcu.unit_claims_mutex.lock();
+        // Lost-wakeup guard: holder may have released between our two locks.
+        if (zcu.unit_claims.contains(unit))
+            zcu.sema_claim_cond.wait(&zcu.unit_claims_mutex);
+        _ = zcu.claim_waits.remove(me);
+        zcu.unit_claims_mutex.unlock();
+        zcu.semaReacquire(d);
+        zcu.unit_claims_mutex.lock();
+        // If the claim is gone, its holder released it and we report `.done`;
+        // otherwise (still or again claimed) loop around and wait once more.
+        if (!zcu.unit_claims.contains(unit)) return .done;
+    }
+}
+
+pub fn releaseClaim(zcu: *Zcu, unit: AnalUnit) void { // drops the claim on `unit` (if any) and wakes every waiter
+    if (!zcu.parallel_sema) return;
+    zcu.unit_claims_mutex.lock();
+    _ = zcu.unit_claims.remove(unit);
+    zcu.unit_claims_mutex.unlock();
+    zcu.sema_claim_cond.broadcast(); // waiters re-check `unit_claims` themselves in `claimOrWait`
+}
+
+
+/// Under parallel Sema, `analysis_in_progress` is per-OS-thread (lock-free).
+threadlocal var tls_aip: std.AutoArrayHashMapUnmanaged(AnalUnit, void) = .empty;
+/// Set by `ensureNavResolved` when a dependency loop is detected under
+/// parallel Sema that may be order-dependent (the looped-on unit might be
+/// resolvable by another thread). Consumed by the outer `ensure*UpToDate`
+/// to release-and-requeue instead of marking the unit failed.
+pub threadlocal var tls_retry_loop: ?AnalUnit = null;
+
+pub fn semaAipContains(zcu: *Zcu, unit: AnalUnit) bool {
+    if (!zcu.parallel_sema) return zcu.analysis_in_progress.contains(unit);
+    return tls_aip.contains(unit);
+}
+
+pub fn dumpTlsAip(zcu: *Zcu) void {
+    std.debug.print("tls_aip ({d} entries):\n", .{tls_aip.count()});
+    for (tls_aip.keys()) |k| std.debug.print("  {f}\n", .{zcu.fmtAnalUnit(k)});
+}
+
+pub fn aipPut(zcu: *Zcu, gpa: Allocator, unit: AnalUnit) Allocator.Error!void {
+    if (zcu.parallel_sema) {
+        try tls_aip.put(gpa, unit, {});
+        return;
+    }
+    try zcu.analysis_in_progress.putNoClobber(gpa, unit, {});
+}
+pub fn aipRemove(zcu: *Zcu, unit: AnalUnit) void {
+    if (zcu.parallel_sema) {
+        _ = tls_aip.swapRemove(unit);
+        return;
+    }
+    // Idempotent: success-path removes happen earlier than the matching
+    // `errdefer` in some callers (e.g. `analyzeNavVal`), so a second call here
+    // is benign. Asserting on it regressed serial-mode behaviour.
+    _ = zcu.analysis_in_progress.swapRemove(unit);
+}
+
 /// Delete all the Export objects that are caused by this `AnalUnit`. Re-analysis of
 /// this `AnalUnit` will cause them to be re-created (or not).
 pub fn deleteUnitExports(zcu: *Zcu, anal_unit: AnalUnit) void {
+    zcu.exports_mutex.lock();
+    defer zcu.exports_mutex.unlock();
+    zcu.deleteUnitExportsAssumeLocked(anal_unit);
+}
+
+pub fn deleteUnitExportsAssumeLocked(zcu: *Zcu, anal_unit: AnalUnit) void {
     const gpa = zcu.gpa;
 
     const exports_base, const exports_len = if (zcu.single_exports.fetchSwapRemove(anal_unit)) |kv|
@@ -3529,9 +3807,9 @@ pub fn deleteUnitExports(zcu: *Zcu, anal_unit: AnalUnit) void {
     if (dev.env.supports(.incremental)) {
         for (exports, exports_base..) |exp, export_index_usize| {
             const export_idx: Export.Index = @enumFromInt(export_index_usize);
-            if (zcu.comp.bin_file) |lf| {
+            if (zcu.llvm_object == null) if (zcu.comp.bin_file) |lf| {
                 lf.deleteExport(exp.exported, exp.opts.name);
-            }
+            };
             if (zcu.failed_exports.fetchSwapRemove(export_idx)) |failed_kv| {
                 failed_kv.value.destroy(gpa);
             }
@@ -3553,6 +3831,9 @@ pub fn deleteUnitExports(zcu: *Zcu, anal_unit: AnalUnit) void {
 pub fn deleteUnitReferences(zcu: *Zcu, anal_unit: AnalUnit) void {
     const gpa = zcu.gpa;
 
+    zcu.references_mutex.lock();
+    defer zcu.references_mutex.unlock();
+
     zcu.clearCachedResolvedReferences();
 
     unit_refs: {
@@ -3574,11 +3855,15 @@ pub fn deleteUnitReferences(zcu: *Zcu, anal_unit: AnalUnit) void {
                 // detect this case to avoid adding it to `free_inline_reference_frames` more
                 // than once. We do that by setting `parent` to itself as a marker.
                 if (inline_frame.ptr(zcu).parent == inline_frame.toOptional()) break;
-                zcu.free_inline_reference_frames.append(gpa, inline_frame) catch {
-                    // This space will be reused eventually, so we need not propagate this error.
-                    // Just leak it for now, and let GC reclaim it later on.
-                    break :unit_refs;
-                };
+                {
+                    zcu.inline_ref_mutex.lock();
+                    defer zcu.inline_ref_mutex.unlock();
+                    zcu.free_inline_reference_frames.append(gpa, inline_frame) catch {
+                        // This space will be reused eventually, so we need not propagate this error.
+                        // Just leak it for now, and let GC reclaim it later on.
+                        break :unit_refs;
+                    };
+                }
                 opt_inline_frame = inline_frame.ptr(zcu).parent;
                 inline_frame.ptr(zcu).parent = inline_frame.toOptional(); // signal to code above
             }
@@ -3603,6 +3888,8 @@ pub fn deleteUnitReferences(zcu: *Zcu, anal_unit: AnalUnit) void {
 /// Delete all compile logs performed by this `AnalUnit`.
 /// Re-analysis of the `AnalUnit` will cause logs to be rediscovered.
 pub fn deleteUnitCompileLogs(zcu: *Zcu, anal_unit: AnalUnit) void {
+    zcu.compile_log_mutex.lock();
+    defer zcu.compile_log_mutex.unlock();
     const kv = zcu.compile_logs.fetchSwapRemove(anal_unit) orelse return;
     const gpa = zcu.gpa;
     var opt_line_idx = kv.value.first_line.toOptional();
@@ -3617,6 +3904,8 @@ pub fn deleteUnitCompileLogs(zcu: *Zcu, anal_unit: AnalUnit) void {
 }
 
 pub fn addInlineReferenceFrame(zcu: *Zcu, frame: InlineReferenceFrame) Allocator.Error!Zcu.InlineReferenceFrame.Index {
+    zcu.inline_ref_mutex.lock();
+    defer zcu.inline_ref_mutex.unlock();
     const frame_idx: InlineReferenceFrame.Index = zcu.free_inline_reference_frames.pop() orelse idx: {
         _ = try zcu.inline_reference_frames.addOne(zcu.gpa);
         break :idx @enumFromInt(zcu.inline_reference_frames.items.len - 1);
@@ -3634,6 +3923,9 @@ pub fn addUnitReference(
 ) Allocator.Error!void {
     const gpa = zcu.gpa;
 
+    zcu.references_mutex.lock();
+    defer zcu.references_mutex.unlock();
+
     zcu.clearCachedResolvedReferences();
 
     try zcu.reference_table.ensureUnusedCapacity(gpa, 1);
@@ -3660,6 +3952,9 @@ pub fn addUnitReference(
 pub fn addTypeReference(zcu: *Zcu, src_unit: AnalUnit, referenced_type: InternPool.Index, ref_src: LazySrcLoc) Allocator.Error!void {
     const gpa = zcu.gpa;
 
+    zcu.references_mutex.lock();
+    defer zcu.references_mutex.unlock();
+
     zcu.clearCachedResolvedReferences();
 
     try zcu.type_reference_table.ensureUnusedCapacity(gpa, 1);
@@ -3749,8 +4044,52 @@ pub fn handleUpdateExports(
     };
 }
 
+/// Reports whether `unit` is in `failed_analysis` or `transitive_failed_analysis`. Both lookups hold
+/// `failed_analysis_mutex` (as writers do): a concurrent rehash during `.contains` is unsafe.
+pub fn anyAnalysisFailed(zcu: *Zcu, unit: AnalUnit) bool {
+    zcu.failed_analysis_mutex.lock();
+    defer zcu.failed_analysis_mutex.unlock();
+    if (zcu.failed_analysis.contains(unit)) return true;
+    return zcu.transitive_failed_analysis.contains(unit);
+}
+
+/// Locked: unconditionally records `unit` in `transitive_failed_analysis`. Uses the same
+/// mutex as `anyAnalysisFailed` so its `.contains` never observes a mid-rehash map.
+pub fn putTransitiveFailed(zcu: *Zcu, unit: AnalUnit) Allocator.Error!void {
+    zcu.failed_analysis_mutex.lock();
+    defer zcu.failed_analysis_mutex.unlock();
+    return zcu.transitive_failed_analysis.put(zcu.gpa, unit, {});
+}
+
+/// Locked: records `unit` in `transitive_failed_analysis` unless `failed_analysis` already
+/// holds a direct entry for it, in which case nothing is written.
+pub fn markTransitiveFailed(zcu: *Zcu, unit: AnalUnit) Allocator.Error!void {
+    zcu.failed_analysis_mutex.lock();
+    defer zcu.failed_analysis_mutex.unlock();
+    const has_direct_failure = zcu.failed_analysis.contains(unit);
+    if (!has_direct_failure) try zcu.transitive_failed_analysis.put(zcu.gpa, unit, {});
+}
+
+/// Locked: removes `unit` from both failure maps; returns the removed direct-failure message (no longer referenced by the map), or null.
+pub fn clearAnalysisFailures(zcu: *Zcu, unit: AnalUnit) ?*ErrorMsg {
+    zcu.failed_analysis_mutex.lock();
+    defer zcu.failed_analysis_mutex.unlock();
+    _ = zcu.transitive_failed_analysis.swapRemove(unit);
+    return if (zcu.failed_analysis.fetchSwapRemove(unit)) |removed| removed.value else null;
+}
+
+pub fn failedAnalysisGetOrPut(zcu: *Zcu, unit: AnalUnit, msg: *ErrorMsg) Allocator.Error!bool {
+    zcu.failed_analysis_mutex.lock();
+    defer zcu.failed_analysis_mutex.unlock();
+    const gop = try zcu.failed_analysis.getOrPut(zcu.gpa, unit);
+    if (!gop.found_existing) gop.value_ptr.* = msg; // an existing entry is kept; `msg` is then not stored
+    return gop.found_existing; // true means `unit` already had a direct failure recorded
+}
+
 pub fn addGlobalAssembly(zcu: *Zcu, unit: AnalUnit, source: []const u8) !void {
     const gpa = zcu.gpa;
+    zcu.semaLock();
+    defer zcu.semaUnlock();
     const gop = try zcu.global_assembly.getOrPut(gpa, unit);
     if (gop.found_existing) {
         const new_value = try std.fmt.allocPrint(gpa, "{s}\n{s}", .{ gop.value_ptr.*, source });
@@ -4307,6 +4646,13 @@ pub fn navFileScope(zcu: *Zcu, nav: InternPool.Nav.Index) *File {
     return zcu.fileByIndex(zcu.navFileScopeIndex(nav));
 }
 
+/// Maps `nav` to a shard index below `n` by hashing its fully-qualified name (Wyhash, seed 0); always 0 when `n <= 1`.
+pub fn navShard(zcu: *Zcu, nav: InternPool.Nav.Index, n: u32) u32 {
+    if (n <= 1) return 0;
+    const name_hash = std.hash.Wyhash.hash(0, zcu.intern_pool.getNav(nav).fqn.toSlice(&zcu.intern_pool));
+    return @intCast(name_hash % n);
+}
+
 pub fn fmtAnalUnit(zcu: *Zcu, unit: AnalUnit) std.fmt.Formatter(FormatAnalUnit, formatAnalUnit) {
     return .{ .data = .{ .unit = unit, .zcu = zcu } };
 }
@@ -4743,7 +5089,9 @@ const TrackedUnitSema = struct {
     old_name: ?[std.Progress.Node.max_name_len]u8,
     old_analysis_timer: ?Compilation.Timer,
     analysis_timer_decl: ?InternPool.TrackedInst.Index,
+    is_noop: bool = false,
     pub fn end(tus: TrackedUnitSema, zcu: *Zcu) void {
+        if (tus.is_noop) return;
         const comp = zcu.comp;
         if (tus.old_name) |old_name| {
             zcu.sema_prog_node.completeOne(); // we're just renaming, but it's effectively completion
@@ -4773,6 +5121,12 @@ const TrackedUnitSema = struct {
     }
 };
 pub fn trackUnitSema(zcu: *Zcu, name: []const u8, zir_inst: ?InternPool.TrackedInst.Index) TrackedUnitSema {
+    if (zcu.parallel_sema) return .{
+        .old_name = null,
+        .old_analysis_timer = null,
+        .analysis_timer_decl = zir_inst,
+        .is_noop = true,
+    };
     if (zcu.cur_analysis_timer) |*t| t.pause();
     const old_analysis_timer = zcu.cur_analysis_timer;
     zcu.cur_analysis_timer = zcu.comp.startTimer();
diff --git a/src/Zcu/PerThread.zig b/src/Zcu/PerThread.zig
index a4024be07a9f..00bb6b89f7dc 100644
--- a/src/Zcu/PerThread.zig
+++ b/src/Zcu/PerThread.zig
@@ -611,6 +611,9 @@ pub fn ensureFileAnalyzed(pt: Zcu.PerThread, file_index: Zcu.File.Index) Zcu.Sem
             else => |e| return e,
         }
     }
+    pt.zcu.semaLock();
+    defer pt.zcu.semaUnlock();
+    if (pt.zcu.fileRootType(file_index) != .none) return;
     return pt.semaFile(file_index);
 }
 
@@ -628,10 +631,37 @@ pub fn ensureMemoizedStateUpToDate(pt: Zcu.PerThread, stage: InternPool.Memoized
 
     log.debug("ensureMemoizedStateUpToDate", .{});
 
-    assert(!zcu.analysis_in_progress.contains(unit));
+    if (zcu.parallel_sema and !zcu.comp.incremental) {
+        // Probe the *last* entry written for each stage so we never observe a
+        // partially-populated stage as complete. Paired with the release fence
+        // in `Sema.analyzeMemoizedState`.
+        const to_check: Zcu.BuiltinDecl = switch (stage) {
+            .main => .@"Type.Declaration",
+            .panic => .@"panic.noreturnReturned",
+            .va_list => .VaList,
+            .assembly => .@"assembly.Clobbers",
+        };
+        if (@atomicLoad(InternPool.Index, zcu.builtin_decl_values.getPtrConst(to_check), .acquire) != .none) return;
+    }
 
-    const was_outdated = zcu.outdated.swapRemove(unit) or zcu.potentially_outdated.swapRemove(unit);
-    const prev_failed = zcu.failed_analysis.contains(unit) or zcu.transitive_failed_analysis.contains(unit);
+    switch (try zcu.claimOrWait(unit)) {
+        .claimed => {},
+        .recursed => return error.AnalysisFail,
+        .done => {
+            if (zcu.anyAnalysisFailed(unit)) return error.AnalysisFail;
+            return;
+        },
+    }
+    defer zcu.releaseClaim(unit);
+
+    const need_sema_lock = !zcu.parallel_sema or zcu.comp.incremental;
+    if (need_sema_lock) zcu.semaLock();
+    defer if (need_sema_lock) zcu.semaUnlock();
+    if (!zcu.parallel_sema) assert(!zcu.analysis_in_progress.contains(unit));
+
+    const was_outdated = zcu.comp.incremental and
+        (zcu.outdated.swapRemove(unit) or zcu.potentially_outdated.swapRemove(unit));
+    const prev_failed = zcu.anyAnalysisFailed(unit);
 
     if (was_outdated) {
         dev.check(.incremental);
@@ -639,18 +669,16 @@ pub fn ensureMemoizedStateUpToDate(pt: Zcu.PerThread, stage: InternPool.Memoized
         // No need for `deleteUnitExports` because we never export anything.
         zcu.deleteUnitReferences(unit);
         zcu.deleteUnitCompileLogs(unit);
-        if (zcu.failed_analysis.fetchSwapRemove(unit)) |kv| {
-            kv.value.destroy(gpa);
-        }
-        _ = zcu.transitive_failed_analysis.swapRemove(unit);
+        if (zcu.clearAnalysisFailures(unit)) |msg| msg.destroy(gpa);
     } else {
         if (prev_failed) return error.AnalysisFail;
-        // We use an arbitrary element to check if the state has been resolved yet.
+        // Probe the *last* entry written for each stage so we never observe a
+        // partially-populated stage as complete.
         const to_check: Zcu.BuiltinDecl = switch (stage) {
-            .main => .Type,
-            .panic => .panic,
+            .main => .@"Type.Declaration",
+            .panic => .@"panic.noreturnReturned",
             .va_list => .VaList,
-            .assembly => .assembly,
+            .assembly => .@"assembly.Clobbers",
         };
         if (zcu.builtin_decl_values.get(to_check) != .none) return;
     }
@@ -665,10 +693,11 @@ pub fn ensureMemoizedStateUpToDate(pt: Zcu.PerThread, stage: InternPool.Memoized
         .{ any_changed or prev_failed, false }
     else |err| switch (err) {
         error.AnalysisFail => res: {
-            if (!zcu.failed_analysis.contains(unit)) {
+            if (Zcu.tls_retry_loop != null) return error.AnalysisFail;
+            {
                 // If this unit caused the error, it would have an entry in `failed_analysis`.
                 // Since it does not, this must be a transitive failure.
-                try zcu.transitive_failed_analysis.put(gpa, unit, {});
+                try zcu.markTransitiveFailed(unit);
                 log.debug("mark transitive analysis failure for {f}", .{zcu.fmtAnalUnit(unit)});
             }
             break :res .{ !prev_failed, true };
@@ -700,8 +729,8 @@ fn analyzeMemoizedState(pt: Zcu.PerThread, stage: InternPool.MemoizedStateStage)
 
     const unit: AnalUnit = .wrap(.{ .memoized_state = stage });
 
-    try zcu.analysis_in_progress.putNoClobber(gpa, unit, {});
-    defer assert(zcu.analysis_in_progress.swapRemove(unit));
+    try zcu.aipPut(gpa, unit);
+    defer zcu.aipRemove(unit);
 
     // Before we begin, collect:
     // * The type `std`, and its namespace
@@ -776,7 +805,20 @@ pub fn ensureComptimeUnitUpToDate(pt: Zcu.PerThread, cu_id: InternPool.ComptimeU
 
     log.debug("ensureComptimeUnitUpToDate {f}", .{zcu.fmtAnalUnit(anal_unit)});
 
-    assert(!zcu.analysis_in_progress.contains(anal_unit));
+    switch (try zcu.claimOrWait(anal_unit)) {
+        .claimed => {},
+        .recursed => return error.AnalysisFail,
+        .done => {
+            if (zcu.anyAnalysisFailed(anal_unit)) return error.AnalysisFail;
+            return;
+        },
+    }
+    defer zcu.releaseClaim(anal_unit);
+
+    const need_sema_lock = !zcu.parallel_sema or zcu.comp.incremental;
+    if (need_sema_lock) zcu.semaLock();
+    defer if (need_sema_lock) zcu.semaUnlock();
+    if (!zcu.parallel_sema) assert(!zcu.analysis_in_progress.contains(anal_unit));
 
     // Determine whether or not this `ComptimeUnit` is outdated. For this kind of `AnalUnit`, that's
     // the only indicator as to whether or not analysis is required; when a `ComptimeUnit` is first
@@ -786,27 +828,31 @@ pub fn ensureComptimeUnitUpToDate(pt: Zcu.PerThread, cu_id: InternPool.ComptimeU
     // ensure that the unit is definitely up-to-date when this function returns. This mechanism could
     // result in over-analysis if analysis occurs in a poor order; we do our best to avoid this by
     // carefully choosing which units to re-analyze. See `Zcu.findOutdatedToAnalyze`.
-
-    const was_outdated = zcu.outdated.swapRemove(anal_unit) or
-        zcu.potentially_outdated.swapRemove(anal_unit);
+    //
+    // comptime units are put in `outdated` by scanDecl even non-incrementally,
+    // so this check is unconditional but uses `outdated_mutex` when sema_lock
+    // is not held.
+
+    const was_outdated = blk: {
+        if (!need_sema_lock) zcu.outdated_mutex.lock();
+        defer if (!need_sema_lock) zcu.outdated_mutex.unlock();
+        const o = zcu.outdated.swapRemove(anal_unit) or zcu.potentially_outdated.swapRemove(anal_unit);
+        if (o) _ = zcu.outdated_ready.swapRemove(anal_unit);
+        break :blk o;
+    };
 
     if (was_outdated) {
-        _ = zcu.outdated_ready.swapRemove(anal_unit);
         // `was_outdated` can be true in the initial update for comptime units, so this isn't a `dev.check`.
         if (dev.env.supports(.incremental)) {
             zcu.deleteUnitExports(anal_unit);
             zcu.deleteUnitReferences(anal_unit);
             zcu.deleteUnitCompileLogs(anal_unit);
-            if (zcu.failed_analysis.fetchSwapRemove(anal_unit)) |kv| {
-                kv.value.destroy(gpa);
-            }
-            _ = zcu.transitive_failed_analysis.swapRemove(anal_unit);
+            if (zcu.clearAnalysisFailures(anal_unit)) |msg| msg.destroy(gpa);
             zcu.intern_pool.removeDependenciesForDepender(gpa, anal_unit);
         }
     } else {
         // We can trust the current information about this unit.
-        if (zcu.failed_analysis.contains(anal_unit)) return error.AnalysisFail;
-        if (zcu.transitive_failed_analysis.contains(anal_unit)) return error.AnalysisFail;
+        if (zcu.anyAnalysisFailed(anal_unit)) return error.AnalysisFail;
         return;
     }
 
@@ -824,12 +870,16 @@ pub fn ensureComptimeUnitUpToDate(pt: Zcu.PerThread, cu_id: InternPool.ComptimeU
 
     return pt.analyzeComptimeUnit(cu_id) catch |err| switch (err) {
         error.AnalysisFail => {
-            if (!zcu.failed_analysis.contains(anal_unit)) {
-                // If this unit caused the error, it would have an entry in `failed_analysis`.
-                // Since it does not, this must be a transitive failure.
-                try zcu.transitive_failed_analysis.put(gpa, anal_unit, {});
-                log.debug("mark transitive analysis failure for {f}", .{zcu.fmtAnalUnit(anal_unit)});
+            if (Zcu.tls_retry_loop != null) {
+                // Re-mark outdated so the re-queued attempt actually re-runs
+                // instead of taking the was_outdated=false early return.
+                zcu.outdated_mutex.lock();
+                defer zcu.outdated_mutex.unlock();
+                try zcu.outdated.put(gpa, anal_unit, 0);
+                return error.AnalysisFail;
             }
+            try zcu.markTransitiveFailed(anal_unit);
+            log.debug("mark transitive analysis failure for {f}", .{zcu.fmtAnalUnit(anal_unit)});
             return error.AnalysisFail;
         },
         error.OutOfMemory => {
@@ -864,8 +914,8 @@ fn analyzeComptimeUnit(pt: Zcu.PerThread, cu_id: InternPool.ComptimeUnit.Id) Zcu
     const file = zcu.fileByIndex(inst_resolved.file);
     const zir = file.zir.?;
 
-    try zcu.analysis_in_progress.putNoClobber(gpa, anal_unit, {});
-    defer assert(zcu.analysis_in_progress.swapRemove(anal_unit));
+    try zcu.aipPut(gpa, anal_unit);
+    defer zcu.aipRemove(anal_unit);
 
     var analysis_arena: std.heap.ArenaAllocator = .init(gpa);
     defer analysis_arena.deinit();
@@ -951,14 +1001,44 @@ pub fn ensureNavValUpToDate(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zcu
     const gpa = zcu.gpa;
     const ip = &zcu.intern_pool;
 
-    _ = zcu.nav_val_analysis_queued.swapRemove(nav_id);
-
     const anal_unit: AnalUnit = .wrap(.{ .nav_val = nav_id });
     const nav = ip.getNav(nav_id);
 
     log.debug("ensureNavValUpToDate {f}", .{zcu.fmtAnalUnit(anal_unit)});
 
-    assert(!zcu.analysis_in_progress.contains(anal_unit));
+    if (zcu.parallel_sema and !zcu.comp.incremental and nav.status == .fully_resolved) return;
+
+    claim: while (true) switch (try zcu.claimOrWait(anal_unit)) {
+        .claimed => break :claim,
+        .recursed => return error.AnalysisFail,
+        .done => {
+            if (zcu.anyAnalysisFailed(anal_unit)) return error.AnalysisFail;
+            // The previous holder may have released its claim via a retry-abort
+            // (yield-and-requeue) without actually resolving the nav. Re-check
+            // the resolved status and loop back to claim if not.
+            if (ip.getNav(nav_id).status == .fully_resolved) return;
+            continue :claim;
+        },
+    };
+    defer zcu.releaseClaim(anal_unit);
+
+    // Under parallel + non-incremental, `was_outdated` below is forced false
+    // by its `comp.incremental` guard, so `outdated`/`potentially_outdated`
+    // are never touched here; nav_queued and failed_analysis have their own
+    // mutexes; the body's Sema writes go through per-map mutexes. The
+    // whole-function sema_lock is the largest serializer in the profile and
+    // is only needed for incremental bookkeeping.
+    const need_sema_lock = !zcu.parallel_sema or zcu.comp.incremental;
+    if (need_sema_lock) zcu.semaLock();
+    defer if (need_sema_lock) zcu.semaUnlock();
+
+    {
+        zcu.nav_queued_mutex.lock();
+        defer zcu.nav_queued_mutex.unlock();
+        _ = zcu.nav_val_analysis_queued.swapRemove(nav_id);
+    }
+
+    if (!zcu.parallel_sema) assert(!zcu.analysis_in_progress.contains(anal_unit));
 
     // Determine whether or not this `Nav`'s value is outdated. This also includes checking if the
     // status is `.unresolved`, which indicates that the value is outdated because it has *never*
@@ -969,11 +1049,10 @@ pub fn ensureNavValUpToDate(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zcu
     // result in over-analysis if analysis occurs in a poor order; we do our best to avoid this by
     // carefully choosing which units to re-analyze. See `Zcu.findOutdatedToAnalyze`.
 
-    const was_outdated = zcu.outdated.swapRemove(anal_unit) or
-        zcu.potentially_outdated.swapRemove(anal_unit);
+    const was_outdated = zcu.comp.incremental and
+        (zcu.outdated.swapRemove(anal_unit) or zcu.potentially_outdated.swapRemove(anal_unit));
 
-    const prev_failed = zcu.failed_analysis.contains(anal_unit) or
-        zcu.transitive_failed_analysis.contains(anal_unit);
+    const prev_failed = zcu.anyAnalysisFailed(anal_unit);
 
     if (was_outdated) {
         dev.check(.incremental);
@@ -981,10 +1060,7 @@ pub fn ensureNavValUpToDate(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zcu
         zcu.deleteUnitExports(anal_unit);
         zcu.deleteUnitReferences(anal_unit);
         zcu.deleteUnitCompileLogs(anal_unit);
-        if (zcu.failed_analysis.fetchSwapRemove(anal_unit)) |kv| {
-            kv.value.destroy(gpa);
-        }
-        _ = zcu.transitive_failed_analysis.swapRemove(anal_unit);
+        if (zcu.clearAnalysisFailures(anal_unit)) |msg| msg.destroy(gpa);
         ip.removeDependenciesForDepender(gpa, anal_unit);
     } else {
         // We can trust the current information about this unit.
@@ -1012,12 +1088,21 @@ pub fn ensureNavValUpToDate(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zcu
         };
     } else |err| switch (err) {
         error.AnalysisFail => res: {
-            if (!zcu.failed_analysis.contains(anal_unit)) {
-                // If this unit caused the error, it would have an entry in `failed_analysis`.
-                // Since it does not, this must be a transitive failure.
-                try zcu.transitive_failed_analysis.put(gpa, anal_unit, {});
-                log.debug("mark transitive analysis failure for {f}", .{zcu.fmtAnalUnit(anal_unit)});
+            if (Zcu.tls_retry_loop != null) {
+                // The retry may have triggered after `resolveNavValue` committed
+                // (status .fully_resolved), in which case re-queue is a no-op
+                // and the post-commit work (link_nav, body analysis) is lost.
+                // Queue it here so processExports/codegen sees the definition.
+                if (ip.getNav(nav_id).status == .fully_resolved) {
+                    const v = zcu.navValue(nav_id).toIntern();
+                    if (ip.isFuncBody(v))
+                        zcu.ensureFuncBodyAnalysisQueued(v) catch return error.OutOfMemory;
+                    zcu.comp.queueJob(.{ .link_nav = nav_id }) catch return error.OutOfMemory;
+                }
+                return error.AnalysisFail;
             }
+            try zcu.markTransitiveFailed(anal_unit);
+            log.debug("mark transitive analysis failure for {f}", .{zcu.fmtAnalUnit(anal_unit)});
             break :res .{ !prev_failed, true };
         },
         error.OutOfMemory => {
@@ -1058,6 +1143,13 @@ pub fn ensureNavValUpToDate(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zcu
         // The type does indeed depend on the value. We are responsible for populating all state of
         // the `nav_ty`, including exports, references, errors, and dependencies.
         const ty_unit: AnalUnit = .wrap(.{ .nav_ty = nav_id });
+        // Dependency/outdated bookkeeping is only meaningful under
+        // incremental; under parallel non-incremental these maps are not
+        // sema_lock-guarded here, so writing them is unsafe and pointless.
+        if (!zcu.comp.incremental) {
+            if (new_failed) try zcu.putTransitiveFailed(ty_unit);
+            break :type_deps_on_val;
+        }
         const ty_was_outdated = zcu.outdated.swapRemove(ty_unit) or
             zcu.potentially_outdated.swapRemove(ty_unit);
         if (ty_was_outdated) {
@@ -1065,14 +1157,11 @@ pub fn ensureNavValUpToDate(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zcu
             zcu.deleteUnitExports(ty_unit);
             zcu.deleteUnitReferences(ty_unit);
             zcu.deleteUnitCompileLogs(ty_unit);
-            if (zcu.failed_analysis.fetchSwapRemove(ty_unit)) |kv| {
-                kv.value.destroy(gpa);
-            }
-            _ = zcu.transitive_failed_analysis.swapRemove(ty_unit);
+            if (zcu.clearAnalysisFailures(ty_unit)) |msg| msg.destroy(gpa);
             ip.removeDependenciesForDepender(gpa, ty_unit);
         }
         try pt.addDependency(ty_unit, .{ .nav_val = nav_id });
-        if (new_failed) try zcu.transitive_failed_analysis.put(gpa, ty_unit, {});
+        if (new_failed) try zcu.putTransitiveFailed(ty_unit);
         if (ty_was_outdated) try zcu.markDependeeOutdated(.marked_po, .{ .nav_ty = nav_id });
     }
 
@@ -1094,16 +1183,14 @@ fn analyzeNavVal(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zcu.CompileErr
     const zir = file.zir.?;
     const zir_decl = zir.getDeclaration(inst_resolved.inst);
 
-    try zcu.analysis_in_progress.putNoClobber(gpa, anal_unit, {});
-    errdefer _ = zcu.analysis_in_progress.swapRemove(anal_unit);
+    try zcu.aipPut(gpa, anal_unit);
+    errdefer zcu.aipRemove(anal_unit);
 
     // If there's no type body, we are also resolving the type here.
     if (zir_decl.type_body == null) {
-        try zcu.analysis_in_progress.putNoClobber(gpa, .wrap(.{ .nav_ty = nav_id }), {});
+        try zcu.aipPut(gpa, .wrap(.{ .nav_ty = nav_id }));
     }
-    errdefer if (zir_decl.type_body == null) {
-        _ = zcu.analysis_in_progress.swapRemove(.wrap(.{ .nav_ty = nav_id }));
-    };
+    errdefer if (zir_decl.type_body == null) zcu.aipRemove(.wrap(.{ .nav_ty = nav_id }));
 
     var analysis_arena: std.heap.ArenaAllocator = .init(gpa);
     defer analysis_arena.deinit();
@@ -1313,16 +1400,37 @@ fn analyzeNavVal(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zcu.CompileErr
     });
 
     // Mark the unit as completed before evaluating the export!
-    assert(zcu.analysis_in_progress.swapRemove(anal_unit));
+    zcu.aipRemove(anal_unit);
     if (zir_decl.type_body == null) {
-        assert(zcu.analysis_in_progress.swapRemove(.wrap(.{ .nav_ty = nav_id })));
+        zcu.aipRemove(.wrap(.{ .nav_ty = nav_id }));
     }
 
     if (zir_decl.linkage == .@"export") {
         const export_src = block.src(.{ .token_offset = @enumFromInt(@intFromBool(zir_decl.is_pub)) });
         const name_slice = zir.nullTerminatedString(zir_decl.name);
         const name_ip = try ip.getOrPutString(gpa, pt.tid, name_slice, .no_embedded_nulls);
-        try sema.analyzeExport(&block, export_src, .{ .name = name_ip }, nav_id);
+        // `analyzeExport` may trigger a retry-loop (validateExternType /
+        // ensureNavResolved on aliased nav), but we have already committed
+        // (status .fully_resolved). Swallow and append the export directly
+        // so it is not silently lost; validation will be retried on the
+        // exported nav's own analysis.
+        sema.analyzeExport(&block, export_src, .{ .name = name_ip }, nav_id) catch |err| switch (err) {
+            error.OutOfMemory => return error.OutOfMemory,
+            error.ComptimeReturn, error.ComptimeBreak => unreachable,
+            error.AnalysisFail => {
+                if (Zcu.tls_retry_loop != null) {
+                    Zcu.tls_retry_loop = null;
+                    try sema.exports.append(gpa, .{
+                        .opts = .{ .name = name_ip },
+                        .src = export_src,
+                        .exported = .{ .nav = nav_id },
+                        .status = .in_progress,
+                    });
+                    if (ip.isFuncBody(nav_val.toIntern()))
+                        try zcu.ensureFuncBodyAnalysisQueued(nav_val.toIntern());
+                } else return error.AnalysisFail;
+            },
+        };
     }
 
     try sema.flushExports();
@@ -1330,7 +1438,21 @@ fn analyzeNavVal(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zcu.CompileErr
     queue_codegen: {
         if (!queue_linker_work) break :queue_codegen;
 
-        if (!try nav_ty.hasRuntimeBitsSema(pt)) {
+        // `hasRuntimeBitsSema` may trigger a retry-loop after we have already
+        // committed (status .fully_resolved + exports flushed); a re-queue would
+        // be a no-op. Swallow the retry here and queue the link_nav so codegen
+        // sees the export's definition.
+        const has_rt_bits = nav_ty.hasRuntimeBitsSema(pt) catch |err| switch (err) {
+            error.OutOfMemory => return error.OutOfMemory,
+            error.AnalysisFail => blk: {
+                if (Zcu.tls_retry_loop != null) {
+                    Zcu.tls_retry_loop = null;
+                    break :blk true;
+                }
+                return error.AnalysisFail;
+            },
+        };
+        if (!has_rt_bits) {
             if (zcu.comp.config.use_llvm) break :queue_codegen;
             if (file.mod.?.strip) break :queue_codegen;
         }
@@ -1359,7 +1481,28 @@ pub fn ensureNavTypeUpToDate(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zc
 
     log.debug("ensureNavTypeUpToDate {f}", .{zcu.fmtAnalUnit(anal_unit)});
 
-    assert(!zcu.analysis_in_progress.contains(anal_unit));
+    if (zcu.parallel_sema and !zcu.comp.incremental) switch (nav.status) {
+        .fully_resolved, .type_resolved => return,
+        .unresolved => {},
+    };
+
+    claim: while (true) switch (try zcu.claimOrWait(anal_unit)) {
+        .claimed => break :claim,
+        .recursed => return error.AnalysisFail,
+        .done => {
+            if (zcu.anyAnalysisFailed(anal_unit)) return error.AnalysisFail;
+            switch (ip.getNav(nav_id).status) {
+                .fully_resolved, .type_resolved => return,
+                .unresolved => continue :claim,
+            }
+        },
+    };
+    defer zcu.releaseClaim(anal_unit);
+
+    const need_sema_lock = !zcu.parallel_sema or zcu.comp.incremental;
+    if (need_sema_lock) zcu.semaLock();
+    defer if (need_sema_lock) zcu.semaUnlock();
+    if (!zcu.parallel_sema) assert(!zcu.analysis_in_progress.contains(anal_unit));
 
     const type_resolved_by_value: bool = from_val: {
         const analysis = nav.analysis orelse break :from_val false;
@@ -1382,11 +1525,10 @@ pub fn ensureNavTypeUpToDate(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zc
     // result in over-analysis if analysis occurs in a poor order; we do our best to avoid this by
     // carefully choosing which units to re-analyze. See `Zcu.findOutdatedToAnalyze`.
 
-    const was_outdated = zcu.outdated.swapRemove(anal_unit) or
-        zcu.potentially_outdated.swapRemove(anal_unit);
+    const was_outdated = zcu.comp.incremental and
+        (zcu.outdated.swapRemove(anal_unit) or zcu.potentially_outdated.swapRemove(anal_unit));
 
-    const prev_failed = zcu.failed_analysis.contains(anal_unit) or
-        zcu.transitive_failed_analysis.contains(anal_unit);
+    const prev_failed = zcu.anyAnalysisFailed(anal_unit);
 
     if (was_outdated) {
         dev.check(.incremental);
@@ -1394,10 +1536,7 @@ pub fn ensureNavTypeUpToDate(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zc
         zcu.deleteUnitExports(anal_unit);
         zcu.deleteUnitReferences(anal_unit);
         zcu.deleteUnitCompileLogs(anal_unit);
-        if (zcu.failed_analysis.fetchSwapRemove(anal_unit)) |kv| {
-            kv.value.destroy(gpa);
-        }
-        _ = zcu.transitive_failed_analysis.swapRemove(anal_unit);
+        if (zcu.clearAnalysisFailures(anal_unit)) |msg| msg.destroy(gpa);
         ip.removeDependenciesForDepender(gpa, anal_unit);
     } else {
         // We can trust the current information about this unit.
@@ -1425,10 +1564,11 @@ pub fn ensureNavTypeUpToDate(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zc
         };
     } else |err| switch (err) {
         error.AnalysisFail => res: {
-            if (!zcu.failed_analysis.contains(anal_unit)) {
+            if (Zcu.tls_retry_loop != null) return error.AnalysisFail;
+            {
                 // If this unit caused the error, it would have an entry in `failed_analysis`.
                 // Since it does not, this must be a transitive failure.
-                try zcu.transitive_failed_analysis.put(gpa, anal_unit, {});
+                try zcu.markTransitiveFailed(anal_unit);
                 log.debug("mark transitive analysis failure for {f}", .{zcu.fmtAnalUnit(anal_unit)});
             }
             break :res .{ !prev_failed, true };
@@ -1477,8 +1617,8 @@ fn analyzeNavType(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zcu.CompileEr
     const file = zcu.fileByIndex(inst_resolved.file);
     const zir = file.zir.?;
 
-    try zcu.analysis_in_progress.putNoClobber(gpa, anal_unit, {});
-    defer assert(zcu.analysis_in_progress.swapRemove(anal_unit));
+    try zcu.aipPut(gpa, anal_unit);
+    defer zcu.aipRemove(anal_unit);
 
     const zir_decl = zir.getDeclaration(inst_resolved.inst);
     const type_body = zir_decl.type_body.?;
@@ -1595,22 +1735,54 @@ pub fn ensureFuncBodyUpToDate(pt: Zcu.PerThread, func_index: InternPool.Index) Z
     const gpa = zcu.gpa;
     const ip = &zcu.intern_pool;
 
-    _ = zcu.func_body_analysis_queued.swapRemove(func_index);
-
     const anal_unit: AnalUnit = .wrap(.{ .func = func_index });
 
     log.debug("ensureFuncBodyUpToDate {f}", .{zcu.fmtAnalUnit(anal_unit)});
 
-    assert(!zcu.analysis_in_progress.contains(anal_unit));
-
     const func = zcu.funcInfo(func_index);
 
     assert(func.ty == func.uncoerced_ty); // analyze the body of the original function, not a coerced one
 
-    const was_outdated = zcu.outdated.swapRemove(anal_unit) or
-        zcu.potentially_outdated.swapRemove(anal_unit);
+    if (zcu.parallel_sema and !zcu.comp.incremental) fast: {
+        const a = func.analysisUnordered(ip);
+        if (!a.is_analyzed) break :fast;
+        if (a.inferred_error_set and func.resolvedErrorSetUnordered(ip) == .none) break :fast;
+        // is_analyzed is set at the start of analysis; the body is finished
+        // once the IES (if inferred) has been resolved past `.none`.
+        return;
+    }
 
-    const prev_failed = zcu.failed_analysis.contains(anal_unit) or zcu.transitive_failed_analysis.contains(anal_unit);
+    // `claimOrWait` self-locks `unit_claims_mutex`; we only take the global
+    // `sema_lock` after the claim succeeds, so the (very hot) entry path no
+    // longer contends on `sema_lock`.
+    claim: while (true) switch (try zcu.claimOrWait(anal_unit)) {
+        .claimed => break :claim,
+        .recursed => return error.AnalysisFail,
+        .done => {
+            if (zcu.anyAnalysisFailed(anal_unit)) return error.AnalysisFail;
+            // The previous holder may have released its claim via a retry-abort
+            // (yield-and-requeue) without actually finishing the body. Re-check
+            // the analyzed/IES status and loop back to claim if not.
+            const a = func.analysisUnordered(ip);
+            if (a.is_analyzed and
+                (!a.inferred_error_set or func.resolvedErrorSetUnordered(ip) != .none)) return;
+            continue :claim;
+        },
+    };
+    defer zcu.releaseClaim(anal_unit);
+
+    const need_sema_lock = !zcu.parallel_sema or zcu.comp.incremental;
+    if (need_sema_lock) zcu.semaLock();
+    defer if (need_sema_lock) zcu.semaUnlock();
+
+    if (!zcu.parallel_sema) _ = zcu.func_body_analysis_queued.swapRemove(func_index);
+
+    if (!zcu.parallel_sema) assert(!zcu.analysis_in_progress.contains(anal_unit));
+
+    const was_outdated = zcu.comp.incremental and
+        (zcu.outdated.swapRemove(anal_unit) or zcu.potentially_outdated.swapRemove(anal_unit));
+
+    const prev_failed = zcu.anyAnalysisFailed(anal_unit);
 
     if (was_outdated) {
         dev.check(.incremental);
@@ -1618,10 +1790,7 @@ pub fn ensureFuncBodyUpToDate(pt: Zcu.PerThread, func_index: InternPool.Index) Z
         zcu.deleteUnitExports(anal_unit);
         zcu.deleteUnitReferences(anal_unit);
         zcu.deleteUnitCompileLogs(anal_unit);
-        if (zcu.failed_analysis.fetchSwapRemove(anal_unit)) |kv| {
-            kv.value.destroy(gpa);
-        }
-        _ = zcu.transitive_failed_analysis.swapRemove(anal_unit);
+        if (zcu.clearAnalysisFailures(anal_unit)) |msg| msg.destroy(gpa);
     } else {
         // We can trust the current information about this function.
         if (prev_failed) {
@@ -1647,12 +1816,15 @@ pub fn ensureFuncBodyUpToDate(pt: Zcu.PerThread, func_index: InternPool.Index) Z
         .{ prev_failed or result.ies_outdated, false }
     else |err| switch (err) {
         error.AnalysisFail => res: {
-            if (!zcu.failed_analysis.contains(anal_unit)) {
-                // If this function caused the error, it would have an entry in `failed_analysis`.
-                // Since it does not, this must be a transitive failure.
-                try zcu.transitive_failed_analysis.put(gpa, anal_unit, {});
-                log.debug("mark transitive analysis failure for {f}", .{zcu.fmtAnalUnit(anal_unit)});
+            if (Zcu.tls_retry_loop != null) {
+                // `setAnalyzed` ran at the start of `analyzeFnBodyInner`; undo
+                // it so the re-queued attempt isn't short-circuited by the
+                // is_analyzed fast-path before codegen_func is queued.
+                func.clearAnalyzed(ip);
+                return error.AnalysisFail;
             }
+            try zcu.markTransitiveFailed(anal_unit);
+            log.debug("mark transitive analysis failure for {f}", .{zcu.fmtAnalUnit(anal_unit)});
             // We consider the IES to be outdated if the function previously succeeded analysis; in this case,
             // we need to re-analyze dependants to ensure they hit a transitive error here, rather than reporting
             // a different error later (which may now be invalid).
@@ -1814,6 +1986,7 @@ fn createFileRootStruct(
 
     wip_ty.setName(ip, try file.internFullyQualifiedName(pt), .none);
     ip.namespacePtr(namespace_index).owner_type = wip_ty.index;
+    _ = wip_ty.finish(ip, namespace_index);
 
     if (zcu.comp.incremental) {
         try pt.addDependency(.wrap(.{ .type = wip_ty.index }), .{ .src_hash = tracked_inst });
@@ -1829,7 +2002,7 @@ fn createFileRootStruct(
     }
     zcu.setFileRootType(file_index, wip_ty.index);
     if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip_ty.index);
-    return wip_ty.finish(ip, namespace_index);
+    return wip_ty.index;
 }
 
 /// Re-scan the namespace of a file's root struct type on an incremental update.
@@ -1882,7 +2055,7 @@ fn semaFile(pt: Zcu.PerThread, file_index: Zcu.File.Index) Zcu.SemaError!void {
         .parent = .none,
         .owner_type = undefined, // set in `createFileRootStruct`
         .file_scope = file_index,
-        .generation = zcu.generation,
+        .generation = zcu.generation -% 1,
     });
     const struct_ty = try pt.createFileRootStruct(file_index, new_namespace_index, false);
     errdefer zcu.intern_pool.remove(pt.tid, struct_ty);
@@ -2365,6 +2538,11 @@ pub fn embedFile(
     const zcu = pt.zcu;
     const gpa = zcu.gpa;
 
+    // `embed_table` and `EmbedFile` allocation are shared state accessed
+    // from the carve-out under parallel Sema.
+    zcu.semaLock();
+    defer zcu.semaUnlock();
+
     const opt_mod: ?*Module = m: {
         if (mem.eql(u8, import_string, "std")) break :m zcu.std_mod;
         if (mem.eql(u8, import_string, "root")) break :m zcu.root_mod;
@@ -2564,6 +2742,12 @@ pub fn scanNamespace(
     const gpa = zcu.gpa;
     const namespace = zcu.namespacePtr(namespace_index);
 
+    zcu.semaLock();
+    defer zcu.semaUnlock();
+    // Another thread may have already scanned this namespace (e.g. via
+    // ensureNamespaceUpToDate before the creator reached its own scan).
+    if (zcu.parallel_sema and namespace.generation == zcu.generation) return;
+
     const tracked_unit = zcu.trackUnitSema(
         Type.fromInterned(namespace.owner_type).containerTypeName(ip).toSlice(ip),
         null,
@@ -2598,12 +2782,18 @@ pub fn scanNamespace(
         const zir_index = ip.getNav(nav).analysis.?.zir_index;
         existing_by_inst.putAssumeCapacityNoClobber(zir_index, .wrap(.{ .nav_val = nav }));
         // This test will be re-added to `test_functions` later on if it's still alive. Remove it for now.
+        zcu.test_functions_mutex.lock();
         _ = zcu.test_functions.swapRemove(nav);
+        zcu.test_functions_mutex.unlock();
     }
 
     var seen_decls: std.AutoHashMapUnmanaged(InternPool.NullTerminatedString, void) = .empty;
     defer seen_decls.deinit(gpa);
 
+    // Mark stale before clearing so a concurrent fast-path reader cannot
+    // observe gen==current with an emptied map during a re-scan.
+    @atomicStore(u32, &namespace.generation, zcu.generation -% 1, .release);
+
     namespace.pub_decls.clearRetainingCapacity();
     namespace.priv_decls.clearRetainingCapacity();
     namespace.comptime_decls.clearRetainingCapacity();
@@ -2623,6 +2813,11 @@ pub fn scanNamespace(
     for (decls) |decl_inst| {
         try scan_decl_iter.scanDecl(decl_inst);
     }
+    // Mark the namespace fully scanned while still holding `sema_lock` so a
+    // concurrent `ensureNamespaceUpToDate` doesn't observe a current
+    // generation with empty decls. Release store pairs with the acquire in
+    // the lock-free fast-path.
+    @atomicStore(u32, &namespace.generation, zcu.generation, .release);
 }
 
 const ScanDeclIter = struct {
@@ -2717,6 +2912,8 @@ const ScanDeclIter = struct {
                 if (existing_unit == null) {
                     // For a `comptime` declaration, whether to analyze is based solely on whether the unit
                     // is outdated. So, add this fresh one to `outdated` and `outdated_ready`.
+                    zcu.outdated_mutex.lock();
+                    defer zcu.outdated_mutex.unlock();
                     try zcu.outdated.ensureUnusedCapacity(gpa, 1);
                     try zcu.outdated_ready.ensureUnusedCapacity(gpa, 1);
                     zcu.outdated.putAssumeCapacityNoClobber(unit, 0);
@@ -2757,7 +2954,11 @@ const ScanDeclIter = struct {
                                 if (std.mem.indexOf(u8, fqn_slice, test_filter) != null) break;
                             } else break :a false;
                         }
-                        try zcu.test_functions.put(gpa, nav, {});
+                        {
+                            zcu.test_functions_mutex.lock();
+                            defer zcu.test_functions_mutex.unlock();
+                            try zcu.test_functions.put(gpa, nav, {});
+                        }
                         break :a true;
                     },
                     .@"const", .@"var" => a: {
@@ -2797,8 +2998,8 @@ fn analyzeFnBodyInner(pt: Zcu.PerThread, func_index: InternPool.Index) Zcu.SemaE
     const file = zcu.fileByIndex(inst_info.file);
     const zir = file.zir.?;
 
-    try zcu.analysis_in_progress.putNoClobber(gpa, anal_unit, {});
-    errdefer _ = zcu.analysis_in_progress.swapRemove(anal_unit);
+    try zcu.aipPut(gpa, anal_unit);
+    errdefer zcu.aipRemove(anal_unit);
 
     func.setAnalyzed(ip);
     if (func.analysisUnordered(ip).inferred_error_set) {
@@ -2819,7 +3020,8 @@ fn analyzeFnBodyInner(pt: Zcu.PerThread, func_index: InternPool.Index) Zcu.SemaE
 
     const func_nav = ip.getNav(func.owner_nav);
 
-    zcu.intern_pool.removeDependenciesForDepender(gpa, anal_unit);
+    if (zcu.comp.incremental)
+        zcu.intern_pool.removeDependenciesForDepender(gpa, anal_unit);
 
     var analysis_arena = std.heap.ArenaAllocator.init(gpa);
     defer analysis_arena.deinit();
@@ -2947,10 +3149,14 @@ fn analyzeFnBodyInner(pt: Zcu.PerThread, func_index: InternPool.Index) Zcu.SemaE
     sema.error_return_trace_index_on_fn_entry = error_return_trace_index;
     inner_block.error_return_trace_index = error_return_trace_index;
 
-    sema.analyzeFnBody(&inner_block, fn_info.body) catch |err| switch (err) {
-        error.ComptimeReturn => unreachable,
-        else => |e| return e,
-    };
+    {
+        const saved_depth = if (std.process.hasNonEmptyEnvVarConstant("ZIG_NO_SEMA_CARVEOUT")) 0 else zcu.semaRelease();
+        defer zcu.semaReacquire(saved_depth);
+        sema.analyzeFnBody(&inner_block, fn_info.body) catch |err| switch (err) {
+            error.ComptimeReturn => unreachable,
+            else => |e| return e,
+        };
+    }
 
     for (sema.unresolved_inferred_allocs.keys()) |ptr_inst| {
         // The lack of a resolve_inferred_alloc means that this instruction
@@ -2998,7 +3204,7 @@ fn analyzeFnBodyInner(pt: Zcu.PerThread, func_index: InternPool.Index) Zcu.SemaE
         func.setResolvedErrorSet(ip, ies.resolved);
     }
 
-    assert(zcu.analysis_in_progress.swapRemove(anal_unit));
+    zcu.aipRemove(anal_unit);
 
     // Finally we must resolve the return type and parameter types so that backends
     // have full access to type information.
@@ -3223,9 +3429,7 @@ fn processExportsInner(
             const nav = ip.getNav(nav_index);
             if (zcu.failed_codegen.contains(nav_index)) break :failed true;
             if (nav.analysis != null) {
-                const unit: AnalUnit = .wrap(.{ .nav_val = nav_index });
-                if (zcu.failed_analysis.contains(unit)) break :failed true;
-                if (zcu.transitive_failed_analysis.contains(unit)) break :failed true;
+                if (zcu.anyAnalysisFailed(.wrap(.{ .nav_val = nav_index }))) break :failed true;
             }
             const val = switch (nav.status) {
                 .unresolved, .type_resolved => break :failed true,
@@ -3233,9 +3437,7 @@ fn processExportsInner(
             };
             // If the value is a function, we also need to check if that function succeeded analysis.
             if (val.typeOf(zcu).zigTypeTag(zcu) == .@"fn") {
-                const func_unit = AnalUnit.wrap(.{ .func = val.toIntern() });
-                if (zcu.failed_analysis.contains(func_unit)) break :failed true;
-                if (zcu.transitive_failed_analysis.contains(func_unit)) break :failed true;
+                if (zcu.anyAnalysisFailed(.wrap(.{ .func = val.toIntern() }))) break :failed true;
             }
             break :failed false;
         }) {
@@ -3278,8 +3480,7 @@ pub fn populateTestFunctions(pt: Zcu.PerThread) Allocator.Error!void {
         Zcu.Namespace.NameAdapter{ .zcu = zcu },
     ).?;
     // ...but it might not be populated, so let's check that!
-    if (zcu.failed_analysis.contains(.wrap(.{ .nav_val = nav_index })) or
-        zcu.transitive_failed_analysis.contains(.wrap(.{ .nav_val = nav_index })) or
+    if (zcu.anyAnalysisFailed(.wrap(.{ .nav_val = nav_index })) or
         ip.getNav(nav_index).status != .fully_resolved)
     {
         // The value of `builtin.test_functions` was either never referenced, or failed analysis.
@@ -3305,12 +3506,7 @@ pub fn populateTestFunctions(pt: Zcu.PerThread) Allocator.Error!void {
             {
                 // The test declaration might have failed; if that's the case, just return, as we'll
                 // be emitting a compile error anyway.
-                const anal_unit: AnalUnit = .wrap(.{ .nav_val = test_nav_index });
-                if (zcu.failed_analysis.contains(anal_unit) or
-                    zcu.transitive_failed_analysis.contains(anal_unit))
-                {
-                    return;
-                }
+                if (zcu.anyAnalysisFailed(.wrap(.{ .nav_val = test_nav_index }))) return;
             }
 
             const test_nav_name = test_nav.fqn;
@@ -3878,6 +4074,11 @@ pub fn ensureTypeUpToDate(pt: Zcu.PerThread, ty: InternPool.Index) Zcu.SemaError
     const gpa = zcu.gpa;
     const ip = &zcu.intern_pool;
 
+    if (zcu.parallel_sema and !zcu.comp.incremental) return ty;
+
+    zcu.semaLock();
+    defer zcu.semaUnlock();
+
     const anal_unit: AnalUnit = .wrap(.{ .type = ty });
     const outdated = zcu.outdated.swapRemove(anal_unit) or
         zcu.potentially_outdated.swapRemove(anal_unit);
@@ -3912,10 +4113,7 @@ pub fn ensureTypeUpToDate(pt: Zcu.PerThread, ty: InternPool.Index) Zcu.SemaError
     zcu.deleteUnitExports(anal_unit);
     zcu.deleteUnitReferences(anal_unit);
     zcu.deleteUnitCompileLogs(anal_unit);
-    if (zcu.failed_analysis.fetchSwapRemove(anal_unit)) |kv| {
-        kv.value.destroy(gpa);
-    }
-    _ = zcu.transitive_failed_analysis.swapRemove(anal_unit);
+    if (zcu.clearAnalysisFailures(anal_unit)) |msg| msg.destroy(gpa);
     zcu.intern_pool.removeDependenciesForDepender(gpa, anal_unit);
 
     if (zcu.comp.debugIncremental()) {
@@ -3987,8 +4185,9 @@ fn recreateStructType(
     errdefer wip_ty.cancel(ip, pt.tid);
 
     wip_ty.setName(ip, struct_obj.name, struct_obj.name_nav);
-    try pt.addDependency(.wrap(.{ .type = wip_ty.index }), .{ .src_hash = key.zir_index });
     zcu.namespacePtr(struct_obj.namespace).owner_type = wip_ty.index;
+    const new_ty = wip_ty.finish(ip, struct_obj.namespace);
+    try pt.addDependency(.wrap(.{ .type = wip_ty.index }), .{ .src_hash = key.zir_index });
     // No need to re-scan the namespace -- `zirStructDecl` will ultimately do that if the type is still alive.
     try zcu.comp.queueJob(.{ .resolve_type_fully = wip_ty.index });
 
@@ -4000,7 +4199,6 @@ fn recreateStructType(
     }
 
     if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip_ty.index);
-    const new_ty = wip_ty.finish(ip, struct_obj.namespace);
     if (inst_info.inst == .main_struct_inst) {
         // This is the root type of a file! Update the reference.
         zcu.setFileRootType(inst_info.file, new_ty);
@@ -4080,8 +4278,9 @@ fn recreateUnionType(
     errdefer wip_ty.cancel(ip, pt.tid);
 
     wip_ty.setName(ip, union_obj.name, union_obj.name_nav);
-    try pt.addDependency(.wrap(.{ .type = wip_ty.index }), .{ .src_hash = key.zir_index });
     zcu.namespacePtr(namespace_index).owner_type = wip_ty.index;
+    _ = wip_ty.finish(ip, namespace_index);
+    try pt.addDependency(.wrap(.{ .type = wip_ty.index }), .{ .src_hash = key.zir_index });
     // No need to re-scan the namespace -- `zirUnionDecl` will ultimately do that if the type is still alive.
     try zcu.comp.queueJob(.{ .resolve_type_fully = wip_ty.index });
 
@@ -4093,7 +4292,7 @@ fn recreateUnionType(
     }
 
     if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip_ty.index);
-    return wip_ty.finish(ip, namespace_index);
+    return wip_ty.index;
 }
 
 /// This *does* call `Sema.resolveDeclaredEnum`, but errors from it are not propagated.
@@ -4192,10 +4391,10 @@ fn recreateEnumType(
     wip_ty.setName(ip, enum_obj.name, enum_obj.name_nav);
 
     zcu.namespacePtr(namespace_index).owner_type = wip_ty.index;
+    wip_ty.prepare(ip, namespace_index);
     // No need to re-scan the namespace -- `zirEnumDecl` will ultimately do that if the type is still alive.
 
     if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip_ty.index);
-    wip_ty.prepare(ip, namespace_index);
     done = true;
 
     Sema.resolveDeclaredEnum(
@@ -4230,6 +4429,10 @@ pub fn ensureNamespaceUpToDate(pt: Zcu.PerThread, namespace_index: Zcu.Namespace
     const ip = &zcu.intern_pool;
     const namespace = zcu.namespacePtr(namespace_index);
 
+    if (zcu.parallel_sema and @atomicLoad(u32, &namespace.generation, .acquire) == zcu.generation) return;
+
+    zcu.semaLock();
+    defer zcu.semaUnlock();
     if (namespace.generation == zcu.generation) return;
 
     const Container = enum { @"struct", @"union", @"enum", @"opaque" };
@@ -4495,7 +4698,6 @@ fn runCodegenInner(pt: Zcu.PerThread, func_index: InternPool.Index, air: *Air) e
     // "emit" step because LLVM does not support incremental linking. Our linker (LLD or self-hosted)
     // will just see the ZCU object file which LLVM ultimately emits.
     if (zcu.llvm_object) |llvm_object| {
-        assert(pt.tid == .main); // LLVM has a lot of shared state
         try llvm_object.updateFunc(pt, func_index, air, &liveness);
         return error.BackendDoesNotProduceMir;
     }
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
index f452f8dd3967..f886f7b5e8ac 100644
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@@ -491,6 +491,133 @@ fn codeModel(model: std.builtin.CodeModel, target: *const std.Target) CodeModel
     };
 }
 
+/// A set of `n` LLVM `Object`s ("shards"), each guarded by its own mutex.
+/// Navs are routed to a shard with `Zcu.navShard`; `emit` fans the shards out
+/// over the compilation's thread pool.
+pub const PartitionSet = struct {
+    objects: []Object.Ptr,
+    /// `mutexes[i]` is held around every update forwarded to `objects[i]`.
+    mutexes: []std.Thread.Mutex,
+    n: u32,
+
+    pub const Ptr = if (dev.env.supports(.llvm_backend)) *PartitionSet else noreturn;
+
+    /// Creates `@max(1, n)` objects; the set and its slices are allocated in
+    /// `arena`. If creating a later object fails, the objects already created
+    /// are deinitialized before the error is returned.
+    pub fn create(arena: Allocator, comp: *Compilation, n: u32) !Ptr {
+        dev.check(.llvm_backend);
+        const n_eff = @max(1, n);
+        const ps = try arena.create(PartitionSet);
+        const objects = try arena.alloc(Object.Ptr, n_eff);
+        const mutexes = try arena.alloc(std.Thread.Mutex, n_eff);
+        @memset(mutexes, .{});
+        ps.* = .{ .objects = objects, .mutexes = mutexes, .n = n_eff };
+        // `created` counts the initialized prefix of `objects`, so the error
+        // path never calls `deinit` on an entry that was not created.
+        var created: u32 = 0;
+        errdefer for (objects[0..created]) |o| o.deinit();
+        while (created < n_eff) : (created += 1) {
+            objects[created] = try Object.create(arena, comp, created);
+            objects[created].partition_set = ps;
+        }
+        return ps;
+    }
+
+    /// Deinitializes every object, then invalidates `self`.
+    pub fn deinit(self: *PartitionSet) void {
+        for (self.objects) |o| o.deinit();
+        self.* = undefined;
+    }
+
+    /// Returns the object of shard 0.
+    pub fn primary(self: *PartitionSet) Object.Ptr {
+        return self.objects[0];
+    }
+
+    /// Forwards to the shard of the function's owner nav, holding that shard's mutex.
+    pub fn updateFunc(self: *PartitionSet, pt: Zcu.PerThread, func_index: InternPool.Index, air: *const Air, liveness: *const ?Air.Liveness) !void {
+        const zcu = pt.zcu;
+        const owner_nav = zcu.funcInfo(func_index).owner_nav;
+        const shard = zcu.navShard(owner_nav, self.n);
+        self.mutexes[shard].lock();
+        defer self.mutexes[shard].unlock();
+        return self.objects[shard].updateFunc(pt, func_index, air, liveness);
+    }
+
+    /// Forwards to the shard that `nav_index` maps to, holding that shard's mutex.
+    pub fn updateNav(self: *PartitionSet, pt: Zcu.PerThread, nav_index: InternPool.Nav.Index) !void {
+        const shard = pt.zcu.navShard(nav_index, self.n);
+        self.mutexes[shard].lock();
+        defer self.mutexes[shard].unlock();
+        return self.objects[shard].updateNav(pt, nav_index);
+    }
+
+    /// Forwards to the shard of the exported nav, holding that shard's mutex;
+    /// exported values (`.uav`) always go to shard 0.
+    pub fn updateExports(self: *PartitionSet, pt: Zcu.PerThread, exported: Zcu.Exported, export_indices: []const Zcu.Export.Index) link.File.UpdateExportsError!void {
+        const shard: u32 = switch (exported) {
+            .nav => |nav| pt.zcu.navShard(nav, self.n),
+            .uav => 0,
+        };
+        self.mutexes[shard].lock();
+        defer self.mutexes[shard].unlock();
+        return self.objects[shard].updateExports(pt, exported, export_indices);
+    }
+
+    /// Emits the shards. With a single shard, or when `options.bin_path_list`
+    /// is null, only `objects[0]` is emitted, with `options` unchanged. Otherwise
+    /// shard `i` is emitted to `bin_path_list[i]` on the thread pool.
+    pub fn emit(self: *PartitionSet, pt: Zcu.PerThread, options: Object.EmitOptions) error{ LinkFailure, OutOfMemory }!void {
+        if (self.n == 1) return self.objects[0].emit(pt, options);
+        const list = options.bin_path_list orelse return self.objects[0].emit(pt, options);
+        assert(list.len == self.n);
+
+        const comp = pt.zcu.comp;
+        // LLVM target registration mutates a global linked list; do it once
+        // here before fanning out so per-shard emit workers only read it.
+        if (build_options.have_llvm and comp.config.use_lib_llvm)
+            initializeLLVMTarget(comp.root_mod.resolved_target.result.cpu.arch);
+        var wg: std.Thread.WaitGroup = .{};
+        var err_flag: std.atomic.Value(u8) = .init(0); // 0=ok, 1=oom, 2=link
+        for (self.objects, 0..) |obj, i| {
+            var shard_opts = options;
+            shard_opts.bin_path = list[i];
+            shard_opts.bin_path_list = null;
+            shard_opts.asm_path = null;
+            if (i != 0) {
+                shard_opts.time_report = null;
+                shard_opts.pre_ir_path = null;
+                shard_opts.pre_bc_path = null;
+                shard_opts.post_ir_path = null;
+                shard_opts.post_bc_path = null;
+            }
+            comp.thread_pool.spawnWgId(&wg, emitShardWorker, .{ pt.zcu, obj, shard_opts, &err_flag });
+        }
+        comp.thread_pool.waitAndWork(&wg);
+        switch (err_flag.load(.monotonic)) {
+            0 => {},
+            1 => return error.OutOfMemory,
+            else => return error.LinkFailure,
+        }
+    }
+
+    /// Thread-pool worker: emits one shard and records the first failure in
+    /// `err_flag` (1 = OutOfMemory, 2 = LinkFailure).
+    fn emitShardWorker(tid: usize, zcu: *Zcu, obj: Object.Ptr, options: Object.EmitOptions, err_flag: *std.atomic.Value(u8)) void {
+        const pt: Zcu.PerThread = .activate(zcu, @enumFromInt(tid));
+        defer pt.deactivate();
+        const t0: i64 = if (std.process.hasNonEmptyEnvVarConstant("ZIG_PHASE_TIMING")) std.time.milliTimestamp() else 0;
+        obj.emit(pt, options) catch |err| switch (err) {
+            error.OutOfMemory => _ = err_flag.cmpxchgStrong(0, 1, .monotonic, .monotonic),
+            error.LinkFailure => _ = err_flag.cmpxchgStrong(0, 2, .monotonic, .monotonic),
+        };
+        if (std.process.hasNonEmptyEnvVarConstant("ZIG_PHASE_TIMING")) {
+            std.debug.print("[PHASE] shard {d} emit {d} ms\n", .{ obj.partition_id, std.time.milliTimestamp() - t0 });
+        }
+    }
+};
+
 pub const Object = struct {
     gpa: Allocator,
     builder: Builder,
@@ -550,6 +677,9 @@ pub const Object = struct {
     /// Values for `@llvm.used`.
     used: std.ArrayListUnmanaged(Builder.Constant),
 
+    partition_id: u32 = 0,
+    partition_set: ?*PartitionSet = null,
+
     const ZigStructField = struct {
         struct_ty: InternPool.Index,
         field_index: u32,
@@ -559,7 +689,7 @@ pub const Object = struct {
 
     pub const TypeMap = std.AutoHashMapUnmanaged(InternPool.Index, Builder.Type);
 
-    pub fn create(arena: Allocator, comp: *Compilation) !Ptr {
+    pub fn create(arena: Allocator, comp: *Compilation, partition_id: u32) !Ptr {
         dev.check(.llvm_backend);
         const gpa = comp.gpa;
         const target = &comp.root_mod.resolved_target.result;
@@ -643,10 +773,43 @@ pub const Object = struct {
             .null_opt_usize = .no_init,
             .struct_field_map = .{},
             .used = .{},
+            .partition_id = partition_id,
+            .partition_set = null,
         };
         return obj;
     }
 
+    /// Whether this object is part of a `PartitionSet` with more than one shard.
+    pub fn isSharded(o: *const Object) bool {
+        return if (o.partition_set) |set| set.n > 1 else false;
+    }
+
+    /// True when unsharded, or when `zcu.navShard` maps `nav` to this object's `partition_id`.
+    pub fn ownsNav(o: *const Object, zcu: *Zcu, nav: InternPool.Nav.Index) bool {
+        return !o.isSharded() or zcu.navShard(nav, o.partition_set.?.n) == o.partition_id;
+    }
+
+    /// Symbol name for an internal-linkage Nav. Unsharded, this is `fqn` as-is.
+    /// When sharded, `__N<Nav.Index>` is appended so distinct generic
+    /// instantiations whose `fqn` is identical (e.g. `const T = struct {...}`
+    /// inside a generic function) get distinct names; the suffix depends only on
+    /// `nav`. NOTE(review): under parallel Sema it may vary between builds -- confirm.
+    fn shardedNavName(
+        o: *Object,
+        ip: *const InternPool,
+        fqn: InternPool.NullTerminatedString,
+        nav: InternPool.Nav.Index,
+    ) Allocator.Error!Builder.StrtabString {
+        const base_name = fqn.toSlice(ip);
+        if (o.isSharded()) return o.builder.strtabStringFmt("{s}__N{d}", .{ base_name, @intFromEnum(nav) });
+        return o.builder.strtabString(base_name);
+    }
+
+    /// Formats a strtab name from `fmt`/`args`; when sharded, `_s<partition_id>` is appended.
+    fn shardSuffixed(o: *Object, comptime fmt: []const u8, args: anytype) Allocator.Error!Builder.StrtabString {
+        return if (o.isSharded()) o.builder.strtabStringFmt(fmt ++ "_s{d}", args ++ .{o.partition_id}) else o.builder.strtabStringFmt(fmt, args);
+    }
+
     pub fn deinit(self: *Object) void {
         const gpa = self.gpa;
         self.debug_enums.deinit(gpa);
@@ -665,8 +828,26 @@ pub const Object = struct {
     }
 
     fn genErrorNameTable(o: *Object, pt: Zcu.PerThread) Allocator.Error!void {
-        // If o.error_name_table is null, then it was not referenced by any instructions.
-        if (o.error_name_table == .none) return;
+        if (o.isSharded()) {
+            // Shard 0 owns the definition; other shards reference it as an
+            // external hidden declaration created by getErrorNameTable.
+            if (o.partition_id != 0) return;
+            if (o.error_name_table == .none) {
+                const variable_index =
+                    try o.builder.addVariable(try o.builder.strtabString("__zig_err_name_table"), .ptr, .default);
+                variable_index.setLinkage(.external, &o.builder);
+                variable_index.setVisibility(.hidden, &o.builder);
+                variable_index.setMutability(.constant, &o.builder);
+                variable_index.setAlignment(
+                    Type.slice_const_u8_sentinel_0.abiAlignment(pt.zcu).toLlvm(),
+                    &o.builder,
+                );
+                o.error_name_table = variable_index;
+            }
+        } else {
+            // If o.error_name_table is null, then it was not referenced by any instructions.
+            if (o.error_name_table == .none) return;
+        }
 
         const zcu = pt.zcu;
         const ip = &zcu.intern_pool;
@@ -716,6 +897,12 @@ pub const Object = struct {
     }
 
     fn genCmpLtErrorsLenFunction(o: *Object, pt: Zcu.PerThread) !void {
+        if (o.isSharded() and o.partition_id != 0) return;
+        if (o.isSharded()) {
+            // Shard 0 must define this even if it never referenced it locally,
+            // since other shards may declare it external hidden.
+            _ = try o.getCmpLtErrorsLenFunction(pt);
+        }
         // If there is no such function in the module, it means the source code does not need it.
         const name = o.builder.strtabStringIfExists(lt_errors_fn_name) orelse return;
         const llvm_fn = o.builder.getGlobal(name) orelse return;
@@ -741,6 +928,7 @@ pub const Object = struct {
     }
 
     fn genModuleLevelAssembly(object: *Object, pt: Zcu.PerThread) Allocator.Error!void {
+        if (object.isSharded() and object.partition_id != 0) return;
         const b = &object.builder;
         const gpa = b.gpa;
         b.module_asm.clearRetainingCapacity();
@@ -1482,7 +1670,7 @@ pub const Object = struct {
                     .sp_flags = .{
                         .Optimized = owner_mod.optimize_mode != .Debug,
                         .Definition = true,
-                        .LocalToUnit = is_internal_linkage,
+                        .LocalToUnit = is_internal_linkage and !o.isSharded(),
                     },
                 },
                 o.debug_compile_unit,
@@ -1501,7 +1689,7 @@ pub const Object = struct {
             // array of the appropriate size after the POI count is known.
 
             // Due to error "members of llvm.compiler.used must be named", this global needs a name.
-            const anon_name = try o.builder.strtabStringFmt("__sancov_gen_.{d}", .{o.used.items.len});
+            const anon_name = try o.shardSuffixed("__sancov_gen_.{d}", .{o.used.items.len});
             const counters_variable = try o.builder.addVariable(anon_name, .void, .default);
             try o.used.append(gpa, counters_variable.toConst(&o.builder));
             counters_variable.setLinkage(.private, &o.builder);
@@ -1577,7 +1765,7 @@ pub const Object = struct {
             const array_llvm_ty = try o.builder.arrayType(f.pcs.items.len, .ptr);
             const init_val = try o.builder.arrayConst(array_llvm_ty, f.pcs.items);
             // Due to error "members of llvm.compiler.used must be named", this global needs a name.
-            const anon_name = try o.builder.strtabStringFmt("__sancov_gen_.{d}", .{o.used.items.len});
+            const anon_name = try o.shardSuffixed("__sancov_gen_.{d}", .{o.used.items.len});
             const pcs_variable = try o.builder.addVariable(anon_name, array_llvm_ty, .default);
             try o.used.append(gpa, pcs_variable.toConst(&o.builder));
             pcs_variable.setLinkage(.private, &o.builder);
@@ -1619,10 +1807,31 @@ pub const Object = struct {
         const zcu = pt.zcu;
         const nav_index = switch (exported) {
             .nav => |nav| nav,
-            .uav => |uav| return updateExportedValue(self, pt, uav, export_indices),
+            .uav => |uav| {
+                if (self.isSharded() and self.partition_id != 0) return;
+                return updateExportedValue(self, pt, uav, export_indices);
+            },
         };
+        if (!self.ownsNav(zcu, nav_index)) return;
         const ip = &zcu.intern_pool;
-        const global_index = self.nav_map.get(nav_index).?;
+        const global_index = self.nav_map.get(nav_index) orelse gi: {
+            // The nav was exported but its `link_nav` / `codegen_func` job
+            // never ran (likely a post-commit retry under parallel Sema dropping
+            // a queued job). For variables we can emit late; for functions we
+            // cannot synthesise a body — skip so the missing symbol surfaces
+            // at link rather than tripping the verifier with an alias-to-decl.
+            const nav = ip.getNav(nav_index);
+            switch (ip.indexToKey(nav.status.fully_resolved.val)) {
+                .func => |f| if (f.owner_nav == nav_index) {
+                    log.warn("updateExports: function nav '{f}' not codegenned; skipping export", .{nav.fqn.fmt(ip)});
+                    return;
+                },
+                else => {},
+            }
+            log.warn("updateExports: nav '{f}' not in nav_map; emitting late", .{nav.fqn.fmt(ip)});
+            try self.updateNav(pt, nav_index);
+            break :gi self.nav_map.get(nav_index).?;
+        };
         const comp = zcu.comp;
 
         // If we're on COFF and linking with LLD, the linker cares about our exports to determine the subsystem in use.
@@ -1649,12 +1858,17 @@ pub const Object = struct {
         if (export_indices.len != 0) {
             return updateExportedGlobal(self, zcu, global_index, export_indices);
         } else {
-            const fqn = try self.builder.strtabString(ip.getNav(nav_index).fqn.toSlice(ip));
+            const fqn = try self.shardedNavName(ip, ip.getNav(nav_index).fqn, nav_index);
             try global_index.rename(fqn, &self.builder);
-            global_index.setLinkage(.internal, &self.builder);
+            if (self.isSharded()) {
+                global_index.setLinkage(.external, &self.builder);
+                global_index.setVisibility(.hidden, &self.builder);
+            } else {
+                global_index.setLinkage(.internal, &self.builder);
+                global_index.setUnnamedAddr(.unnamed_addr, &self.builder);
+            }
             if (comp.config.dll_export_fns)
                 global_index.setDllStorageClass(.default, &self.builder);
-            global_index.setUnnamedAddr(.unnamed_addr, &self.builder);
         }
     }
 
@@ -1668,16 +1882,23 @@ pub const Object = struct {
         const gpa = zcu.gpa;
         const ip = &zcu.intern_pool;
         const main_exp_name = try o.builder.strtabString(export_indices[0].ptr(zcu).opts.name.toSlice(ip));
+        // When sharded, other shards reference this uav as `__anon_{ip_index}`
+        // (linkonce_odr) so the linker coalesces to one address. Keep that
+        // name on the definition and expose every export as an alias to it.
+        const def_name = if (o.isSharded())
+            try o.builder.strtabStringFmt("__anon_{d}", .{@intFromEnum(exported_value)})
+        else
+            main_exp_name;
         const global_index = i: {
             const gop = try o.uav_map.getOrPut(gpa, exported_value);
             if (gop.found_existing) {
                 const global_index = gop.value_ptr.*;
-                try global_index.rename(main_exp_name, &o.builder);
+                if (!o.isSharded()) try global_index.rename(def_name, &o.builder);
                 break :i global_index;
             }
             const llvm_addr_space = toLlvmAddressSpace(.generic, o.target);
             const variable_index = try o.builder.addVariable(
-                main_exp_name,
+                def_name,
                 try o.lowerType(pt, Type.fromInterned(ip.typeOf(exported_value))),
                 llvm_addr_space,
             );
@@ -1689,8 +1910,33 @@ pub const Object = struct {
                 error.CodegenFail => return error.AnalysisFail,
             };
             try variable_index.setInitializer(init_val, &o.builder);
+            if (o.isSharded()) {
+                variable_index.setLinkage(.linkonce_odr, &o.builder);
+                variable_index.setVisibility(.hidden, &o.builder);
+                variable_index.setMutability(.constant, &o.builder);
+            }
             break :i global_index;
         };
+        if (o.isSharded()) {
+            // Definition keeps __anon_{idx} linkonce_odr; export names are aliases.
+            for (export_indices) |export_idx| {
+                const exp = export_idx.ptr(zcu);
+                const exp_name = try o.builder.strtabString(exp.opts.name.toSlice(ip));
+                const alias_index = try o.builder.addAlias(.empty, global_index.typeOf(&o.builder), .default, global_index.toConst());
+                try alias_index.rename(exp_name, &o.builder);
+                const ag = alias_index.ptrConst(&o.builder).global;
+                ag.setLinkage(switch (exp.opts.linkage) {
+                    .internal => unreachable,
+                    .strong => .external,
+                    .weak => .weak_odr,
+                    .link_once => .linkonce_odr,
+                }, &o.builder);
+                ag.setVisibility(.fromSymbolVisibility(exp.opts.visibility), &o.builder);
+                if (zcu.comp.config.dll_export_fns)
+                    ag.setDllStorageClass(.dllexport, &o.builder);
+            }
+            return;
+        }
         return updateExportedGlobal(o, zcu, global_index, export_indices);
     }
 
@@ -1704,6 +1950,65 @@ pub const Object = struct {
         const ip = &zcu.intern_pool;
         const first_export = export_indices[0].ptr(zcu);
 
+        if (o.isSharded()) {
+            // Other shards reference this definition by its fqn, so it must
+            // keep that name. Expose each export name as an alias instead.
+            global_index.setUnnamedAddr(.default, &o.builder);
+            if (first_export.opts.section.toSlice(ip)) |section|
+                switch (global_index.ptrConst(&o.builder).kind) {
+                    .variable => |impl_index| impl_index.setSection(
+                        try o.builder.string(section),
+                        &o.builder,
+                    ),
+                    .function, .alias, .replaced => {},
+                };
+            for (export_indices) |export_idx| {
+                const exp = export_idx.ptr(zcu);
+                const exp_name = try o.builder.strtabString(exp.opts.name.toSlice(ip));
+                if (o.builder.getGlobal(exp_name)) |existing| {
+                    switch (existing.ptrConst(&o.builder).kind) {
+                        .alias => |alias| {
+                            alias.setAliasee(global_index.toConst(), &o.builder);
+                            const ag = alias.ptrConst(&o.builder).global;
+                            ag.setLinkage(switch (exp.opts.linkage) {
+                                .internal => unreachable,
+                                .strong => .external,
+                                .weak => .weak_odr,
+                                .link_once => .linkonce_odr,
+                            }, &o.builder);
+                            ag.setVisibility(.fromSymbolVisibility(exp.opts.visibility), &o.builder);
+                            if (comp.config.dll_export_fns)
+                                ag.setDllStorageClass(.dllexport, &o.builder);
+                            continue;
+                        },
+                        .variable, .function => {
+                            try existing.rename(.empty, &o.builder);
+                            try existing.replace(global_index, &o.builder);
+                        },
+                        .replaced => unreachable,
+                    }
+                }
+                const alias_index = try o.builder.addAlias(
+                    .empty,
+                    global_index.typeOf(&o.builder),
+                    .default,
+                    global_index.toConst(),
+                );
+                try alias_index.rename(exp_name, &o.builder);
+                const ag = alias_index.ptrConst(&o.builder).global;
+                ag.setLinkage(switch (exp.opts.linkage) {
+                    .internal => unreachable,
+                    .strong => .external,
+                    .weak => .weak_odr,
+                    .link_once => .linkonce_odr,
+                }, &o.builder);
+                ag.setVisibility(.fromSymbolVisibility(exp.opts.visibility), &o.builder);
+                if (comp.config.dll_export_fns)
+                    ag.setDllStorageClass(.dllexport, &o.builder);
+            }
+            return;
+        }
+
         // We will rename this global to have a name matching `first_export`.
         // Successive exports become aliases.
         // If the first export name already exists, then there is a corresponding
@@ -1918,6 +2223,7 @@ pub const Object = struct {
             .pointer => {
                 // Normalize everything that the debug info does not represent.
                 const ptr_info = ty.ptrInfo(zcu);
+                const child_unresolved = !Type.fromInterned(ptr_info.child).eagerResolved(zcu);
 
                 if (ptr_info.sentinel != .none or
                     ptr_info.flags.address_space != .generic or
@@ -1928,10 +2234,11 @@ pub const Object = struct {
                     ptr_info.flags.is_const or
                     ptr_info.flags.is_volatile or
                     ptr_info.flags.size == .many or ptr_info.flags.size == .c or
+                    child_unresolved or
                     !Type.fromInterned(ptr_info.child).hasRuntimeBitsIgnoreComptime(zcu))
                 {
                     const bland_ptr_ty = try pt.ptrType(.{
-                        .child = if (!Type.fromInterned(ptr_info.child).hasRuntimeBitsIgnoreComptime(zcu))
+                        .child = if (child_unresolved or !Type.fromInterned(ptr_info.child).hasRuntimeBitsIgnoreComptime(zcu))
                             .anyopaque_type
                         else
                             ptr_info.child,
@@ -2070,14 +2377,15 @@ pub const Object = struct {
                 return debug_opaque_type;
             },
             .array => {
+                const arr_safe = ty.eagerResolved(zcu);
                 const debug_array_type = try o.builder.debugArrayType(
                     .none, // Name
                     .none, // File
                     .none, // Scope
                     0, // Line
                     try o.lowerDebugType(pt, ty.childType(zcu)),
-                    ty.abiSize(zcu) * 8,
-                    (ty.abiAlignment(zcu).toByteUnits() orelse 0) * 8,
+                    if (arr_safe) ty.abiSize(zcu) * 8 else 0,
+                    if (arr_safe) (ty.abiAlignment(zcu).toByteUnits() orelse 0) * 8 else 0,
                     try o.builder.metadataTuple(&.{
                         try o.builder.debugSubrange(
                             try o.builder.metadataConstant(try o.builder.intConst(.i64, 0)),
@@ -2136,7 +2444,7 @@ pub const Object = struct {
                 const name = try o.allocTypeName(pt, ty);
                 defer gpa.free(name);
                 const child_ty = ty.optionalChild(zcu);
-                if (!child_ty.hasRuntimeBitsIgnoreComptime(zcu)) {
+                if (!child_ty.eagerResolved(zcu) or !child_ty.hasRuntimeBitsIgnoreComptime(zcu)) {
                     const debug_bool_type = try o.builder.debugBoolType(
                         try o.builder.metadataString(name),
                         8,
@@ -2215,7 +2523,7 @@ pub const Object = struct {
             },
             .error_union => {
                 const payload_ty = ty.errorUnionPayload(zcu);
-                if (!payload_ty.hasRuntimeBitsIgnoreComptime(zcu)) {
+                if (!payload_ty.eagerResolved(zcu) or !payload_ty.hasRuntimeBitsIgnoreComptime(zcu)) {
                     // TODO: Maybe remove?
                     const debug_error_union_type = try o.lowerDebugType(pt, Type.anyerror);
                     try o.debug_type_map.put(gpa, ty, debug_error_union_type);
@@ -2324,7 +2632,12 @@ pub const Object = struct {
 
                         const debug_fwd_ref = try o.builder.debugForwardReference();
 
+                        var any_unresolved = false;
                         for (tuple.types.get(ip), tuple.values.get(ip), 0..) |field_ty, field_val, i| {
+                            if (!Type.fromInterned(field_ty).eagerResolved(zcu)) {
+                                any_unresolved = true;
+                                continue;
+                            }
                             if (field_val != .none or !Type.fromInterned(field_ty).hasRuntimeBits(zcu)) continue;
 
                             const field_size = Type.fromInterned(field_ty).abiSize(zcu);
@@ -2353,8 +2666,8 @@ pub const Object = struct {
                             o.debug_compile_unit, // Scope
                             0, // Line
                             .none, // Underlying type
-                            ty.abiSize(zcu) * 8,
-                            (ty.abiAlignment(zcu).toByteUnits() orelse 0) * 8,
+                            if (any_unresolved) 0 else ty.abiSize(zcu) * 8,
+                            if (any_unresolved) 0 else (ty.abiAlignment(zcu).toByteUnits() orelse 0) * 8,
                             try o.builder.metadataTuple(fields.items),
                         );
 
@@ -2364,14 +2677,12 @@ pub const Object = struct {
                         return debug_struct_type;
                     },
                     .struct_type => {
-                        if (!ip.loadStructType(ty.toIntern()).haveFieldTypes(ip)) {
-                            // This can happen if a struct type makes it all the way to
-                            // flush() without ever being instantiated or referenced (even
-                            // via pointer). The only reason we are hearing about it now is
-                            // that it is being used as a namespace to put other debug types
-                            // into. Therefore we can satisfy this by making an empty namespace,
-                            // rather than changing the frontend to unnecessarily resolve the
-                            // struct field types.
+                        if (!ty.eagerResolved(zcu)) {
+                            // Reachable when a struct is only reachable via pointer
+                            // chains (so the frontend didn't fully resolve it) or
+                            // under parallel Sema when a concurrent worker hasn't
+                            // finished resolving it yet. Emit an empty namespace
+                            // type so debug info stays self-consistent.
                             const debug_struct_type = try o.makeEmptyNamespaceDebugType(pt, ty);
                             try o.debug_type_map.put(gpa, ty, debug_struct_type);
                             return debug_struct_type;
@@ -2402,6 +2713,7 @@ pub const Object = struct {
                 var it = struct_type.iterateRuntimeOrder(ip);
                 while (it.next()) |field_index| {
                     const field_ty = Type.fromInterned(struct_type.field_types.get(ip)[field_index]);
+                    if (!field_ty.eagerResolved(zcu)) continue;
                     if (!field_ty.hasRuntimeBitsIgnoreComptime(zcu)) continue;
                     const field_size = field_ty.abiSize(zcu);
                     const field_align = ty.fieldAlignment(field_index, zcu);
@@ -2444,9 +2756,8 @@ pub const Object = struct {
                 defer gpa.free(name);
 
                 const union_type = ip.loadUnionType(ty.toIntern());
-                if (!union_type.haveFieldTypes(ip) or
-                    !ty.hasRuntimeBitsIgnoreComptime(zcu) or
-                    !union_type.haveLayout(ip))
+                if (!ty.eagerResolved(zcu) or
+                    !ty.hasRuntimeBitsIgnoreComptime(zcu))
                 {
                     const debug_union_type = try o.makeEmptyNamespaceDebugType(pt, ty);
                     try o.debug_type_map.put(gpa, ty, debug_union_type);
@@ -2495,6 +2806,7 @@ pub const Object = struct {
 
                 for (0..tag_type.names.len) |field_index| {
                     const field_ty = union_type.field_types.get(ip)[field_index];
+                    if (!Type.fromInterned(field_ty).eagerResolved(zcu)) continue;
                     if (!Type.fromInterned(field_ty).hasRuntimeBitsIgnoreComptime(zcu)) continue;
 
                     const field_size = Type.fromInterned(field_ty).abiSize(zcu);
@@ -2610,7 +2922,9 @@ pub const Object = struct {
                 try debug_param_types.ensureUnusedCapacity(3 + fn_info.param_types.len);
 
                 // Return type goes first.
-                if (Type.fromInterned(fn_info.return_type).hasRuntimeBitsIgnoreComptime(zcu)) {
+                if (Type.fromInterned(fn_info.return_type).eagerResolved(zcu) and
+                    Type.fromInterned(fn_info.return_type).hasRuntimeBitsIgnoreComptime(zcu))
+                {
                     const sret = firstParamSRet(fn_info, zcu, target);
                     const ret_ty = if (sret) Type.void else Type.fromInterned(fn_info.return_type);
                     debug_param_types.appendAssumeCapacity(try o.lowerDebugType(pt, ret_ty));
@@ -2630,6 +2944,7 @@ pub const Object = struct {
 
                 for (0..fn_info.param_types.len) |i| {
                     const param_ty = Type.fromInterned(fn_info.param_types.get(ip)[i]);
+                    if (!param_ty.eagerResolved(zcu)) continue;
                     if (!param_ty.hasRuntimeBitsIgnoreComptime(zcu)) continue;
 
                     if (isByRef(param_ty, zcu)) {
@@ -2725,9 +3040,22 @@ pub const Object = struct {
             .{ true, @"extern".lib_name }
         else
             .{ false, .none };
+        const sym_name = if (is_extern)
+            try o.builder.strtabString(nav.name.toSlice(ip))
+        else
+            try o.shardedNavName(ip, nav.fqn, nav_index);
+        // Multiple Navs can map to the same extern symbol name (e.g. two
+        // `extern "c" fn powf` declarations); reuse the existing declaration
+        // so the second one isn't silently renamed to `name.2`.
+        if (is_extern) if (o.builder.getGlobal(sym_name)) |existing| {
+            if (existing.ptrConst(&o.builder).kind == .function) {
+                gop.value_ptr.* = existing;
+                return existing.ptrConst(&o.builder).kind.function;
+            }
+        };
         const function_index = try o.builder.addFunction(
             try o.lowerType(pt, ty),
-            try o.builder.strtabString((if (is_extern) nav.name else nav.fqn).toSlice(ip)),
+            sym_name,
             toLlvmAddressSpace(nav.getAddrspace(), target),
         );
         gop.value_ptr.* = function_index.ptrConst(&o.builder).global;
@@ -2736,7 +3064,12 @@ pub const Object = struct {
         defer attributes.deinit(&o.builder);
 
         if (!is_extern) {
-            function_index.setLinkage(.internal, &o.builder);
+            if (o.isSharded()) {
+                function_index.setLinkage(.external, &o.builder);
+                function_index.ptrConst(&o.builder).global.setVisibility(.hidden, &o.builder);
+            } else {
+                function_index.setLinkage(.internal, &o.builder);
+            }
             function_index.setUnnamedAddr(.unnamed_addr, &o.builder);
         } else {
             if (target.cpu.arch.isWasm()) {
@@ -3002,7 +3335,16 @@ pub const Object = struct {
         gop.value_ptr.* = variable_index.ptrConst(&o.builder).global;
 
         try variable_index.setInitializer(try o.lowerValue(pt, uav), &o.builder);
-        variable_index.setLinkage(.internal, &o.builder);
+        if (o.isSharded()) {
+            // Comptime constant pointer identity is observable across generic
+            // instantiations that may live in different shards; emit one
+            // canonical definition per `__anon_{ip_index}` and let the linker
+            // coalesce duplicates so every shard sees the same address.
+            variable_index.setLinkage(.linkonce_odr, &o.builder);
+            variable_index.setVisibility(.hidden, &o.builder);
+        } else {
+            variable_index.setLinkage(.internal, &o.builder);
+        }
         variable_index.setMutability(.constant, &o.builder);
         variable_index.setUnnamedAddr(.unnamed_addr, &o.builder);
         variable_index.setAlignment(alignment.toLlvm(), &o.builder);
@@ -3032,12 +3374,19 @@ pub const Object = struct {
             .type_resolved => |r| .{ .internal, .default, r.is_threadlocal, false },
         };
 
+        const sym_name = switch (linkage) {
+            .internal => try o.shardedNavName(ip, nav.fqn, nav_index),
+            .strong, .weak => try o.builder.strtabString(nav.name.toSlice(ip)),
+            .link_once => unreachable,
+        };
+        if (linkage != .internal) if (o.builder.getGlobal(sym_name)) |existing| {
+            if (existing.ptrConst(&o.builder).kind == .variable) {
+                gop.value_ptr.* = existing;
+                return existing.ptrConst(&o.builder).kind.variable;
+            }
+        };
         const variable_index = try o.builder.addVariable(
-            try o.builder.strtabString(switch (linkage) {
-                .internal => nav.fqn,
-                .strong, .weak => nav.name,
-                .link_once => unreachable,
-            }.toSlice(ip)),
+            sym_name,
             try o.lowerType(pt, Type.fromInterned(nav.typeOf(ip))),
             toLlvmGlobalAddressSpace(nav.getAddrspace(), zcu.getTarget()),
         );
@@ -3045,7 +3394,25 @@ pub const Object = struct {
 
         // This is needed for declarations created by `@extern`.
         switch (linkage) {
-            .internal => {
+            .internal => if (o.isSharded()) {
+                // Cross-shard internal navs are addressed by fqn, so the
+                // declaration must carry the attributes the owning shard's
+                // definition will have (linkage, visibility, TLS, alignment,
+                // mutability) and must not be unnamed_addr.
+                variable_index.setLinkage(.external, &o.builder);
+                variable_index.setVisibility(.hidden, &o.builder);
+                if (is_threadlocal and !zcu.navFileScope(nav_index).mod.?.single_threaded)
+                    variable_index.setThreadLocal(.generaldynamic, &o.builder);
+                const is_const = switch (nav.status) {
+                    .unresolved => unreachable,
+                    .type_resolved => |r| r.is_const,
+                    .fully_resolved => |r| r.is_const,
+                };
+                if (is_const) variable_index.setMutability(.constant, &o.builder);
+                if (nav.getAlignment() != .none)
+                    variable_index.setAlignment(nav.getAlignment().toLlvm(), &o.builder);
+                return variable_index;
+            } else {
                 variable_index.setLinkage(.internal, &o.builder);
                 variable_index.setUnnamedAddr(.unnamed_addr, &o.builder);
             },
@@ -4474,7 +4841,12 @@ pub const Object = struct {
         defer attributes.deinit(&o.builder);
         try o.addCommonFnAttributes(&attributes, zcu.root_mod, zcu.root_mod.omit_frame_pointer);
 
-        function_index.setLinkage(.internal, &o.builder);
+        if (o.isSharded()) {
+            function_index.setLinkage(.external, &o.builder);
+            function_index.ptrConst(&o.builder).global.setVisibility(.hidden, &o.builder);
+        } else {
+            function_index.setLinkage(.internal, &o.builder);
+        }
         function_index.setCallConv(.fastcc, &o.builder);
         function_index.setAttributes(try attributes.finish(&o.builder), &o.builder);
         return function_index;
@@ -4494,7 +4866,7 @@ pub const Object = struct {
         const target = &zcu.root_mod.resolved_target.result;
         const function_index = try o.builder.addFunction(
             try o.builder.fnType(ret_ty, &.{try o.lowerType(pt, Type.fromInterned(enum_type.tag_ty))}, .normal),
-            try o.builder.strtabStringFmt("__zig_tag_name_{f}", .{enum_type.name.fmt(ip)}),
+            try o.shardSuffixed("__zig_tag_name_{f}", .{enum_type.name.fmt(ip)}),
             toLlvmAddressSpace(.generic, target),
         );
 
@@ -4583,6 +4955,7 @@ pub const NavGen = struct {
         const nav_index = ng.nav_index;
         const nav = ip.getNav(nav_index);
         const resolved = nav.status.fully_resolved;
+        assert(o.ownsNav(zcu, nav_index));
 
         const lib_name, const linkage, const visibility: Builder.Visibility, const is_threadlocal, const is_dll_import, const is_const, const init_val, const owner_nav = switch (ip.indexToKey(resolved.val)) {
             .variable => |variable| .{ .none, .internal, .default, variable.is_threadlocal, false, false, variable.init, variable.owner_nav },
@@ -4603,7 +4976,8 @@ pub const NavGen = struct {
                 .none => .no_init,
                 else => try o.lowerValue(pt, init_val),
             }, &o.builder);
-            variable_index.setVisibility(visibility, &o.builder);
+            if (!(o.isSharded() and linkage == .internal))
+                variable_index.setVisibility(visibility, &o.builder);
 
             const file_scope = zcu.navFileScopeIndex(nav_index);
             const mod = zcu.fileByIndex(file_scope).mod.?;
@@ -4623,7 +4997,7 @@ pub const NavGen = struct {
                     line_number,
                     try o.lowerDebugType(pt, ty),
                     variable_index,
-                    .{ .local = linkage == .internal },
+                    .{ .local = linkage == .internal and !o.isSharded() },
                 );
 
                 const debug_expression = try o.builder.debugExpression(&.{});
@@ -5210,7 +5584,7 @@ pub const FuncGen = struct {
                         .Optimized = mod.optimize_mode != .Debug,
                         .Definition = true,
                         // TODO: we can't know this at this point, since the function could be exported later!
-                        .LocalToUnit = true,
+                        .LocalToUnit = !o.isSharded(),
                     },
                 },
                 o.debug_compile_unit,
@@ -6481,7 +6855,7 @@ pub const FuncGen = struct {
             const table_val = try o.builder.arrayConst(table_llvm_ty, table_elems);
 
             const table_variable = try o.builder.addVariable(
-                try o.builder.strtabStringFmt("__jmptab_{d}", .{@intFromEnum(inst)}),
+                try o.shardSuffixed("__jmptab_{d}", .{@intFromEnum(inst)}),
                 table_llvm_ty,
                 .default,
             );
@@ -10429,7 +10803,7 @@ pub const FuncGen = struct {
         const target = &zcu.root_mod.resolved_target.result;
         const function_index = try o.builder.addFunction(
             try o.builder.fnType(.i1, &.{try o.lowerType(pt, Type.fromInterned(enum_type.tag_ty))}, .normal),
-            try o.builder.strtabStringFmt("__zig_is_named_enum_value_{f}", .{enum_type.name.fmt(ip)}),
+            try o.shardSuffixed("__zig_is_named_enum_value_{f}", .{enum_type.name.fmt(ip)}),
             toLlvmAddressSpace(.generic, target),
         );
 
@@ -11287,9 +11661,14 @@ pub const FuncGen = struct {
         // TODO: Address space
         const variable_index =
             try o.builder.addVariable(try o.builder.strtabString("__zig_err_name_table"), .ptr, .default);
-        variable_index.setLinkage(.private, &o.builder);
+        if (o.isSharded()) {
+            variable_index.setLinkage(.external, &o.builder);
+            variable_index.setVisibility(.hidden, &o.builder);
+        } else {
+            variable_index.setLinkage(.private, &o.builder);
+            variable_index.setUnnamedAddr(.unnamed_addr, &o.builder);
+        }
         variable_index.setMutability(.constant, &o.builder);
-        variable_index.setUnnamedAddr(.unnamed_addr, &o.builder);
         variable_index.setAlignment(
             Type.slice_const_u8_sentinel_0.abiAlignment(pt.zcu).toLlvm(),
             &o.builder,
diff --git a/src/link.zig b/src/link.zig
index ef9b9aaae5f6..248e06db0027 100644
--- a/src/link.zig
+++ b/src/link.zig
@@ -403,6 +403,30 @@ pub const File = struct {
     lock: ?Cache.Lock = null,
     child_pid: ?std.process.Child.Id = null,
 
+    /// Returns the path of every LLVM-generated ZCU object file: an empty slice when there
+    /// is no ZCU object, one path when unpartitioned, else `<stem>.<i>.o` per partition.
+    pub fn resolveZcuObjectPaths(base: *const File, arena: Allocator) Allocator.Error![]const Cache.Path {
+        const basename = base.zcu_object_basename orelse return &.{};
+        const unsharded = try base.comp.resolveEmitPathFlush(arena, .temp, basename);
+        const shard_count = base.zcu_object_partition_count;
+        // Unpartitioned output: the resolved emit path is the only object.
+        if (shard_count <= 1) {
+            const lone = try arena.alloc(Cache.Path, 1);
+            lone[0] = unsharded;
+            return lone;
+        }
+        // Only a trailing ".o" is dropped to form the stem; any other name is kept whole.
+        const has_o_ext = std.mem.endsWith(u8, unsharded.sub_path, ".o");
+        const stem_len = unsharded.sub_path.len - @as(usize, if (has_o_ext) 2 else 0);
+        const stem = unsharded.sub_path[0..stem_len];
+        const paths = try arena.alloc(Cache.Path, shard_count);
+        for (paths, 0..) |*path, shard| {
+            const shard_sub_path = try std.fmt.allocPrint(arena, "{s}.{d}.o", .{ stem, shard });
+            path.* = .{ .root_dir = unsharded.root_dir, .sub_path = shard_sub_path };
+        }
+        return paths;
+    }
+
     pub const OpenOptions = struct {
         symbol_count_hint: u64 = 32,
         program_code_size_hint: u64 = 256 * 1024,
diff --git a/src/link/Elf/AtomList.zig b/src/link/Elf/AtomList.zig
index 0caa69ca9074..f99fd035b2fa 100644
--- a/src/link/Elf/AtomList.zig
+++ b/src/link/Elf/AtomList.zig
@@ -137,20 +137,68 @@ pub fn writeRelocatable(list: AtomList, buffer: *std.array_list.Managed(u8), elf
     try buffer.ensureUnusedCapacity(list_size);
     buffer.appendNTimesAssumeCapacity(0, list_size);
 
-    for (list.atoms.keys()) |ref| {
-        const atom_ptr = elf_file.atom(ref).?;
-        assert(atom_ptr.alive);
-
-        const off = math.cast(usize, atom_ptr.value - list.value) orelse return error.Overflow;
-        const size = math.cast(usize, atom_ptr.size) orelse return error.Overflow;
+    // Atoms arrive grouped by file then by input section index (see
+    // Object.initOutputSections). With per-function/COMDAT sections this loop
+    // would otherwise issue one pread + alloc per atom. Instead, coalesce
+    // contiguous input sections from the same object into a single pread and
+    // memcpy each atom's slice out of that span.
+    var in_data: std.ArrayListUnmanaged(u8) = .empty;
+    defer in_data.deinit(gpa);
+
+    const refs = list.atoms.keys();
+    var i: usize = 0;
+    while (i < refs.len) {
+        const head = elf_file.atom(refs[i]).?;
+        assert(head.alive);
+        const object = head.file(elf_file).?.object;
+        const head_shdr = object.shdrs.items[head.input_section_index];
+
+        if (head_shdr.sh_flags & elf.SHF_COMPRESSED != 0) {
+            const off = math.cast(usize, head.value - list.value) orelse return error.Overflow;
+            const size = math.cast(usize, head.size) orelse return error.Overflow;
+            log.debug("  atom({f}) at 0x{x}", .{ refs[i], list.offset(elf_file) + off });
+            const code = try object.codeDecompressAlloc(elf_file, refs[i].index);
+            defer gpa.free(code);
+            @memcpy(buffer.items[off..][0..size], code);
+            i += 1;
+            continue;
+        }
 
-        log.debug("  atom({f}) at 0x{x}", .{ ref, list.offset(elf_file) + off });
+        // Greedily extend a run of uncompressed same-file sections; one lying
+        // more than a page before or after the run ends it (bounds over-read).
+        var run_start: u64 = head_shdr.sh_offset;
+        var run_end: u64 = head_shdr.sh_offset + head_shdr.sh_size;
+        var j = i + 1;
+        while (j < refs.len) : (j += 1) {
+            const next = elf_file.atom(refs[j]).?;
+            if (next.file_index != head.file_index) break;
+            const next_shdr = object.shdrs.items[next.input_section_index];
+            if (next_shdr.sh_flags & elf.SHF_COMPRESSED != 0) break;
+            const ns = next_shdr.sh_offset;
+            const ne = next_shdr.sh_offset + next_shdr.sh_size;
+            if (ns > run_end and ns - run_end > 4096) break;
+            if (ne < run_start and run_start - ne > 4096) break;
+            if (ns < run_start) run_start = ns;
+            if (ne > run_end) run_end = ne;
+        }
 
-        const object = atom_ptr.file(elf_file).?.object;
-        const code = try object.codeDecompressAlloc(elf_file, ref.index);
-        defer gpa.free(code);
-        const out_code = buffer.items[off..][0..size];
-        @memcpy(out_code, code);
+        const ar_off: u64 = if (object.archive) |ar| ar.offset else 0;
+        const span = math.cast(usize, run_end - run_start) orelse return error.Overflow;
+        try in_data.resize(gpa, span);
+        const handle = elf_file.fileHandle(object.file_handle);
+        const amt = try handle.preadAll(in_data.items, ar_off + run_start);
+        if (amt != span) return error.InputOutput;
+
+        while (i < j) : (i += 1) {
+            const atom_ptr = elf_file.atom(refs[i]).?;
+            assert(atom_ptr.alive);
+            const off = math.cast(usize, atom_ptr.value - list.value) orelse return error.Overflow;
+            const size = math.cast(usize, atom_ptr.size) orelse return error.Overflow;
+            log.debug("  atom({f}) at 0x{x}", .{ refs[i], list.offset(elf_file) + off });
+            const shdr = object.shdrs.items[atom_ptr.input_section_index];
+            const src_off = math.cast(usize, shdr.sh_offset - run_start) orelse return error.Overflow;
+            @memcpy(buffer.items[off..][0..size], in_data.items[src_off..][0..size]);
+        }
     }
 
     try elf_file.base.file.?.pwriteAll(buffer.items, list.offset(elf_file));
diff --git a/src/link/Lld.zig b/src/link/Lld.zig
index 186827050291..528d297c091a 100644
--- a/src/link/Lld.zig
+++ b/src/link/Lld.zig
@@ -230,6 +230,7 @@ pub fn createEmpty(
             .comp = comp,
             .emit = emit,
             .zcu_object_basename = try allocPrint(arena, "{s}_zcu.{s}", .{ fs.path.stem(emit.sub_path), obj_file_ext }),
+            .zcu_object_partition_count = @intCast(options.llvm_codegen_threads),
             .gc_sections = gc_sections,
             .print_gc_sections = options.print_gc_sections,
             .stack_size = stack_size,
@@ -289,11 +290,12 @@ fn linkAsArchive(lld: *Lld, arena: Allocator) !void {
     const full_out_path_z = try arena.dupeZ(u8, full_out_path);
     const opt_zcu = comp.zcu;
 
-    const zcu_obj_path: ?Cache.Path = if (opt_zcu != null) p: {
-        break :p try comp.resolveEmitPathFlush(arena, .temp, base.zcu_object_basename.?);
-    } else null;
+    const zcu_obj_paths: []const Cache.Path = if (opt_zcu != null)
+        try base.resolveZcuObjectPaths(arena)
+    else
+        &.{};
 
-    log.debug("zcu_obj_path={?f}", .{zcu_obj_path});
+    log.debug("zcu_obj_paths.len={d}", .{zcu_obj_paths.len});
 
     const compiler_rt_path: ?Cache.Path = if (comp.compiler_rt_strat == .obj)
         comp.compiler_rt_obj.?.full_object_path
@@ -327,7 +329,7 @@ fn linkAsArchive(lld: *Lld, arena: Allocator) !void {
     for (comp.win32_resource_table.keys()) |key| {
         object_files.appendAssumeCapacity(try arena.dupeZ(u8, key.status.success.res_path));
     }
-    if (zcu_obj_path) |p| object_files.appendAssumeCapacity(try p.toStringZ(arena));
-    if (compiler_rt_path) |p| object_files.appendAssumeCapacity(try p.toStringZ(arena));
-    if (ubsan_rt_path) |p| object_files.appendAssumeCapacity(try p.toStringZ(arena));
+    for (zcu_obj_paths) |p| try object_files.append(arena, try p.toStringZ(arena));
+    if (compiler_rt_path) |p| try object_files.append(arena, try p.toStringZ(arena)); // shards may have used the reserved slots
+    if (ubsan_rt_path) |p| try object_files.append(arena, try p.toStringZ(arena));
 
@@ -365,9 +367,10 @@ fn coffLink(lld: *Lld, arena: Allocator) !void {
     const directory = base.emit.root_dir; // Just an alias to make it shorter to type.
     const full_out_path = try directory.join(arena, &[_][]const u8{base.emit.sub_path});
 
-    const zcu_obj_path: ?Cache.Path = if (comp.zcu != null) p: {
-        break :p try comp.resolveEmitPathFlush(arena, .temp, base.zcu_object_basename.?);
-    } else null;
+    const zcu_obj_paths: []const Cache.Path = if (comp.zcu != null)
+        try base.resolveZcuObjectPaths(arena)
+    else
+        &.{};
 
     const is_lib = comp.config.output_mode == .Lib;
     const is_dyn_lib = comp.config.link_mode == .dynamic and is_lib;
@@ -393,8 +396,8 @@ fn coffLink(lld: *Lld, arena: Allocator) !void {
             if (comp.c_object_table.count() != 0)
                 break :blk comp.c_object_table.keys()[0].status.success.object_path;
 
-            if (zcu_obj_path) |p|
-                break :blk p;
+            if (zcu_obj_paths.len == 1)
+                break :blk zcu_obj_paths[0]; // with N shards, shard 0 alone is incomplete (same check as elfLink)
 
             // TODO I think this is unreachable. Audit this situation when solving the above TODO
             // regarding eliding redundant object -> object transformations.
@@ -547,7 +550,7 @@ fn coffLink(lld: *Lld, arena: Allocator) !void {
             try argv.append(key.status.success.res_path);
         }
 
-        if (zcu_obj_path) |p| {
+        for (zcu_obj_paths) |p| {
             try argv.append(try p.toString(arena));
         }
 
@@ -799,9 +802,10 @@ fn elfLink(lld: *Lld, arena: Allocator) !void {
     const directory = base.emit.root_dir; // Just an alias to make it shorter to type.
     const full_out_path = try directory.join(arena, &[_][]const u8{base.emit.sub_path});
 
-    const zcu_obj_path: ?Cache.Path = if (comp.zcu != null) p: {
-        break :p try comp.resolveEmitPathFlush(arena, .temp, base.zcu_object_basename.?);
-    } else null;
+    const zcu_obj_paths: []const Cache.Path = if (comp.zcu != null)
+        try base.resolveZcuObjectPaths(arena)
+    else
+        &.{};
 
     const output_mode = comp.config.output_mode;
     const is_obj = output_mode == .Obj;
@@ -847,8 +851,8 @@ fn elfLink(lld: *Lld, arena: Allocator) !void {
             if (comp.c_object_table.count() != 0)
                 break :blk comp.c_object_table.keys()[0].status.success.object_path;
 
-            if (zcu_obj_path) |p|
-                break :blk p;
+            if (zcu_obj_paths.len == 1)
+                break :blk zcu_obj_paths[0];
 
             // TODO I think this is unreachable. Audit this situation when solving the above TODO
             // regarding eliding redundant object -> object transformations.
@@ -1134,7 +1138,7 @@ fn elfLink(lld: *Lld, arena: Allocator) !void {
             try argv.append(try key.status.success.object_path.toString(arena));
         }
 
-        if (zcu_obj_path) |p| {
+        for (zcu_obj_paths) |p| {
             try argv.append(try p.toString(arena));
         }
 
@@ -1370,9 +1374,10 @@ fn wasmLink(lld: *Lld, arena: Allocator) !void {
     const directory = base.emit.root_dir; // Just an alias to make it shorter to type.
     const full_out_path = try directory.join(arena, &[_][]const u8{base.emit.sub_path});
 
-    const zcu_obj_path: ?Cache.Path = if (comp.zcu != null) p: {
-        break :p try comp.resolveEmitPathFlush(arena, .temp, base.zcu_object_basename.?);
-    } else null;
+    const zcu_obj_paths: []const Cache.Path = if (comp.zcu != null)
+        try base.resolveZcuObjectPaths(arena)
+    else
+        &.{};
 
     const is_obj = comp.config.output_mode == .Obj;
     const compiler_rt_path: ?Cache.Path = blk: {
@@ -1396,8 +1401,8 @@ fn wasmLink(lld: *Lld, arena: Allocator) !void {
             if (comp.c_object_table.count() != 0)
                 break :blk comp.c_object_table.keys()[0].status.success.object_path;
 
-            if (zcu_obj_path) |p|
-                break :blk p;
+            if (zcu_obj_paths.len == 1)
+                break :blk zcu_obj_paths[0]; // with N shards, shard 0 alone is incomplete (same check as elfLink)
 
             // TODO I think this is unreachable. Audit this situation when solving the above TODO
             // regarding eliding redundant object -> object transformations.
@@ -1578,7 +1583,7 @@ fn wasmLink(lld: *Lld, arena: Allocator) !void {
         for (comp.c_object_table.keys()) |key| {
             try argv.append(try key.status.success.object_path.toString(arena));
         }
-        if (zcu_obj_path) |p| {
+        for (zcu_obj_paths) |p| {
             try argv.append(try p.toString(arena));
         }
 
diff --git a/src/link/MachO.zig b/src/link/MachO.zig
index a2629d7df675..76c63a3c0920 100644
--- a/src/link/MachO.zig
+++ b/src/link/MachO.zig
@@ -357,13 +357,13 @@ pub fn flush(
     if (comp.verbose_link) try self.dumpArgv(comp);
 
     if (self.getZigObject()) |zo| try zo.flush(self, tid);
-    if (self.base.isStaticLib()) return relocatable.flushStaticLib(self, comp, zcu_obj_path);
+    if (self.base.isStaticLib()) return relocatable.flushStaticLib(self, arena, comp, zcu_obj_path);
     if (self.base.isObject()) {
         // Skip linker if --no-link flag is set
         if (comp.no_link_obj) {
             return;
         }
-        return relocatable.flushObject(self, comp, zcu_obj_path);
+        return relocatable.flushObject(self, arena, comp, zcu_obj_path);
     }
 
     var positionals = std.array_list.Managed(link.Input).init(gpa);
@@ -386,27 +386,7 @@ pub fn flush(
         positionals.appendAssumeCapacity(try link.openObjectInput(diags, key.status.success.object_path));
     }
 
-    // Parse LLVM-generated object file(s) - handle parallel codegen partitions
-    if (zcu_obj_path) |path| {
-        const partition_count = self.base.zcu_object_partition_count;
-        if (partition_count > 1) {
-            const base_path = path.sub_path;
-            const base_name = if (std.mem.endsWith(u8, base_path, ".o"))
-                base_path[0 .. base_path.len - 2]
-            else
-                base_path;
-
-            for (0..partition_count) |i| {
-                const partition_path: Path = .{
-                    .root_dir = path.root_dir,
-                    .sub_path = try std.fmt.allocPrint(arena, "{s}.{d}.o", .{ base_name, i }),
-                };
-                try positionals.append(try link.openObjectInput(diags, partition_path));
-            }
-        } else {
-            try positionals.append(try link.openObjectInput(diags, path));
-        }
-    }
+    try self.appendZcuObjectInputs(arena, &positionals, zcu_obj_path);
 
     if (comp.config.any_sanitize_thread) {
         try positionals.append(try link.openObjectInput(diags, comp.tsan_lib.?.full_object_path));
@@ -645,6 +625,36 @@ pub fn flush(
     }
 }
 
+/// Opens the LLVM-generated ZCU object(s) as link inputs and appends them to
+/// `positionals`; a partitioned build contributes one `<stem>.<i>.o` per shard.
+pub fn appendZcuObjectInputs(
+    self: *MachO,
+    arena: Allocator,
+    positionals: *std.array_list.Managed(link.Input),
+    zcu_obj_path: ?Path,
+) !void {
+    const unsharded = zcu_obj_path orelse return;
+    const diags = &self.base.comp.link_diags;
+    const shard_count = self.base.zcu_object_partition_count;
+    if (shard_count <= 1) {
+        try positionals.append(try link.openObjectInput(diags, unsharded));
+        return;
+    }
+
+    // Drop a trailing ".o" so the shard index lands before the extension.
+    const stem = if (std.mem.endsWith(u8, unsharded.sub_path, ".o"))
+        unsharded.sub_path[0 .. unsharded.sub_path.len - 2]
+    else
+        unsharded.sub_path;
+
+    for (0..shard_count) |shard_index| {
+        const shard_sub_path = try std.fmt.allocPrint(arena, "{s}.{d}.o", .{ stem, shard_index });
+        const shard_path: Path = .{ .root_dir = unsharded.root_dir, .sub_path = shard_sub_path };
+        try positionals.append(try link.openObjectInput(diags, shard_path));
+    }
+}
+
+
 /// --verbose-link output
 fn dumpArgv(self: *MachO, comp: *Compilation) !void {
     const gpa = self.base.comp.gpa;
diff --git a/src/link/MachO/Object.zig b/src/link/MachO/Object.zig
index b0687330a3c2..612ff0f01a52 100644
--- a/src/link/MachO/Object.zig
+++ b/src/link/MachO/Object.zig
@@ -875,6 +875,11 @@ fn initSymbols(self: *Object, allocator: Allocator, macho_file: *MachO) !void {
         if (nlist.ext()) {
             if (nlist.undf()) {
                 symbol.flags.weak_ref = nlist.weakRef();
+                // Private-extern commons (e.g. asan's
+                // `____asan_globals_registered`) are N_PEXT|N_EXT|N_UNDF —
+                // record their hidden visibility so `markExportsRelocatable`
+                // and `setOutputSym` emit them as private-extern, not local.
+                if (nlist.pext()) symbol.visibility = .hidden;
             } else if (nlist.pext() or (nlist.weakDef() and nlist.weakRef()) or self.hidden) {
                 symbol.visibility = .hidden;
             } else {
@@ -1586,10 +1591,12 @@ pub fn convertTentativeDefinitions(self: *Object, macho_file: *MachO) !void {
         sym.flags.weak = false;
         sym.flags.weak_ref = false;
         sym.flags.tentative = false;
-        sym.visibility = .global;
+        // Preserve hidden visibility (private-extern commons stay private).
+        if (sym.visibility == .local) sym.visibility = .global;
 
         nlist.n_value = 0;
         nlist.n_type = macho.N_EXT | macho.N_SECT;
+        if (sym.visibility == .hidden) nlist.n_type |= macho.N_PEXT;
         nlist.n_sect = 0;
         nlist.n_desc = 0;
         nlist_atom.* = atom_index;
diff --git a/src/link/MachO/Symbol.zig b/src/link/MachO/Symbol.zig
index 654e7c402c82..ac6be540737b 100644
--- a/src/link/MachO/Symbol.zig
+++ b/src/link/MachO/Symbol.zig
@@ -243,8 +243,8 @@ pub fn setOutputSym(symbol: Symbol, macho_file: *MachO, out: *macho.nlist_64) vo
             else => {},
         }
     } else if (symbol.flags.@"export") {
-        assert(symbol.visibility == .global);
         out.n_type = macho.N_EXT;
+        if (symbol.visibility == .hidden) out.n_type |= macho.N_PEXT;
         out.n_type |= if (symbol.flags.abs) macho.N_ABS else macho.N_SECT;
         out.n_sect = if (symbol.flags.abs) 0 else @intCast(symbol.getOutputSectionIndex(macho_file) + 1);
         out.n_value = symbol.getAddress(.{ .stubs = false }, macho_file);
diff --git a/src/link/MachO/file.zig b/src/link/MachO/file.zig
index 225021aa0f40..d4e611749329 100644
--- a/src/link/MachO/file.zig
+++ b/src/link/MachO/file.zig
@@ -149,7 +149,12 @@ pub const File = union(enum) {
             const ref = file.getSymbolRef(@intCast(i), macho_file);
             const other_file = ref.getFile(macho_file) orelse continue;
             if (other_file.getIndex() != file.getIndex()) continue;
-            if (sym.visibility != .global) continue;
+            // Hidden defined symbols (input N_PEXT|N_EXT, e.g. cross-shard
+            // `external hidden` LLVM globals or private-extern commons) keep
+            // `r_extern=1` relocations targeting them. Apple `ld_new` requires
+            // such targets to live in the extdef partition (have N_EXT), so
+            // mark them as exports here; `setOutputSym` re-adds N_PEXT below.
+            if (sym.visibility == .local) continue;
             sym.flags.@"export" = true;
         }
     }
diff --git a/src/link/MachO/relocatable.zig b/src/link/MachO/relocatable.zig
index e962ce3fd258..8f366aac441e 100644
--- a/src/link/MachO/relocatable.zig
+++ b/src/link/MachO/relocatable.zig
@@ -1,4 +1,4 @@
-pub fn flushObject(macho_file: *MachO, comp: *Compilation, module_obj_path: ?Path) link.File.FlushError!void {
+pub fn flushObject(macho_file: *MachO, arena: Allocator, comp: *Compilation, module_obj_path: ?Path) link.File.FlushError!void {
     const gpa = macho_file.base.comp.gpa;
     const diags = &macho_file.base.comp.link_diags;
 
@@ -12,7 +12,7 @@ pub fn flushObject(macho_file: *MachO, comp: *Compilation, module_obj_path: ?Pat
         try positionals.append(try link.openObjectInput(diags, key.status.success.object_path));
     }
 
-    if (module_obj_path) |path| try positionals.append(try link.openObjectInput(diags, path));
+    try macho_file.appendZcuObjectInputs(arena, &positionals, module_obj_path);
 
     if (macho_file.getZigObject() == null and positionals.items.len == 1) {
         // Instead of invoking a full-blown `-r` mode on the input which sadly will strip all
@@ -47,6 +47,14 @@ pub fn flushObject(macho_file: *MachO, comp: *Compilation, module_obj_path: ?Pat
         error.LinkFailure => return error.LinkFailure,
         else => |e| return diags.fail("failed to update ar size: {s}", .{@errorName(e)}),
     };
+    // Apple `ld_new` rejects `r_extern=1` relocations whose target lands in
+    // the local symtab range. Tentative (common) symbols — notably asan's
+    // private-extern `____asan_globals_registered` — would otherwise be
+    // emitted as locals; convert them to real `__DATA,__common` definitions
+    // so they sit in the extdef partition like other exports.
+    for (macho_file.objects.items) |index| {
+        try macho_file.getFile(index).?.object.convertTentativeDefinitions(macho_file);
+    }
     markExports(macho_file);
     claimUnresolved(macho_file);
     try initOutputSections(macho_file);
@@ -77,7 +85,7 @@ pub fn flushObject(macho_file: *MachO, comp: *Compilation, module_obj_path: ?Pat
     try writeHeader(macho_file, ncmds, sizeofcmds);
 }
 
-pub fn flushStaticLib(macho_file: *MachO, comp: *Compilation, module_obj_path: ?Path) link.File.FlushError!void {
+pub fn flushStaticLib(macho_file: *MachO, arena: Allocator, comp: *Compilation, module_obj_path: ?Path) link.File.FlushError!void {
     const gpa = comp.gpa;
     const diags = &macho_file.base.comp.link_diags;
 
@@ -91,7 +99,7 @@ pub fn flushStaticLib(macho_file: *MachO, comp: *Compilation, module_obj_path: ?
         try positionals.append(try link.openObjectInput(diags, key.status.success.object_path));
     }
 
-    if (module_obj_path) |path| try positionals.append(try link.openObjectInput(diags, path));
+    try macho_file.appendZcuObjectInputs(arena, &positionals, module_obj_path);
 
     if (comp.compiler_rt_strat == .obj) {
         try positionals.append(try link.openObjectInput(diags, comp.compiler_rt_obj.?.full_object_path));
@@ -776,6 +784,7 @@ fn writeHeader(macho_file: *MachO, ncmds: usize, sizeofcmds: usize) !void {
 }
 
 const std = @import("std");
+const Allocator = std.mem.Allocator;
 const Path = std.Build.Cache.Path;
 const WaitGroup = std.Thread.WaitGroup;
 const assert = std.debug.assert;
diff --git a/src/main.zig b/src/main.zig
index 476338e51bd5..6bf88820dd7c 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -166,6 +166,7 @@ var debug_allocator: std.heap.DebugAllocator(.{
 }) = .init;
 
 pub fn main() anyerror!void {
+    Compilation.phaseTiming("main.entry");
     crash_report.initialize();
 
     const gpa, const is_debug = gpa: {
@@ -475,6 +476,8 @@ const usage_build_generic =
     \\  -ffunction-sections       Places each function in a separate section
     \\  -fno-function-sections    All functions go into same section
     \\  --llvm-codegen-threads=[threads] Number of threads for LLVM codegen (0=single-threaded)
+    \\  --llvm-shard-stats        Print per-shard file/decl distribution and exit codegen
+    \\  --llvm-no-merge-shards    Leave per-shard object files unmerged
     \\  --no-link                 Skip linker step for build-obj (outputs raw LLVM object)
     \\  -fdata-sections           Places each data in a separate section
     \\  -fno-data-sections        All data go into same section
@@ -879,6 +881,8 @@ fn buildOutputType(
     var linker_print_map: bool = false;
     var llvm_opt_bisect_limit: c_int = -1;
     var llvm_codegen_threads: u32 = 0;
+    var llvm_shard_stats: bool = false;
+    var llvm_no_merge_shards: bool = false;
     var no_link_obj: bool = false;
     var linker_z_nocopyreloc = false;
     var linker_z_nodelete = false;
@@ -1631,6 +1635,10 @@ fn buildOutputType(
                     } else if (mem.startsWith(u8, arg, "--llvm-codegen-threads=")) {
                         llvm_codegen_threads = std.fmt.parseInt(u32, arg["--llvm-codegen-threads=".len..], 10) catch |err|
                             fatal("unable to parse '{s}': {s}", .{ arg, @errorName(err) });
+                    } else if (mem.eql(u8, arg, "--llvm-shard-stats")) {
+                        llvm_shard_stats = true;
+                    } else if (mem.eql(u8, arg, "--llvm-no-merge-shards")) {
+                        llvm_no_merge_shards = true;
                     } else if (mem.eql(u8, arg, "--no-link")) {
                         no_link_obj = true;
                     } else if (mem.eql(u8, arg, "--eh-frame-hdr")) {
@@ -3492,6 +3500,8 @@ fn buildOutputType(
         .linker_print_map = linker_print_map,
         .llvm_opt_bisect_limit = llvm_opt_bisect_limit,
         .llvm_codegen_threads = llvm_codegen_threads,
+        .llvm_shard_stats = llvm_shard_stats,
+        .llvm_no_merge_shards = llvm_no_merge_shards,
         .no_link_obj = no_link_obj,
         .linker_global_base = linker_global_base,
         .linker_export_symbol_names = linker_export_symbol_names.items,
@@ -5420,6 +5430,7 @@ fn cmdBuild(gpa: Allocator, arena: Allocator, args: []const []const u8) !void {
 
             try root_mod.deps.put(arena, "@build", build_mod);
 
+            Compilation.phaseTiming("cmdBuild.runner_compile_start");
             var create_diag: Compilation.CreateDiagnostic = undefined;
             const comp = Compilation.create(gpa, arena, &create_diag, .{
                 .libc_installation = libc_installation,
@@ -5453,6 +5464,7 @@ fn cmdBuild(gpa: Allocator, arena: Allocator, args: []const []const u8) !void {
                 error.CompileErrorsReported => process.exit(2),
                 else => |e| return e,
             };
+            Compilation.phaseTiming("cmdBuild.runner_compile_done");
 
             // Since incremental compilation isn't done yet, we use cache_mode = whole
             // above, and thus the output file is already closed.
@@ -5478,6 +5490,7 @@ fn cmdBuild(gpa: Allocator, arena: Allocator, args: []const []const u8) !void {
                 child.progress_node = root_prog_node;
             }
 
+            Compilation.phaseTiming("cmdBuild.runner_spawn");
             const term = t: {
                 std.debug.lockStdErr();
                 defer std.debug.unlockStdErr();
@@ -5485,6 +5498,7 @@ fn cmdBuild(gpa: Allocator, arena: Allocator, args: []const []const u8) !void {
                     fatal("failed to spawn build runner {s}: {s}", .{ child_argv.items[0], @errorName(err) });
                 };
             };
+            Compilation.phaseTiming("cmdBuild.runner_exit");
 
             switch (term) {
                 .Exited => |code| {
diff --git a/src/target.zig b/src/target.zig
index cf0e28215417..48a58522a2aa 100644
--- a/src/target.zig
+++ b/src/target.zig
@@ -885,10 +885,8 @@ pub inline fn backendSupportsFeature(backend: std.builtin.CompilerBackend, compt
             else => false,
         },
         .separate_thread => switch (backend) {
-            // Supports a separate thread but does not support N separate
-            // threads because they would all just be locking the same mutex to
-            // protect Builder.
-            .stage2_llvm => false,
+            // PartitionSet shards Builder state so contention is 1/N.
+            .stage2_llvm => true,
             // Same problem. Frontend needs to allow this backend to run in the
             // linker thread.
             .stage2_spirv => false,
diff --git a/src/zig_llvm.cpp b/src/zig_llvm.cpp
index 42e2a773a1aa..e439d20b7331 100644
--- a/src/zig_llvm.cpp
+++ b/src/zig_llvm.cpp
@@ -239,72 +239,14 @@ static AddressSanitizerOptions getAsanOptions(void) {
     return o;
 }
 
-ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machine_ref, LLVMModuleRef module_ref,
-    char **error_message, const ZigLLVMEmitOptions *options)
+// Builds and runs the full middle-end optimization pipeline on `llvm_module`.
+// Self-contained so it can be invoked once on the whole module (serial path) or
+// per split partition on worker threads (parallel path). All analysis managers
+// and the PassBuilder are local, so concurrent calls on distinct modules with
+// distinct LLVMContexts and TargetMachines are safe.
+static void runOptimizationPipeline(Module &llvm_module, TargetMachine &target_machine,
+    const ZigLLVMEmitOptions *options)
 {
-    TimePassesIsEnabled = options->time_report_out != nullptr;
-
-    raw_fd_ostream *dest_asm_ptr = nullptr;
-    raw_fd_ostream *dest_bin_ptr = nullptr;
-    raw_fd_ostream *dest_bitcode_ptr = nullptr;
-
-    if (options->asm_filename) {
-        std::error_code EC;
-        dest_asm_ptr = new(std::nothrow) raw_fd_ostream(options->asm_filename, EC, sys::fs::OF_None);
-        if (EC) {
-            *error_message = strdup((const char *)StringRef(EC.message()).bytes_begin());
-            return true;
-        }
-    }
-    // Open single bin file if not using parallel codegen
-    // Check early if parallel will actually be used
-    bool will_use_parallel = options->bin_filename_list != nullptr &&
-                             !options->lto &&
-                             !options->asm_filename;
-
-    if (options->bin_filename && !will_use_parallel) {
-        std::error_code EC;
-        dest_bin_ptr = new(std::nothrow) raw_fd_ostream(options->bin_filename, EC, sys::fs::OF_None);
-        if (EC) {
-            *error_message = strdup((const char *)StringRef(EC.message()).bytes_begin());
-            return true;
-        }
-    }
-    if (options->bitcode_filename) {
-        std::error_code EC;
-        dest_bitcode_ptr = new(std::nothrow) raw_fd_ostream(options->bitcode_filename, EC, sys::fs::OF_None);
-        if (EC) {
-            *error_message = strdup((const char *)StringRef(EC.message()).bytes_begin());
-            return true;
-        }
-    }
-
-    std::unique_ptr<raw_fd_ostream> dest_asm(dest_asm_ptr),
-                                    dest_bin(dest_bin_ptr),
-                                    dest_bitcode(dest_bitcode_ptr);
-
-
-    auto PID = sys::Process::getProcessId();
-    std::string ProcName = "zig-";
-    ProcName += std::to_string(PID);
-    TimeTracerRAII TimeTracer(ProcName,
-                              options->bin_filename? options->bin_filename : options->asm_filename);
-
-    TargetMachine &target_machine = *reinterpret_cast<TargetMachine*>(targ_machine_ref);
-
-    if (options->allow_fast_isel) {
-        target_machine.setO0WantsFastISel(true);
-    } else {
-        target_machine.setFastISel(false);
-    }
-
-    if (!options->allow_machine_outliner) {
-        target_machine.setMachineOutliner(false);
-    }
-
-    Module &llvm_module = *unwrap(module_ref);
-
-    // Pipeline configurations
     PipelineTuningOptions pipeline_opts;
     pipeline_opts.LoopUnrolling = !options->is_debug;
     pipeline_opts.SLPVectorization = !options->is_debug;
@@ -312,7 +254,6 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
     pipeline_opts.LoopInterleaving = !options->is_debug;
     pipeline_opts.MergeFunctions = !options->is_debug;
 
-    // Instrumentations
     PassInstrumentationCallbacks instr_callbacks;
     StandardInstrumentations std_instrumentations(llvm_module.getContext(), false);
     std_instrumentations.registerCallbacks(instr_callbacks);
@@ -326,7 +267,6 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
     CGSCCAnalysisManager cgscc_am;
     ModuleAnalysisManager module_am;
 
-    // Register the AA manager first so that our version is the one used
     function_am.registerPass([&] {
       return pass_builder.buildDefaultAAPipeline();
     });
@@ -335,15 +275,13 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
     auto tlii = std::make_unique<TargetLibraryInfoImpl>(target_triple);
     function_am.registerPass([&] { return TargetLibraryAnalysis(*tlii); });
 
-    // Initialize the AnalysisManagers
     pass_builder.registerModuleAnalyses(module_am);
     pass_builder.registerCGSCCAnalyses(cgscc_am);
     pass_builder.registerFunctionAnalyses(function_am);
     pass_builder.registerLoopAnalyses(loop_am);
     pass_builder.crossRegisterProxies(loop_am, function_am, cgscc_am, module_am);
 
-    pass_builder.registerPipelineStartEPCallback([&](ModulePassManager &module_pm, OptimizationLevel level) {
-        // Verify the input
+    pass_builder.registerPipelineStartEPCallback([options](ModulePassManager &module_pm, OptimizationLevel level) {
         if (assertions_on) {
             module_pm.addPass(VerifierPass());
         }
@@ -352,7 +290,6 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
             module_pm.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
         }
 
-        // GCOV profiling instrumentation
         if (options->gcov_profiling) {
             GCOVOptions gcov_opts = GCOVOptions::getDefault();
             module_pm.addPass(GCOVProfilerPass(gcov_opts));
@@ -361,14 +298,12 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
 
     const bool early_san = options->is_debug;
 
-    pass_builder.registerOptimizerEarlyEPCallback([&](ModulePassManager &module_pm, OptimizationLevel level, ThinOrFullLTOPhase lto_phase) {
+    pass_builder.registerOptimizerEarlyEPCallback([options, early_san](ModulePassManager &module_pm, OptimizationLevel level, ThinOrFullLTOPhase lto_phase) {
         if (early_san) {
-            // Code coverage instrumentation.
             if (options->sancov) {
                 module_pm.addPass(SanitizerCoveragePass(getSanCovOptions(options->coverage)));
             }
 
-            // Thread sanitizer
             if (options->tsan) {
                 module_pm.addPass(ModuleThreadSanitizerPass());
                 module_pm.addPass(createModuleToFunctionPassAdaptor(ThreadSanitizerPass()));
@@ -376,21 +311,18 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
         }
     });
 
-    pass_builder.registerOptimizerLastEPCallback([&](ModulePassManager &module_pm, OptimizationLevel level, ThinOrFullLTOPhase lto_phase) {
+    pass_builder.registerOptimizerLastEPCallback([options, early_san](ModulePassManager &module_pm, OptimizationLevel level, ThinOrFullLTOPhase lto_phase) {
         if (!early_san) {
-            // Code coverage instrumentation.
             if (options->sancov) {
                 module_pm.addPass(SanitizerCoveragePass(getSanCovOptions(options->coverage)));
             }
 
-            // Thread sanitizer
             if (options->tsan) {
                 module_pm.addPass(ModuleThreadSanitizerPass());
                 module_pm.addPass(createModuleToFunctionPassAdaptor(ThreadSanitizerPass()));
             }
         }
 
-        // Address sanitizer
         if (options->asan) {
             bool UseOdrIndicator = false;
             AddressSanitizerOptions Opts;
@@ -401,7 +333,6 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
             module_pm.addPass(AddressSanitizerPass(Opts, true, UseOdrIndicator, AsanDtorKind::Global, AsanCtorKind::Global));
         }
 
-        // Verify the output
         if (assertions_on) {
             module_pm.addPass(VerifierPass());
         }
@@ -409,7 +340,6 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
 
     ModulePassManager module_pm;
     OptimizationLevel opt_level;
-    // Setting up the optimization level
     if (options->is_debug)
       opt_level = OptimizationLevel::O0;
     else if (options->is_small)
@@ -417,7 +347,6 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
     else
       opt_level = OptimizationLevel::O3;
 
-    // Initialize the PassManager
     if (opt_level == OptimizationLevel::O0) {
       module_pm = pass_builder.buildO0DefaultPipeline(opt_level, static_cast<ThinOrFullLTOPhase>(options->lto));
     } else if (options->lto) {
@@ -426,38 +355,91 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
       module_pm = pass_builder.buildPerModuleDefaultPipeline(opt_level);
     }
 
-    // Optimization phase
     module_pm.run(llvm_module, module_am);
+}
 
-    // Code generation phase
-    // Check if we should use parallel codegen (same condition as will_use_parallel above)
-    bool use_parallel_codegen = options->bin_filename_list != nullptr &&
-                                !options->lto &&
-                                !options->asm_filename;
+ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machine_ref, LLVMModuleRef module_ref,
+    char **error_message, const ZigLLVMEmitOptions *options)
+{
+    TimePassesIsEnabled = options->time_report_out != nullptr;
+
+    raw_fd_ostream *dest_asm_ptr = nullptr;
+    raw_fd_ostream *dest_bin_ptr = nullptr;
+    raw_fd_ostream *dest_bitcode_ptr = nullptr;
 
-    if (use_parallel_codegen) {
-        // Count number of output files (NULL-terminated array)
-        unsigned NumThreads = 0;
+    if (options->asm_filename) {
+        std::error_code EC;
+        dest_asm_ptr = new(std::nothrow) raw_fd_ostream(options->asm_filename, EC, sys::fs::OF_None);
+        if (EC) {
+            *error_message = strdup((const char *)StringRef(EC.message()).bytes_begin());
+            return true;
+        }
+    }
+    // Decide up front whether to use the parallel split path. The split path
+    // optimizes and emits each partition independently, so it is disabled when
+    // an output requires the whole post-optimization module (LTO, asm) or when
+    // fewer than two partitions were requested. The caller must keep this in
+    // sync with the linker's expected object count; the Zig side never sets
+    // bin_filename_list together with asm or LTO.
+    unsigned NumThreads = 0;
+    if (options->bin_filename_list != nullptr) {
         while (options->bin_filename_list[NumThreads] != nullptr) {
             NumThreads++;
         }
+    }
+    bool will_use_parallel = NumThreads > 1 &&
+                             !options->lto &&
+                             !options->asm_filename;
 
-        if (NumThreads <= 1) {
-            use_parallel_codegen = false;
+    if (options->bin_filename && !will_use_parallel) {
+        std::error_code EC;
+        dest_bin_ptr = new(std::nothrow) raw_fd_ostream(options->bin_filename, EC, sys::fs::OF_None);
+        if (EC) {
+            *error_message = strdup((const char *)StringRef(EC.message()).bytes_begin());
+            return true;
         }
     }
-
-    if (use_parallel_codegen) {
-        // Parallel code generation path
-        unsigned NumThreads = 0;
-        while (options->bin_filename_list[NumThreads] != nullptr) {
-            NumThreads++;
+    if (options->bitcode_filename) {
+        std::error_code EC;
+        dest_bitcode_ptr = new(std::nothrow) raw_fd_ostream(options->bitcode_filename, EC, sys::fs::OF_None);
+        if (EC) {
+            *error_message = strdup((const char *)StringRef(EC.message()).bytes_begin());
+            return true;
         }
+    }
+
+    std::unique_ptr<raw_fd_ostream> dest_asm(dest_asm_ptr),
+                                    dest_bin(dest_bin_ptr),
+                                    dest_bitcode(dest_bitcode_ptr);
+
+
+    auto PID = sys::Process::getProcessId();
+    std::string ProcName = "zig-";
+    ProcName += std::to_string(PID);
+    TimeTracerRAII TimeTracer(ProcName,
+                              options->bin_filename? options->bin_filename : options->asm_filename);
 
+    TargetMachine &target_machine = *reinterpret_cast<TargetMachine*>(targ_machine_ref);
+
+    if (options->allow_fast_isel) {
+        target_machine.setO0WantsFastISel(true);
+    } else {
+        target_machine.setFastISel(false);
+    }
+
+    if (!options->allow_machine_outliner) {
+        target_machine.setMachineOutliner(false);
+    }
+
+    Module &llvm_module = *unwrap(module_ref);
+
+    if (will_use_parallel) {
+        // Parallel path: split the unoptimized module into N partitions, then run
+        // the full optimization pipeline and object emission on each partition
+        // concurrently. Each worker owns its own LLVMContext and TargetMachine.
         std::vector<std::unique_ptr<raw_fd_ostream>> temp_streams;
         std::vector<raw_pwrite_stream *> stream_ptrs;
 
-        // Create N output streams using the provided filenames
         for (unsigned i = 0; i < NumThreads; ++i) {
             std::error_code EC;
             auto stream = std::make_unique<raw_fd_ostream>(options->bin_filename_list[i], EC, sys::fs::OF_None);
@@ -469,10 +451,9 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
             temp_streams.push_back(std::move(stream));
         }
 
-        // TargetMachine factory - creates a new TM for each thread
         Target *TheTarget = reinterpret_cast<Target*>(const_cast<void*>(
             reinterpret_cast<const void*>(&target_machine.getTarget())));
-        std::string Triple = std::string(target_machine.getTargetTriple().str());
+        std::string TripleStr = std::string(target_machine.getTargetTriple().str());
         std::string CPU = std::string(target_machine.getTargetCPU());
         std::string Features = std::string(target_machine.getTargetFeatureString());
         CodeGenOptLevel CGOptLevel = target_machine.getOptLevel();
@@ -482,16 +463,18 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
 
         auto TMFactory = [=]() -> std::unique_ptr<TargetMachine> {
             std::unique_ptr<TargetMachine> TM(TheTarget->createTargetMachine(
-                Triple, CPU, Features, Opts, RM, CM, CGOptLevel, false));
+                TripleStr, CPU, Features, Opts, RM, CM, CGOptLevel, false));
             if (options->allow_fast_isel) {
                 TM->setO0WantsFastISel(true);
             } else {
                 TM->setFastISel(false);
             }
+            if (!options->allow_machine_outliner) {
+                TM->setMachineOutliner(false);
+            }
             return TM;
         };
 
-        // Manual parallel code generation (same as llvm::splitCodeGen)
         {
             llvm::StdThreadPool CodegenThreadPool(llvm::hardware_concurrency(NumThreads));
             std::atomic<unsigned> ThreadCount(0);
@@ -506,7 +489,7 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
                     llvm::raw_pwrite_stream *ThreadOS = stream_ptrs[ThreadCount++];
 
                     CodegenThreadPool.async(
-                        [TMFactory, ThreadOS](const SmallString<0> &BC) {
+                        [TMFactory, ThreadOS, options](const SmallString<0> &BC) {
                             LLVMContext Ctx;
                             auto BufferRef = MemoryBufferRef(StringRef(BC.data(), BC.size()), "<split-module>");
                             Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(BufferRef, Ctx);
@@ -520,6 +503,9 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
                             std::unique_ptr<Module> MPartInCtx = std::move(*MOrErr);
 
                             std::unique_ptr<TargetMachine> TM = TMFactory();
+
+                            runOptimizationPipeline(*MPartInCtx, *TM, options);
+
                             legacy::PassManager CodeGenPasses;
                             if (TM->addPassesToEmitFile(CodeGenPasses, *ThreadOS, nullptr, CodeGenFileType::ObjectFile))
                                 report_fatal_error("Failed to setup codegen");
@@ -527,19 +513,17 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
                         },
                         std::move(BC));
                 },
-                true);  // avoid symbol globalization overhead
+                false);
         }
 
-        // Flush and close streams
         for (auto &stream : temp_streams) {
             stream->flush();
         }
         temp_streams.clear();
-
-        // Output files are now: bin_filename.0.o, bin_filename.1.o, ..., bin_filename.(N-1).o
-        // The linker will automatically pick up all of them
     } else {
-        // Single-threaded code generation path (original)
+        // Serial path: optimize the whole module, then emit.
+        runOptimizationPipeline(llvm_module, target_machine, options);
+
         legacy::PassManager codegen_pm;
         codegen_pm.add(
           createTargetTransformInfoWrapperPass(target_machine.getTargetIRAnalysis()));

From 09c8f59cfc6457c6f9f6b517ed1f19af1154c99e Mon Sep 17 00:00:00 2001
From: Alistair Smith <hi@alistair.sh>
Date: Thu, 9 Apr 2026 15:58:08 -0700
Subject: [PATCH 02/15] psema: fix 14 races and correctness bugs found via
 adversarial sweep

Memory model (ARM64):
- getNav seqlock: payload loads .unordered -> .acquire so b2 cannot
  reorder before them (LDAR-before-LDAR is ordered; @fence is gone).
- setFieldTypesAlignsAll: memcpy [0..len-1) then release-store [len-1]
  inside the mutex; remove the post-mutex re-store in structFields.

.removed/.existing race cluster:
- awaitNamespaceTypeFinished returns {finished, cancelled}.
- 9 Sema .existing arms (zir*Decl, anon-struct-init, reify*) wrapped in
  gop:while(true) retry loops; cancelled re-runs get*Type.
- getOrPutKeyInner locked re-probe skips .removed (mirror lockless path).

Retry/requeue:
- codegen_func: reset tls_retry_loop before resolveTypesFully; on retry
  requeue the job instead of dropping the body.
- ensureMemoizedStateUpToDate: re-probe sentinel decl on .done.

Misc:
- deleteUnitReferences: capture parent + write self-loop marker before
  free-list append, all under inline_ref_mutex (fixes UAF on realloc).
- test_functions.contains: take test_functions_mutex.
- Lld coffLink/wasmLink: error on multi-shard build-obj instead of
  silently dropping shards 1..N-1.
- PartitionSet.emit: keep asm_path for shard 0.
- types_resolved: propagate OOM instead of swallowing as false.
- dumpLlvmShardStats: clamp n<=256; per-(file,shard) top-file key.
---
 src/Air.zig                |  10 ++-
 src/Air/types_resolved.zig | 172 ++++++++++++++++++++-----------------
 src/Compilation.zig        |  22 +++--
 src/InternPool.zig         |  40 +++++++--
 src/Sema.zig               | 117 ++++++++++++++++---------
 src/Type.zig               |  13 +--
 src/Zcu.zig                |  41 +++++----
 src/Zcu/PerThread.zig      |  18 +++-
 src/codegen/llvm.zig       |  56 ++++++------
 src/link/Lld.zig           |  16 +++-
 10 files changed, 308 insertions(+), 197 deletions(-)

diff --git a/src/Air.zig b/src/Air.zig
index 97dcc52c44b9..ec2dc44f252e 100644
--- a/src/Air.zig
+++ b/src/Air.zig
@@ -2155,8 +2155,14 @@ pub fn unwrapShuffleTwo(air: *const Air, zcu: *const Zcu, inst_index: Inst.Index
 
 pub const typesFullyResolved = types_resolved.typesFullyResolved;
 pub const resolveTypesFully = types_resolved.resolveTypesFully;
-pub const typeFullyResolved = types_resolved.checkType;
-pub const valFullyResolved = types_resolved.checkVal;
+/// `checkType`/`checkVal` only allocate when `tls_resolve_pt` is set (i.e. via
+/// `resolveTypesFully`); these wrappers are for the non-resolving query path.
+pub fn typeFullyResolved(ty: Type, zcu: *Zcu) bool {
+    return types_resolved.checkType(ty, zcu) catch unreachable;
+}
+pub fn valFullyResolved(val: Value, zcu: *Zcu) bool {
+    return types_resolved.checkVal(val, zcu) catch unreachable;
+}
 pub const legalize = Legalize.legalize;
 pub const write = print.write;
 pub const writeInst = print.writeInst;
diff --git a/src/Air/types_resolved.zig b/src/Air/types_resolved.zig
index 8c4c69fa9c63..cd53a274a62f 100644
--- a/src/Air/types_resolved.zig
+++ b/src/Air/types_resolved.zig
@@ -1,3 +1,5 @@
+const std = @import("std");
+const Allocator = std.mem.Allocator;
 const Air = @import("../Air.zig");
 const Zcu = @import("../Zcu.zig");
 const Type = @import("../Type.zig");
@@ -7,7 +9,9 @@ const InternPool = @import("../InternPool.zig");
 /// Given a body of AIR instructions, returns whether all type resolution necessary for codegen is complete.
 /// If `false`, then type resolution must have failed, so codegen cannot proceed.
 pub fn typesFullyResolved(air: Air, zcu: *Zcu) bool {
-    return checkBody(air, air.getMainBody(), zcu);
+    // `tls_resolve_pt` is null here, so `resolveFully` is never called and
+    // `checkBody` cannot return `error.OutOfMemory`.
+    return checkBody(air, air.getMainBody(), zcu) catch unreachable;
 }
 
 /// Under parallel Sema, `resolve_type_fully` and `codegen_func` run
@@ -15,7 +19,7 @@ pub fn typesFullyResolved(air: Air, zcu: *Zcu) bool {
 /// same AIR shape as `typesFullyResolved` but force-resolve each struct/union
 /// (blocking on `claimOrWait`-gated resolution). Returns false only if
 /// resolution itself errors.
-pub fn resolveTypesFully(air: Air, pt: Zcu.PerThread) bool {
+pub fn resolveTypesFully(air: Air, pt: Zcu.PerThread) Allocator.Error!bool {
     tls_resolve_pt = pt;
     defer tls_resolve_pt = null;
     return checkBody(air, air.getMainBody(), pt.zcu);
@@ -25,7 +29,7 @@ pub fn resolveTypesFully(air: Air, pt: Zcu.PerThread) bool {
 /// PerThread via tls instead of plumbing it through every switch arm.
 threadlocal var tls_resolve_pt: ?Zcu.PerThread = null;
 
-fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
+fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) Allocator.Error!bool {
     const tags = air.instructions.items(.tag);
     const datas = air.instructions.items(.data);
 
@@ -35,7 +39,7 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
             .inferred_alloc, .inferred_alloc_comptime => unreachable,
 
             .arg => {
-                if (!checkType(data.arg.ty.toType(), zcu)) return false;
+                if (!try checkType(data.arg.ty.toType(), zcu)) return false;
             },
 
             .add,
@@ -104,8 +108,8 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
             .atomic_store_release,
             .atomic_store_seq_cst,
             => {
-                if (!checkRef(data.bin_op.lhs, zcu)) return false;
-                if (!checkRef(data.bin_op.rhs, zcu)) return false;
+                if (!try checkRef(data.bin_op.lhs, zcu)) return false;
+                if (!try checkRef(data.bin_op.rhs, zcu)) return false;
             },
 
             .not,
@@ -154,15 +158,15 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
             .c_va_arg,
             .c_va_copy,
             => {
-                if (!checkType(data.ty_op.ty.toType(), zcu)) return false;
-                if (!checkRef(data.ty_op.operand, zcu)) return false;
+                if (!try checkType(data.ty_op.ty.toType(), zcu)) return false;
+                if (!try checkRef(data.ty_op.operand, zcu)) return false;
             },
 
             .alloc,
             .ret_ptr,
             .c_va_start,
             => {
-                if (!checkType(data.ty, zcu)) return false;
+                if (!try checkType(data.ty, zcu)) return false;
             },
 
             .ptr_add,
@@ -176,17 +180,17 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
             .ptr_elem_ptr,
             => {
                 const bin = air.extraData(Air.Bin, data.ty_pl.payload).data;
-                if (!checkType(data.ty_pl.ty.toType(), zcu)) return false;
-                if (!checkRef(bin.lhs, zcu)) return false;
-                if (!checkRef(bin.rhs, zcu)) return false;
+                if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false;
+                if (!try checkRef(bin.lhs, zcu)) return false;
+                if (!try checkRef(bin.rhs, zcu)) return false;
             },
 
             .block,
             .loop,
             => {
                 const extra = air.extraData(Air.Block, data.ty_pl.payload);
-                if (!checkType(data.ty_pl.ty.toType(), zcu)) return false;
-                if (!checkBody(
+                if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false;
+                if (!try checkBody(
                     air,
                     @ptrCast(air.extra.items[extra.end..][0..extra.data.body_len]),
                     zcu,
@@ -195,8 +199,8 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
 
             .dbg_inline_block => {
                 const extra = air.extraData(Air.DbgInlineBlock, data.ty_pl.payload);
-                if (!checkType(data.ty_pl.ty.toType(), zcu)) return false;
-                if (!checkBody(
+                if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false;
+                if (!try checkBody(
                     air,
                     @ptrCast(air.extra.items[extra.end..][0..extra.data.body_len]),
                     zcu,
@@ -236,51 +240,51 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
             .c_va_end,
             .set_err_return_trace,
             => {
-                if (!checkRef(data.un_op, zcu)) return false;
+                if (!try checkRef(data.un_op, zcu)) return false;
             },
 
             .br, .switch_dispatch => {
-                if (!checkRef(data.br.operand, zcu)) return false;
+                if (!try checkRef(data.br.operand, zcu)) return false;
             },
 
             .cmp_vector,
             .cmp_vector_optimized,
             => {
                 const extra = air.extraData(Air.VectorCmp, data.ty_pl.payload).data;
-                if (!checkType(data.ty_pl.ty.toType(), zcu)) return false;
-                if (!checkRef(extra.lhs, zcu)) return false;
-                if (!checkRef(extra.rhs, zcu)) return false;
+                if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false;
+                if (!try checkRef(extra.lhs, zcu)) return false;
+                if (!try checkRef(extra.rhs, zcu)) return false;
             },
 
             .reduce,
             .reduce_optimized,
             => {
-                if (!checkRef(data.reduce.operand, zcu)) return false;
+                if (!try checkRef(data.reduce.operand, zcu)) return false;
             },
 
             .struct_field_ptr,
             .struct_field_val,
             => {
                 const extra = air.extraData(Air.StructField, data.ty_pl.payload).data;
-                if (!checkType(data.ty_pl.ty.toType(), zcu)) return false;
-                if (!checkRef(extra.struct_operand, zcu)) return false;
+                if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false;
+                if (!try checkRef(extra.struct_operand, zcu)) return false;
             },
 
             .shuffle_one => {
                 const unwrapped = air.unwrapShuffleOne(zcu, inst);
-                if (!checkType(unwrapped.result_ty, zcu)) return false;
-                if (!checkRef(unwrapped.operand, zcu)) return false;
+                if (!try checkType(unwrapped.result_ty, zcu)) return false;
+                if (!try checkRef(unwrapped.operand, zcu)) return false;
                 for (unwrapped.mask) |m| switch (m.unwrap()) {
                     .elem => {},
-                    .value => |val| if (!checkVal(.fromInterned(val), zcu)) return false,
+                    .value => |val| if (!try checkVal(.fromInterned(val), zcu)) return false,
                 };
             },
 
             .shuffle_two => {
                 const unwrapped = air.unwrapShuffleTwo(zcu, inst);
-                if (!checkType(unwrapped.result_ty, zcu)) return false;
-                if (!checkRef(unwrapped.operand_a, zcu)) return false;
-                if (!checkRef(unwrapped.operand_b, zcu)) return false;
+                if (!try checkType(unwrapped.result_ty, zcu)) return false;
+                if (!try checkRef(unwrapped.operand_a, zcu)) return false;
+                if (!try checkRef(unwrapped.operand_b, zcu)) return false;
                 // No values to check because there are no comptime-known values other than undef
             },
 
@@ -288,73 +292,73 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
             .cmpxchg_strong,
             => {
                 const extra = air.extraData(Air.Cmpxchg, data.ty_pl.payload).data;
-                if (!checkType(data.ty_pl.ty.toType(), zcu)) return false;
-                if (!checkRef(extra.ptr, zcu)) return false;
-                if (!checkRef(extra.expected_value, zcu)) return false;
-                if (!checkRef(extra.new_value, zcu)) return false;
+                if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false;
+                if (!try checkRef(extra.ptr, zcu)) return false;
+                if (!try checkRef(extra.expected_value, zcu)) return false;
+                if (!try checkRef(extra.new_value, zcu)) return false;
             },
 
             .aggregate_init => {
                 const ty = data.ty_pl.ty.toType();
                 const elems_len: usize = @intCast(ty.arrayLen(zcu));
                 const elems: []const Air.Inst.Ref = @ptrCast(air.extra.items[data.ty_pl.payload..][0..elems_len]);
-                if (!checkType(ty, zcu)) return false;
+                if (!try checkType(ty, zcu)) return false;
                 if (ty.zigTypeTag(zcu) == .@"struct") {
                     for (elems, 0..) |elem, elem_idx| {
                         if (ty.structFieldIsComptime(elem_idx, zcu)) continue;
-                        if (!checkRef(elem, zcu)) return false;
+                        if (!try checkRef(elem, zcu)) return false;
                     }
                 } else {
                     for (elems) |elem| {
-                        if (!checkRef(elem, zcu)) return false;
+                        if (!try checkRef(elem, zcu)) return false;
                     }
                 }
             },
 
             .union_init => {
                 const extra = air.extraData(Air.UnionInit, data.ty_pl.payload).data;
-                if (!checkType(data.ty_pl.ty.toType(), zcu)) return false;
-                if (!checkRef(extra.init, zcu)) return false;
+                if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false;
+                if (!try checkRef(extra.init, zcu)) return false;
             },
 
             .field_parent_ptr => {
                 const extra = air.extraData(Air.FieldParentPtr, data.ty_pl.payload).data;
-                if (!checkType(data.ty_pl.ty.toType(), zcu)) return false;
-                if (!checkRef(extra.field_ptr, zcu)) return false;
+                if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false;
+                if (!try checkRef(extra.field_ptr, zcu)) return false;
             },
 
             .atomic_load => {
-                if (!checkRef(data.atomic_load.ptr, zcu)) return false;
+                if (!try checkRef(data.atomic_load.ptr, zcu)) return false;
             },
 
             .prefetch => {
-                if (!checkRef(data.prefetch.ptr, zcu)) return false;
+                if (!try checkRef(data.prefetch.ptr, zcu)) return false;
             },
 
             .vector_store_elem => {
                 const bin = air.extraData(Air.Bin, data.vector_store_elem.payload).data;
-                if (!checkRef(data.vector_store_elem.vector_ptr, zcu)) return false;
-                if (!checkRef(bin.lhs, zcu)) return false;
-                if (!checkRef(bin.rhs, zcu)) return false;
+                if (!try checkRef(data.vector_store_elem.vector_ptr, zcu)) return false;
+                if (!try checkRef(bin.lhs, zcu)) return false;
+                if (!try checkRef(bin.rhs, zcu)) return false;
             },
 
             .runtime_nav_ptr => {
-                if (!checkType(.fromInterned(data.ty_nav.ty), zcu)) return false;
+                if (!try checkType(.fromInterned(data.ty_nav.ty), zcu)) return false;
             },
 
             .select,
             .mul_add,
             => {
                 const bin = air.extraData(Air.Bin, data.pl_op.payload).data;
-                if (!checkRef(data.pl_op.operand, zcu)) return false;
-                if (!checkRef(bin.lhs, zcu)) return false;
-                if (!checkRef(bin.rhs, zcu)) return false;
+                if (!try checkRef(data.pl_op.operand, zcu)) return false;
+                if (!try checkRef(bin.lhs, zcu)) return false;
+                if (!try checkRef(bin.rhs, zcu)) return false;
             },
 
             .atomic_rmw => {
                 const extra = air.extraData(Air.AtomicRmw, data.pl_op.payload).data;
-                if (!checkRef(data.pl_op.operand, zcu)) return false;
-                if (!checkRef(extra.operand, zcu)) return false;
+                if (!try checkRef(data.pl_op.operand, zcu)) return false;
+                if (!try checkRef(extra.operand, zcu)) return false;
             },
 
             .call,
@@ -364,21 +368,21 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
             => {
                 const extra = air.extraData(Air.Call, data.pl_op.payload);
                 const args: []const Air.Inst.Ref = @ptrCast(air.extra.items[extra.end..][0..extra.data.args_len]);
-                if (!checkRef(data.pl_op.operand, zcu)) return false;
-                for (args) |arg| if (!checkRef(arg, zcu)) return false;
+                if (!try checkRef(data.pl_op.operand, zcu)) return false;
+                for (args) |arg| if (!try checkRef(arg, zcu)) return false;
             },
 
             .dbg_var_ptr,
             .dbg_var_val,
             .dbg_arg_inline,
             => {
-                if (!checkRef(data.pl_op.operand, zcu)) return false;
+                if (!try checkRef(data.pl_op.operand, zcu)) return false;
             },
 
             .@"try", .try_cold => {
                 const extra = air.extraData(Air.Try, data.pl_op.payload);
-                if (!checkRef(data.pl_op.operand, zcu)) return false;
-                if (!checkBody(
+                if (!try checkRef(data.pl_op.operand, zcu)) return false;
+                if (!try checkBody(
                     air,
                     @ptrCast(air.extra.items[extra.end..][0..extra.data.body_len]),
                     zcu,
@@ -387,9 +391,9 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
 
             .try_ptr, .try_ptr_cold => {
                 const extra = air.extraData(Air.TryPtr, data.ty_pl.payload);
-                if (!checkType(data.ty_pl.ty.toType(), zcu)) return false;
-                if (!checkRef(extra.data.ptr, zcu)) return false;
-                if (!checkBody(
+                if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false;
+                if (!try checkRef(extra.data.ptr, zcu)) return false;
+                if (!try checkBody(
                     air,
                     @ptrCast(air.extra.items[extra.end..][0..extra.data.body_len]),
                     zcu,
@@ -398,13 +402,13 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
 
             .cond_br => {
                 const extra = air.extraData(Air.CondBr, data.pl_op.payload);
-                if (!checkRef(data.pl_op.operand, zcu)) return false;
-                if (!checkBody(
+                if (!try checkRef(data.pl_op.operand, zcu)) return false;
+                if (!try checkBody(
                     air,
                     @ptrCast(air.extra.items[extra.end..][0..extra.data.then_body_len]),
                     zcu,
                 )) return false;
-                if (!checkBody(
+                if (!try checkBody(
                     air,
                     @ptrCast(air.extra.items[extra.end + extra.data.then_body_len ..][0..extra.data.else_body_len]),
                     zcu,
@@ -413,29 +417,29 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
 
             .switch_br, .loop_switch_br => {
                 const switch_br = air.unwrapSwitch(inst);
-                if (!checkRef(switch_br.operand, zcu)) return false;
+                if (!try checkRef(switch_br.operand, zcu)) return false;
                 var it = switch_br.iterateCases();
                 while (it.next()) |case| {
-                    for (case.items) |item| if (!checkRef(item, zcu)) return false;
+                    for (case.items) |item| if (!try checkRef(item, zcu)) return false;
                     for (case.ranges) |range| {
-                        if (!checkRef(range[0], zcu)) return false;
-                        if (!checkRef(range[1], zcu)) return false;
+                        if (!try checkRef(range[0], zcu)) return false;
+                        if (!try checkRef(range[1], zcu)) return false;
                     }
-                    if (!checkBody(air, case.body, zcu)) return false;
+                    if (!try checkBody(air, case.body, zcu)) return false;
                 }
-                if (!checkBody(air, it.elseBody(), zcu)) return false;
+                if (!try checkBody(air, it.elseBody(), zcu)) return false;
             },
 
             .assembly => {
                 const extra = air.extraData(Air.Asm, data.ty_pl.payload);
-                if (!checkType(data.ty_pl.ty.toType(), zcu)) return false;
+                if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false;
                 // Luckily, we only care about the inputs and outputs, so we don't have to do
                 // the whole null-terminated string dance.
                 const outputs_len = extra.data.flags.outputs_len;
                 const outputs: []const Air.Inst.Ref = @ptrCast(air.extra.items[extra.end..][0..outputs_len]);
                 const inputs: []const Air.Inst.Ref = @ptrCast(air.extra.items[extra.end + outputs_len ..][0..extra.data.inputs_len]);
-                for (outputs) |output| if (output != .none and !checkRef(output, zcu)) return false;
-                for (inputs) |input| if (input != .none and !checkRef(input, zcu)) return false;
+                for (outputs) |output| if (output != .none and !try checkRef(output, zcu)) return false;
+                for (inputs) |input| if (input != .none and !try checkRef(input, zcu)) return false;
             },
 
             .trap,
@@ -459,7 +463,7 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
     return true;
 }
 
-fn checkRef(ref: Air.Inst.Ref, zcu: *Zcu) bool {
+fn checkRef(ref: Air.Inst.Ref, zcu: *Zcu) Allocator.Error!bool {
     const ip_index = ref.toInterned() orelse {
         // This operand refers back to a previous instruction.
         // We have already checked that instruction's type.
@@ -469,11 +473,11 @@ fn checkRef(ref: Air.Inst.Ref, zcu: *Zcu) bool {
     return checkVal(Value.fromInterned(ip_index), zcu);
 }
 
-pub fn checkVal(val: Value, zcu: *Zcu) bool {
+pub fn checkVal(val: Value, zcu: *Zcu) Allocator.Error!bool {
     const ty = val.typeOf(zcu);
-    if (!checkType(ty, zcu)) return false;
+    if (!try checkType(ty, zcu)) return false;
     if (val.isUndef(zcu)) return true;
-    if (ty.toIntern() == .type_type and !checkType(val.toType(), zcu)) return false;
+    if (ty.toIntern() == .type_type and !try checkType(val.toType(), zcu)) return false;
     // Check for lazy values
     switch (zcu.intern_pool.indexToKey(val.toIntern())) {
         .int => |int| switch (int.storage) {
@@ -486,7 +490,7 @@ pub fn checkVal(val: Value, zcu: *Zcu) bool {
     }
 }
 
-pub fn checkType(ty: Type, zcu: *Zcu) bool {
+pub fn checkType(ty: Type, zcu: *Zcu) Allocator.Error!bool {
     const ip = &zcu.intern_pool;
     if (ty.isGenericPoison()) return true;
     return switch (ty.zigTypeTag(zcu)) {
@@ -522,14 +526,17 @@ pub fn checkType(ty: Type, zcu: *Zcu) bool {
             const info = zcu.typeToFunc(ty).?;
             for (0..info.param_types.len) |i| {
                 const param_ty = info.param_types.get(ip)[i];
-                if (!checkType(Type.fromInterned(param_ty), zcu)) return false;
+                if (!try checkType(Type.fromInterned(param_ty), zcu)) return false;
             }
             return checkType(Type.fromInterned(info.return_type), zcu);
         },
         .@"struct" => switch (ip.indexToKey(ty.toIntern())) {
             .struct_type => {
                 if (tls_resolve_pt) |pt| {
-                    ty.resolveFully(pt) catch return false;
+                    ty.resolveFully(pt) catch |e| switch (e) {
+                        error.OutOfMemory => |oom| return oom,
+                        error.AnalysisFail => return false,
+                    };
                     return true;
                 }
                 const struct_obj = zcu.typeToStruct(ty).?;
@@ -543,7 +550,7 @@ pub fn checkType(ty: Type, zcu: *Zcu) bool {
                     const field_is_comptime = tuple.values.get(ip)[i] != .none;
                     if (field_is_comptime) continue;
                     const field_ty = tuple.types.get(ip)[i];
-                    if (!checkType(Type.fromInterned(field_ty), zcu)) return false;
+                    if (!try checkType(Type.fromInterned(field_ty), zcu)) return false;
                 }
                 return true;
             },
@@ -551,7 +558,10 @@ pub fn checkType(ty: Type, zcu: *Zcu) bool {
         },
         .@"union" => {
             if (tls_resolve_pt) |pt| {
-                ty.resolveFully(pt) catch return false;
+                ty.resolveFully(pt) catch |e| switch (e) {
+                    error.OutOfMemory => |oom| return oom,
+                    error.AnalysisFail => return false,
+                };
                 return true;
             }
             return zcu.typeToUnion(ty).?.flagsUnordered(ip).status == .fully_resolved;
diff --git a/src/Compilation.zig b/src/Compilation.zig
index b0519296e51b..bf0f2a9dc2da 100644
--- a/src/Compilation.zig
+++ b/src/Compilation.zig
@@ -3302,12 +3302,13 @@ pub fn update(comp: *Compilation, main_progress_node: std.Progress.Node) UpdateE
 
 fn dumpLlvmShardStats(comp: *Compilation, zcu: *Zcu) void {
     const ip = &zcu.intern_pool;
-    const n: u32 = if (comp.llvm_codegen_threads > 1) comp.llvm_codegen_threads else 16;
+    const n: u32 = @min(if (comp.llvm_codegen_threads > 1) comp.llvm_codegen_threads else 16, 256);
     var counts = [_]u32{0} ** 256;
     var top_file = [_]?*Zcu.File{null} ** 256;
     var top_file_count = [_]u32{0} ** 256;
 
-    var per_file = std.AutoHashMap(*Zcu.File, u32).init(comp.gpa);
+    const PerFileKey = struct { file: *Zcu.File, shard: u8 };
+    var per_file = std.AutoHashMap(PerFileKey, u32).init(comp.gpa);
     defer per_file.deinit();
 
     const total_navs = ip.navCount();
@@ -3324,7 +3325,7 @@ fn dumpLlvmShardStats(comp: *Compilation, zcu: *Zcu) void {
         const shard: u8 = @intCast(std.hash.Wyhash.hash(0, fqn) % n);
         counts[shard] += 1;
         const file = zcu.fileByIndex(nav.srcInst(ip).resolveFile(ip));
-        const gop = per_file.getOrPut(file) catch continue;
+        const gop = per_file.getOrPut(.{ .file = file, .shard = shard }) catch continue;
         if (!gop.found_existing) gop.value_ptr.* = 0;
         gop.value_ptr.* += 1;
         if (gop.value_ptr.* > top_file_count[shard]) {
@@ -3469,7 +3470,7 @@ fn flush(
                     base_path_slice;
 
                 for (0..num_threads) |i| {
-                    list[i] = (try std.fmt.allocPrintSentinel(arena, "{s}.{d}.o", .{base_name, i}, 0)).ptr;
+                    list[i] = (try std.fmt.allocPrintSentinel(arena, "{s}.{d}.o", .{ base_name, i }, 0)).ptr;
                 }
                 break :blk list;
             } else null;
@@ -5305,12 +5306,18 @@ fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void {
             // body would leave a dangling cross-shard `__N<nav>` undef.
             // Force-resolve via `resolveTypesFully`, which blocks on the
             // claimOrWait-gated resolution; drop only if that errors.
+            Zcu.tls_retry_loop = null;
             const types_ok: bool = if (zcu.parallel_sema) ok: {
                 const pt: Zcu.PerThread = .activate(zcu, @enumFromInt(tid));
                 defer pt.deactivate();
-                break :ok air.resolveTypesFully(pt);
+                break :ok try air.resolveTypesFully(pt);
             } else air.typesFullyResolved(zcu);
             if (!types_ok) {
+                if (Zcu.tls_retry_loop != null) {
+                    Zcu.tls_retry_loop = null;
+                    try comp.queueJob(.{ .codegen_func = func });
+                    return;
+                }
                 // Type resolution failed in a way which affects this function. This is a transitive
                 // failure, but it doesn't need recording, because this function semantically depends
                 // on the failed type, so when it is changed the function is updated.
@@ -5441,7 +5448,10 @@ fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void {
 
                 // Check if this is a test function.
                 const ip = &pt.zcu.intern_pool;
-                if (!pt.zcu.test_functions.contains(nav)) {
+                pt.zcu.test_functions_mutex.lock();
+                const is_test = pt.zcu.test_functions.contains(nav);
+                pt.zcu.test_functions_mutex.unlock();
+                if (!is_test) {
                     break :queue_test_analysis;
                 }
 
diff --git a/src/InternPool.zig b/src/InternPool.zig
index c5fe978be5fc..768b0b6adcce 100644
--- a/src/InternPool.zig
+++ b/src/InternPool.zig
@@ -3895,8 +3895,15 @@ pub const LoadedStructType = struct {
         const extra_mutex = &ip.getLocal(s.tid).mutate.extra.mutex;
         extra_mutex.lock();
         defer extra_mutex.unlock();
-        @memcpy(s.field_types.get(ip), types);
+        if (types.len == 0) return;
+        const field_types_ptr = s.field_types.get(ip);
+        @memcpy(field_types_ptr[0 .. types.len - 1], types[0 .. types.len - 1]);
         if (aligns) |a| if (s.field_aligns.len != 0) @memcpy(s.field_aligns.get(ip), a);
+        // Release-store the last slot so the unlocked `haveFieldTypes`
+        // acquire fast-path synchronises-with this and sees all preceding
+        // type/align slot writes; readers must never observe a non-.none
+        // last slot via plain memcpy with no happens-before.
+        @atomicStore(InternPool.Index, &field_types_ptr[types.len - 1], types[types.len - 1], .release);
     }
 
     pub fn fieldAlign(s: LoadedStructType, ip: *const InternPool, i: usize) Alignment {
@@ -7871,9 +7878,15 @@ pub const wip_namespace_sentinel: u32 = std.math.maxInt(u32);
 /// `NamespaceIndex` (0 is a valid index).
 pub const cancelled_namespace_sentinel: u32 = std.math.maxInt(u32) - 1;
 
-/// Spin until `ty`'s namespace slot is no longer the wip sentinel.
-pub fn awaitNamespaceTypeFinished(ip: *const InternPool, ty: Index) void {
-    const ns_idx = ip.namespaceTypeNamespaceExtraIndex(ty) orelse return;
+pub const NamespaceTypeAwaitResult = enum { finished, cancelled };
+
+/// Spin until `ty`'s namespace slot is no longer the wip sentinel. Returns
+/// `.cancelled` if the wip owner invoked `cancel` (slot now holds
+/// `cancelled_namespace_sentinel`); the caller must not use `ty` and should
+/// retry the originating `get*Type` call, which will skip the now-`.removed`
+/// map entry and allocate fresh.
+pub fn awaitNamespaceTypeFinished(ip: *const InternPool, ty: Index) NamespaceTypeAwaitResult {
+    const ns_idx = ip.namespaceTypeNamespaceExtraIndex(ty) orelse return .finished;
     const unwrapped = ty.unwrap(ip);
     while (true) {
         // Re-acquire the shared view each iteration: the owning tid may
@@ -7881,8 +7894,12 @@ pub fn awaitNamespaceTypeFinished(ip: *const InternPool, ty: Index) void {
         // would leave a cached slot pointer dangling at the old buffer.
         const extra = ip.getLocalShared(unwrapped.tid).extra.acquire();
         const slot: *const u32 = &extra.view().items(.@"0")[ns_idx];
-        if (@atomicLoad(u32, slot, .acquire) != wip_namespace_sentinel) return;
-        std.atomic.spinLoopHint();
+        const loaded = @atomicLoad(u32, slot, .acquire);
+        if (loaded == wip_namespace_sentinel) {
+            std.atomic.spinLoopHint();
+            continue;
+        }
+        return if (loaded == cancelled_namespace_sentinel) .cancelled else .finished;
     }
 }
 
@@ -7967,6 +7984,7 @@ fn getOrPutKeyInner(
         const index = entry.value;
         if (index == .none) break;
         if (entry.hash != hash) continue;
+        if (index.unwrap(ip).getTag(ip) == .removed) continue;
         if (ip.indexToKey(index).eql(key, ip)) {
             if (!prelocked) shard.mutate.map.mutex.unlock();
             return .{ .existing = index };
@@ -11829,8 +11847,14 @@ pub fn getNav(ip: *const InternPool, index: Nav.Index) Nav {
     // with a new payload (or vice versa).
     while (true) {
         const b1 = @atomicLoad(Nav.Repr.Bits, bits_ptr, .acquire);
-        repr.type_or_val = @atomicLoad(InternPool.Index, tov_ptr, .unordered);
-        repr.@"linksection" = @atomicLoad(OptionalNullTerminatedString, ls_ptr, .unordered);
+        // Payload loads must be .acquire (not .unordered): on ARM64 a plain
+        // LDR po-before LDAR (b2) is not barrier-ordered-before it and may
+        // reorder past b2, yielding a torn read that still passes b1 == b2.
+        // LDAR-before-LDAR is ordered, so .acquire here closes the window
+        // (the `@fence(.acquire)` builtin is gone; this is the fence-free
+        // equivalent for the seqlock read side).
+        repr.type_or_val = @atomicLoad(InternPool.Index, tov_ptr, .acquire);
+        repr.@"linksection" = @atomicLoad(OptionalNullTerminatedString, ls_ptr, .acquire);
         const b2 = @atomicLoad(Nav.Repr.Bits, bits_ptr, .acquire);
         if (!b1.writing and @as(u16, @bitCast(b1)) == @as(u16, @bitCast(b2))) {
             repr.bits = b2;
diff --git a/src/Sema.zig b/src/Sema.zig
index fc8bfe47deee..88a5c60e49d0 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -2999,9 +2999,15 @@ fn zirStructDecl(
             .captures = captures,
         } },
     };
-    const wip_ty = switch (try ip.getStructType(gpa, pt.tid, struct_init, false)) {
+    const wip_ty = gop: while (true) switch (try ip.getStructType(gpa, pt.tid, struct_init, false)) {
         .existing => |ty| {
-            zcu.awaitNamespaceTypeFinished(ty);
+            switch (zcu.awaitNamespaceTypeFinished(ty)) {
+                // The wip owner cancelled; `ty`'s map slot is now `.removed`.
+                // Re-run getStructType so we either dedup to a newer entry or
+                // claim a fresh wip ourselves.
+                .cancelled => continue :gop,
+                .finished => {},
+            }
             const new_ty = try pt.ensureTypeUpToDate(ty);
 
             // Make sure we update the namespace if the declaration is re-analyzed, to pick
@@ -3012,7 +3018,7 @@ fn zirStructDecl(
             try sema.addTypeReferenceEntry(src, new_ty);
             return Air.internedToRef(new_ty);
         },
-        .wip => |wip| wip,
+        .wip => |wip| break :gop wip,
     };
     var published = false;
     errdefer if (!published) wip_ty.cancel(ip, pt.tid);
@@ -3244,9 +3250,12 @@ fn zirEnumDecl(
             .captures = captures,
         } },
     };
-    const wip_ty = switch (try ip.getEnumType(gpa, pt.tid, enum_init, false)) {
+    const wip_ty = gop: while (true) switch (try ip.getEnumType(gpa, pt.tid, enum_init, false)) {
         .existing => |ty| {
-            zcu.awaitNamespaceTypeFinished(ty);
+            switch (zcu.awaitNamespaceTypeFinished(ty)) {
+                .cancelled => continue :gop,
+                .finished => {},
+            }
             const new_ty = try pt.ensureTypeUpToDate(ty);
 
             // Make sure we update the namespace if the declaration is re-analyzed, to pick
@@ -3266,7 +3275,7 @@ fn zirEnumDecl(
 
             return Air.internedToRef(new_ty);
         },
-        .wip => |wip| wip,
+        .wip => |wip| break :gop wip,
     };
 
     // Once this is `true`, we will not delete the decl or type even upon failure, since we
@@ -3410,9 +3419,12 @@ fn zirUnionDecl(
             .captures = captures,
         } },
     };
-    const wip_ty = switch (try ip.getUnionType(gpa, pt.tid, union_init, false)) {
+    const wip_ty = gop: while (true) switch (try ip.getUnionType(gpa, pt.tid, union_init, false)) {
         .existing => |ty| {
-            zcu.awaitNamespaceTypeFinished(ty);
+            switch (zcu.awaitNamespaceTypeFinished(ty)) {
+                .cancelled => continue :gop,
+                .finished => {},
+            }
             const new_ty = try pt.ensureTypeUpToDate(ty);
 
             // Make sure we update the namespace if the declaration is re-analyzed, to pick
@@ -3423,7 +3435,7 @@ fn zirUnionDecl(
             try sema.addTypeReferenceEntry(src, new_ty);
             return Air.internedToRef(new_ty);
         },
-        .wip => |wip| wip,
+        .wip => |wip| break :gop wip,
     };
     var published = false;
     errdefer if (!published) wip_ty.cancel(ip, pt.tid);
@@ -3510,9 +3522,12 @@ fn zirOpaqueDecl(
             .captures = captures,
         } },
     };
-    const wip_ty = switch (try ip.getOpaqueType(gpa, pt.tid, opaque_init)) {
+    const wip_ty = gop: while (true) switch (try ip.getOpaqueType(gpa, pt.tid, opaque_init)) {
         .existing => |ty| {
-            zcu.awaitNamespaceTypeFinished(ty);
+            switch (zcu.awaitNamespaceTypeFinished(ty)) {
+                .cancelled => continue :gop,
+                .finished => {},
+            }
             // Make sure we update the namespace if the declaration is re-analyzed, to pick
             // up on e.g. changed comptime decls.
             try pt.ensureNamespaceUpToDate(Type.fromInterned(ty).getNamespaceIndex(zcu));
@@ -3521,7 +3536,7 @@ fn zirOpaqueDecl(
             try sema.addTypeReferenceEntry(src, ty);
             return Air.internedToRef(ty);
         },
-        .wip => |wip| wip,
+        .wip => |wip| break :gop wip,
     };
     var published = false;
     errdefer if (!published) wip_ty.cancel(ip, pt.tid);
@@ -19845,7 +19860,7 @@ fn structInitAnon(
         break :hash hasher.final();
     };
     const tracked_inst = try block.trackZir(inst);
-    const struct_ty = switch (try ip.getStructType(gpa, pt.tid, .{
+    const struct_ty = gop: while (true) switch (try ip.getStructType(gpa, pt.tid, .{
         .layout = .auto,
         .fields_len = extra_data.fields_len,
         .known_non_opv = false,
@@ -19859,7 +19874,7 @@ fn structInitAnon(
             .type_hash = type_hash,
         } },
     }, false)) {
-        .wip => |wip| ty: {
+        .wip => |wip| {
             errdefer wip.cancel(ip, pt.tid);
             const type_name = try sema.createTypeName(block, .anon, "struct", inst, wip.index);
             wip.setName(ip, type_name.name, type_name.nav);
@@ -19891,14 +19906,17 @@ fn structInitAnon(
                 try zcu.comp.queueJob(.{ .link_type = wip.index });
             }
             if (zcu.comp.debugIncremental()) try zcu.incremental_debug_state.newType(zcu, wip.index);
-            break :ty wip.index;
+            break :gop wip.index;
+        },
+        // Under parallel Sema, `.existing` may dedup to a type whose `.wip` owner
+        // has not yet run `setFieldTypesAll`/`finish`; spin so `aggregateValue`'s
+        // canonicalization sees populated field_types. If the owner cancelled,
+        // retry getStructType — the tombstoned slot is skipped.
+        .existing => |ty| switch (zcu.awaitNamespaceTypeFinished(ty)) {
+            .cancelled => continue :gop,
+            .finished => break :gop ty,
         },
-        .existing => |ty| ty,
     };
-    // Under parallel Sema, `.existing` may dedup to a type whose `.wip` owner
-    // has not yet run `setFieldTypesAll`/`finish`; spin so `aggregateValue`'s
-    // canonicalization sees populated field_types.
-    zcu.awaitNamespaceTypeFinished(struct_ty);
     try sema.declareDependency(.{ .interned = struct_ty });
     try sema.addTypeReferenceEntry(src, struct_ty);
 
@@ -20886,16 +20904,20 @@ fn zirReify(
                 return sema.fail(block, src, "reified opaque must have no decls", .{});
             }
 
-            const wip_ty = switch (try ip.getOpaqueType(gpa, pt.tid, .{
+            const wip_ty = gop: while (true) switch (try ip.getOpaqueType(gpa, pt.tid, .{
                 .key = .{ .reified = .{
-                    .zir_index = try block.trackZir(inst),
+                    .zir_index = tracked_inst,
                 } },
             })) {
                 .existing => |ty| {
+                    switch (zcu.awaitNamespaceTypeFinished(ty)) {
+                        .cancelled => continue :gop,
+                        .finished => {},
+                    }
                     try sema.addTypeReferenceEntry(src, ty);
                     return Air.internedToRef(ty);
                 },
-                .wip => |wip| wip,
+                .wip => |wip| break :gop wip,
             };
             var published = false;
             errdefer if (!published) wip_ty.cancel(ip, pt.tid);
@@ -21094,18 +21116,23 @@ fn reifyEnum(
     }
 
     const tracked_inst = try block.trackZir(inst);
+    const type_hash = hasher.final();
 
-    const wip_ty = switch (try ip.getEnumType(gpa, pt.tid, .{
+    const wip_ty = gop: while (true) switch (try ip.getEnumType(gpa, pt.tid, .{
         .has_values = true,
         .tag_mode = if (is_exhaustive) .explicit else .nonexhaustive,
         .fields_len = fields_len,
         .key = .{ .reified = .{
             .zir_index = tracked_inst,
-            .type_hash = hasher.final(),
+            .type_hash = type_hash,
         } },
     }, false)) {
-        .wip => |wip| wip,
+        .wip => |wip| break :gop wip,
         .existing => |ty| {
+            switch (zcu.awaitNamespaceTypeFinished(ty)) {
+                .cancelled => continue :gop,
+                .finished => {},
+            }
             try sema.declareDependency(.{ .interned = ty });
             try sema.addTypeReferenceEntry(src, ty);
             return Air.internedToRef(ty);
@@ -21243,8 +21270,9 @@ fn reifyUnion(
     }
 
     const tracked_inst = try block.trackZir(inst);
+    const type_hash = hasher.final();
 
-    const wip_ty = switch (try ip.getUnionType(gpa, pt.tid, .{
+    const wip_ty = gop: while (true) switch (try ip.getUnionType(gpa, pt.tid, .{
         .flags = .{
             .layout = layout,
             .status = .none,
@@ -21268,11 +21296,15 @@ fn reifyUnion(
         .field_aligns = &.{}, // set later
         .key = .{ .reified = .{
             .zir_index = tracked_inst,
-            .type_hash = hasher.final(),
+            .type_hash = type_hash,
         } },
     }, false)) {
-        .wip => |wip| wip,
+        .wip => |wip| break :gop wip,
         .existing => |ty| {
+            switch (zcu.awaitNamespaceTypeFinished(ty)) {
+                .cancelled => continue :gop,
+                .finished => {},
+            }
             try sema.declareDependency(.{ .interned = ty });
             try sema.addTypeReferenceEntry(src, ty);
             return Air.internedToRef(ty);
@@ -21608,8 +21640,9 @@ fn reifyStruct(
     }
 
     const tracked_inst = try block.trackZir(inst);
+    const type_hash = hasher.final();
 
-    const wip_ty = switch (try ip.getStructType(gpa, pt.tid, .{
+    const wip_ty = gop: while (true) switch (try ip.getStructType(gpa, pt.tid, .{
         .layout = layout,
         .fields_len = fields_len,
         .known_non_opv = false,
@@ -21620,11 +21653,15 @@ fn reifyStruct(
         .inits_resolved = true,
         .key = .{ .reified = .{
             .zir_index = tracked_inst,
-            .type_hash = hasher.final(),
+            .type_hash = type_hash,
         } },
     }, false)) {
-        .wip => |wip| wip,
+        .wip => |wip| break :gop wip,
         .existing => |ty| {
+            switch (zcu.awaitNamespaceTypeFinished(ty)) {
+                .cancelled => continue :gop,
+                .finished => {},
+            }
             try sema.declareDependency(.{ .interned = ty });
             try sema.addTypeReferenceEntry(src, ty);
             return Air.internedToRef(ty);
@@ -29185,9 +29222,12 @@ fn coerceExtra(
         try in_memory_result.report(sema, inst_src, msg);
 
         // Add notes about function return type
-        if (opts.is_ret and
-            !zcu.test_functions.contains(zcu.funcInfo(sema.func_index).owner_nav))
-        {
+        const is_test_fn = if (opts.is_ret) blk: {
+            zcu.test_functions_mutex.lock();
+            defer zcu.test_functions_mutex.unlock();
+            break :blk zcu.test_functions.contains(zcu.funcInfo(sema.func_index).owner_nav);
+        } else false;
+        if (opts.is_ret and !is_test_fn) {
             const ret_ty_src: LazySrcLoc = .{
                 .base_node_inst = ip.getNav(zcu.funcInfo(sema.func_index).owner_nav).srcInst(ip),
                 .offset = .{ .node_offset_fn_type_ret_ty = .zero },
@@ -35592,13 +35632,6 @@ fn structFields(
 
     struct_type.setFieldTypesAlignsAll(ip, tmp_types, if (any_aligned) tmp_aligns else null);
 
-    // Re-store the last field type with release so the unlocked
-    // `haveFieldTypes` fast-path acquire sees all preceding name/type
-    // slot writes from this loop.
-    if (struct_type.field_types.len > 0) {
-        const types = struct_type.field_types.get(ip);
-        @atomicStore(InternPool.Index, &types[types.len - 1], types[types.len - 1], .release);
-    }
     struct_type.clearFieldTypesWip(ip);
     if (!any_inits) struct_type.setHaveFieldInits(ip);
 
diff --git a/src/Type.zig b/src/Type.zig
index 05386f95f885..1659199d849f 100644
--- a/src/Type.zig
+++ b/src/Type.zig
@@ -3008,7 +3008,10 @@ pub fn getNamespaceIndex(ty: Type, zcu: *Zcu) InternPool.NamespaceIndex {
 /// Returns null if the type has no namespace.
 pub fn getNamespace(ty: Type, zcu: *Zcu) InternPool.OptionalNamespaceIndex {
     const ip = &zcu.intern_pool;
-    zcu.awaitNamespaceTypeFinished(ty.toIntern());
+    // Callers reach here only with indices that have already passed the
+    // `.existing` retry loop in Sema (or are owned by this thread via
+    // `tls_wip_types`), so `.cancelled` is not expected.
+    _ = zcu.awaitNamespaceTypeFinished(ty.toIntern());
     return switch (ip.indexToKey(ty.toIntern())) {
         .opaque_type => ip.loadOpaqueType(ty.toIntern()).namespace.toOptional(),
         .struct_type => ip.loadStructType(ty.toIntern()).namespace.toOptional(),
@@ -3142,7 +3145,7 @@ pub fn enumFieldIndex(ty: Type, field_name: InternPool.NullTerminatedString, zcu
     const ip = &zcu.intern_pool;
     // The `.existing` dedup may return an enum whose `WipEnumType` owner is
     // still populating names; spin until prepare() so the lookup sees them.
-    Zcu.awaitNamespaceTypeFinishedConst(zcu, ty.toIntern());
+    _ = Zcu.awaitNamespaceTypeFinishedConst(zcu, ty.toIntern());
     const enum_type = ip.loadEnumType(ty.toIntern());
     return enum_type.nameIndex(ip, field_name);
 }
@@ -3152,7 +3155,7 @@ pub fn enumFieldIndex(ty: Type, field_name: InternPool.NullTerminatedString, zcu
 /// declaration order, or `null` if `enum_tag` does not match any field.
 pub fn enumTagFieldIndex(ty: Type, enum_tag: Value, zcu: *const Zcu) ?u32 {
     const ip = &zcu.intern_pool;
-    Zcu.awaitNamespaceTypeFinishedConst(zcu, ty.toIntern());
+    _ = Zcu.awaitNamespaceTypeFinishedConst(zcu, ty.toIntern());
     const enum_type = ip.loadEnumType(ty.toIntern());
     const int_tag = switch (ip.indexToKey(enum_tag.toIntern())) {
         .int => enum_tag.toIntern(),
@@ -3884,7 +3887,7 @@ fn resolveStructInner(
     const zcu = pt.zcu;
     const gpa = zcu.gpa;
 
-    zcu.awaitNamespaceTypeFinished(ty.toIntern());
+    _ = zcu.awaitNamespaceTypeFinished(ty.toIntern());
 
     const ip = &zcu.intern_pool;
     const struct_obj = zcu.typeToStruct(ty).?;
@@ -3979,7 +3982,7 @@ fn resolveUnionInner(
     const zcu = pt.zcu;
     const gpa = zcu.gpa;
 
-    zcu.awaitNamespaceTypeFinished(ty.toIntern());
+    _ = zcu.awaitNamespaceTypeFinished(ty.toIntern());
 
     const ip = &zcu.intern_pool;
     const union_obj = zcu.typeToUnion(ty).?;
diff --git a/src/Zcu.zig b/src/Zcu.zig
index c000e5000df6..be7bedc06138 100644
--- a/src/Zcu.zig
+++ b/src/Zcu.zig
@@ -3678,13 +3678,13 @@ pub fn wipTypeExit(zcu: *Zcu, ty: InternPool.Index) void {
     _ = tls_wip_types.swapRemove(ty);
 }
 
-pub fn awaitNamespaceTypeFinished(zcu: *Zcu, ty: InternPool.Index) void {
-    awaitNamespaceTypeFinishedConst(zcu, ty);
+pub fn awaitNamespaceTypeFinished(zcu: *Zcu, ty: InternPool.Index) InternPool.NamespaceTypeAwaitResult {
+    return awaitNamespaceTypeFinishedConst(zcu, ty);
 }
-pub fn awaitNamespaceTypeFinishedConst(zcu: *const Zcu, ty: InternPool.Index) void {
-    if (!zcu.parallel_sema) return;
-    if (tls_wip_types.contains(ty)) return;
-    zcu.intern_pool.awaitNamespaceTypeFinished(ty);
+pub fn awaitNamespaceTypeFinishedConst(zcu: *const Zcu, ty: InternPool.Index) InternPool.NamespaceTypeAwaitResult {
+    if (!zcu.parallel_sema) return .finished;
+    if (tls_wip_types.contains(ty)) return .finished;
+    return zcu.intern_pool.awaitNamespaceTypeFinished(ty);
 }
 
 /// Try to claim `unit` for analysis on behalf of `tid`. Returns:
@@ -3742,7 +3742,6 @@ pub fn releaseClaim(zcu: *Zcu, unit: AnalUnit) void {
     zcu.sema_claim_cond.broadcast();
 }
 
-
 /// Under parallel Sema, `analysis_in_progress` is per-OS-thread (lock-free).
 threadlocal var tls_aip: std.AutoArrayHashMapUnmanaged(AnalUnit, void) = .empty;
 /// Set by `ensureNavResolved` when a dependency loop is detected under
@@ -3854,18 +3853,22 @@ pub fn deleteUnitReferences(zcu: *Zcu, anal_unit: AnalUnit) void {
                 // The same inline frame could be used multiple times by one unit. We need to
                 // detect this case to avoid adding it to `free_inline_reference_frames` more
                 // than once. We do that by setting `parent` to itself as a marker.
-                if (inline_frame.ptr(zcu).parent == inline_frame.toOptional()) break;
-                {
-                    zcu.inline_ref_mutex.lock();
-                    defer zcu.inline_ref_mutex.unlock();
-                    zcu.free_inline_reference_frames.append(gpa, inline_frame) catch {
-                        // This space will be reused eventually, so we need not propagate this error.
-                        // Just leak it for now, and let GC reclaim it later on.
-                        break :unit_refs;
-                    };
-                }
-                opt_inline_frame = inline_frame.ptr(zcu).parent;
-                inline_frame.ptr(zcu).parent = inline_frame.toOptional(); // signal to code above
+                // All accesses through `inline_frame.ptr(zcu)` must hold `inline_ref_mutex`
+                // because a concurrent `addInlineReferenceFrame` may realloc the backing
+                // array, and once we append to the free list the slot may be popped and
+                // overwritten.
+                zcu.inline_ref_mutex.lock();
+                defer zcu.inline_ref_mutex.unlock();
+                const frame_ptr = inline_frame.ptr(zcu);
+                if (frame_ptr.parent == inline_frame.toOptional()) break;
+                const parent = frame_ptr.parent;
+                frame_ptr.parent = inline_frame.toOptional(); // signal to code above
+                zcu.free_inline_reference_frames.append(gpa, inline_frame) catch {
+                    // This space will be reused eventually, so we need not propagate this error.
+                    // Just leak it for now, and let GC reclaim it later on.
+                    break :unit_refs;
+                };
+                opt_inline_frame = parent;
             }
         }
     }
diff --git a/src/Zcu/PerThread.zig b/src/Zcu/PerThread.zig
index 00bb6b89f7dc..5c1ad65091fd 100644
--- a/src/Zcu/PerThread.zig
+++ b/src/Zcu/PerThread.zig
@@ -644,14 +644,24 @@ pub fn ensureMemoizedStateUpToDate(pt: Zcu.PerThread, stage: InternPool.Memoized
         if (@atomicLoad(InternPool.Index, zcu.builtin_decl_values.getPtrConst(to_check), .acquire) != .none) return;
     }
 
-    switch (try zcu.claimOrWait(unit)) {
-        .claimed => {},
+    claim: while (true) switch (try zcu.claimOrWait(unit)) {
+        .claimed => break :claim,
         .recursed => return error.AnalysisFail,
         .done => {
             if (zcu.anyAnalysisFailed(unit)) return error.AnalysisFail;
-            return;
+            // The previous holder may have released its claim via a retry-abort
+            // (yield-and-requeue) without actually populating the stage. Re-check
+            // the sentinel decl (same acquire load as the fast path); re-claim if unset.
+            const to_check: Zcu.BuiltinDecl = switch (stage) {
+                .main => .@"Type.Declaration",
+                .panic => .@"panic.noreturnReturned",
+                .va_list => .VaList,
+                .assembly => .@"assembly.Clobbers",
+            };
+            if (@atomicLoad(InternPool.Index, zcu.builtin_decl_values.getPtrConst(to_check), .acquire) != .none) return;
+            continue :claim;
         },
-    }
+    };
     defer zcu.releaseClaim(unit);
 
     const need_sema_lock = !zcu.parallel_sema or zcu.comp.incremental;
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
index f886f7b5e8ac..1668bf67270f 100644
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@@ -580,9 +580,9 @@ pub const PartitionSet = struct {
             var shard_opts = options;
             shard_opts.bin_path = list[i];
             shard_opts.bin_path_list = null;
-            shard_opts.asm_path = null;
             if (i != 0) {
                 shard_opts.time_report = null;
+                shard_opts.asm_path = null;
                 shard_opts.pre_ir_path = null;
                 shard_opts.pre_bc_path = null;
                 shard_opts.post_ir_path = null;
@@ -725,31 +725,31 @@ pub const Object = struct {
                     break :blk try zcu.main_mod.root.toAbsolute(comp.dirs, arena);
                 };
 
-            const debug_file = try builder.debugFile(
-                try builder.metadataString(comp.root_name),
-                try builder.metadataString(compile_unit_dir),
-            );
+                const debug_file = try builder.debugFile(
+                    try builder.metadataString(comp.root_name),
+                    try builder.metadataString(compile_unit_dir),
+                );
 
-            const debug_enums_fwd_ref = try builder.debugForwardReference();
-            const debug_globals_fwd_ref = try builder.debugForwardReference();
-
-            const debug_compile_unit = try builder.debugCompileUnit(
-                debug_file,
-                // Don't use the version string here; LLVM misparses it when it
-                // includes the git revision.
-                try builder.metadataStringFmt("zig {d}.{d}.{d}", .{
-                    build_options.semver.major,
-                    build_options.semver.minor,
-                    build_options.semver.patch,
-                }),
-                debug_enums_fwd_ref,
-                debug_globals_fwd_ref,
-                .{ .optimized = comp.root_mod.optimize_mode != .Debug },
-            );
+                const debug_enums_fwd_ref = try builder.debugForwardReference();
+                const debug_globals_fwd_ref = try builder.debugForwardReference();
+
+                const debug_compile_unit = try builder.debugCompileUnit(
+                    debug_file,
+                    // Don't use the version string here; LLVM misparses it when it
+                    // includes the git revision.
+                    try builder.metadataStringFmt("zig {d}.{d}.{d}", .{
+                        build_options.semver.major,
+                        build_options.semver.minor,
+                        build_options.semver.patch,
+                    }),
+                    debug_enums_fwd_ref,
+                    debug_globals_fwd_ref,
+                    .{ .optimized = comp.root_mod.optimize_mode != .Debug },
+                );
 
-            try builder.metadataNamed(try builder.metadataString("llvm.dbg.cu"), &.{debug_compile_unit});
-            break :debug_info .{ debug_compile_unit, debug_enums_fwd_ref, debug_globals_fwd_ref };
-        } else .{.none} ** 3;
+                try builder.metadataNamed(try builder.metadataString("llvm.dbg.cu"), &.{debug_compile_unit});
+                break :debug_info .{ debug_compile_unit, debug_enums_fwd_ref, debug_globals_fwd_ref };
+            } else .{.none} ** 3;
 
         const obj = try arena.create(Object);
         obj.* = .{
@@ -2890,9 +2890,9 @@ pub const Object = struct {
 
                 const full_fields: [2]Builder.Metadata =
                     if (layout.tag_align.compare(.gte, layout.payload_align))
-                    .{ debug_tag_type, debug_payload_type }
-                else
-                    .{ debug_payload_type, debug_tag_type };
+                        .{ debug_tag_type, debug_payload_type }
+                    else
+                        .{ debug_payload_type, debug_tag_type };
 
                 const debug_tagged_union_type = try o.builder.debugStructType(
                     try o.builder.metadataString(name),
@@ -9921,7 +9921,7 @@ pub const FuncGen = struct {
 
         if (llvm_dest_ty.isStruct(&o.builder) or
             ((operand_ty.zigTypeTag(zcu) == .vector or inst_ty.zigTypeTag(zcu) == .vector) and
-            operand_ty.bitSize(zcu) != inst_ty.bitSize(zcu)))
+                operand_ty.bitSize(zcu) != inst_ty.bitSize(zcu)))
         {
             // Both our operand and our result are values, not pointers,
             // but LLVM won't let us bitcast struct values or vectors with padding bits.
diff --git a/src/link/Lld.zig b/src/link/Lld.zig
index 528d297c091a..62cce7f237fa 100644
--- a/src/link/Lld.zig
+++ b/src/link/Lld.zig
@@ -396,9 +396,15 @@ fn coffLink(lld: *Lld, arena: Allocator) !void {
             if (comp.c_object_table.count() != 0)
                 break :blk comp.c_object_table.keys()[0].status.success.object_path;
 
-            if (zcu_obj_paths.len > 0)
+            if (zcu_obj_paths.len == 1)
                 break :blk zcu_obj_paths[0];
 
+            if (zcu_obj_paths.len > 1)
+                return comp.link_diags.fail(
+                    "sharded codegen with build-obj is not supported for COFF; pass --llvm-no-merge-shards or --llvm-codegen-threads=1",
+                    .{},
+                );
+
             // TODO I think this is unreachable. Audit this situation when solving the above TODO
             // regarding eliding redundant object -> object transformations.
             return error.NoObjectsToLink;
@@ -1401,9 +1407,15 @@ fn wasmLink(lld: *Lld, arena: Allocator) !void {
             if (comp.c_object_table.count() != 0)
                 break :blk comp.c_object_table.keys()[0].status.success.object_path;
 
-            if (zcu_obj_paths.len > 0)
+            if (zcu_obj_paths.len == 1)
                 break :blk zcu_obj_paths[0];
 
+            if (zcu_obj_paths.len > 1)
+                return comp.link_diags.fail(
+                    "sharded codegen with build-obj is not supported for WebAssembly; pass --llvm-no-merge-shards or --llvm-codegen-threads=1",
+                    .{},
+                );
+
             // TODO I think this is unreachable. Audit this situation when solving the above TODO
             // regarding eliding redundant object -> object transformations.
             return error.NoObjectsToLink;

From a293e78d705724ef77a8605c1b5a44ad55b70af1 Mon Sep 17 00:00:00 2001
From: Alistair Smith <hi@alistair.sh>
Date: Thu, 9 Apr 2026 16:49:17 -0700
Subject: [PATCH 03/15] test: thread -Dllvm-codegen-threads through behavior
 matrix; std.mem.len comptime guard

- build.zig + test/tests.zig: add llvm_codegen_threads option, set on all
  addModuleTests targets when LLVM backend is used.
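- lib/std/mem.zig: gate strlen/wcslen extern fast-path on !@inComptime()
  (extern call at comptime is invalid; pre-existing fork bug).
  NOTE(review): this change is not in the diff below; the diffstat lists
  only build.zig and test/tests.zig. Confirm which patch carries it.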
---
 build.zig      | 4 ++++
 test/tests.zig | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/build.zig b/build.zig
index ea743617405f..5054244a2559 100644
--- a/build.zig
+++ b/build.zig
@@ -98,6 +98,7 @@ pub fn build(b: *std.Build) !void {
     const skip_macos = b.option(bool, "skip-macos", "Main test suite skips targets with macos OS") orelse false;
     const skip_linux = b.option(bool, "skip-linux", "Main test suite skips targets with linux OS") orelse false;
     const skip_llvm = b.option(bool, "skip-llvm", "Main test suite skips targets that use LLVM backend") orelse false;
+    const llvm_codegen_threads = b.option(u32, "llvm-codegen-threads", "Number of LLVM codegen threads to use for module tests") orelse 0;
 
     const only_install_lib_files = b.option(bool, "lib-files-only", "Only install library files") orelse false;
 
@@ -467,6 +468,7 @@ pub fn build(b: *std.Build) !void {
         .skip_linux = skip_linux,
         .skip_llvm = skip_llvm,
         .skip_libc = skip_libc,
+        .llvm_codegen_threads = llvm_codegen_threads,
         // 3888779264 was observed on an x86_64-linux-gnu host.
         .max_rss = 4000000000,
     }));
@@ -489,6 +491,7 @@ pub fn build(b: *std.Build) !void {
         .skip_linux = skip_linux,
         .skip_llvm = skip_llvm,
         .skip_libc = skip_libc,
+        .llvm_codegen_threads = llvm_codegen_threads,
     }));
 
     test_modules_step.dependOn(tests.addModuleTests(b, .{
@@ -551,6 +554,7 @@ pub fn build(b: *std.Build) !void {
         .skip_linux = skip_linux,
         .skip_llvm = skip_llvm,
         .skip_libc = skip_libc,
+        .llvm_codegen_threads = llvm_codegen_threads,
         // I observed a value of 5605064704 on the M2 CI.
         .max_rss = 6165571174,
     }));
diff --git a/test/tests.zig b/test/tests.zig
index 6b14685a6f84..3f4925026969 100644
--- a/test/tests.zig
+++ b/test/tests.zig
@@ -2242,6 +2242,7 @@ const ModuleTestOptions = struct {
     skip_non_native: bool,
     skip_freebsd: bool,
     skip_netbsd: bool,
+    llvm_codegen_threads: u32 = 0,
     skip_windows: bool,
     skip_macos: bool,
     skip_linux: bool,
@@ -2351,6 +2352,7 @@ pub fn addModuleTests(b: *std.Build, options: ModuleTestOptions) *Step {
             .zig_lib_dir = b.path("lib"),
         });
         these_tests.linkage = test_target.linkage;
+        if (would_use_llvm) these_tests.llvm_codegen_threads = options.llvm_codegen_threads;
         if (options.no_builtin) these_tests.root_module.no_builtin = false;
         if (options.build_options) |build_options| {
             these_tests.root_module.addOptions("build_options", build_options);

From 5ba55ca3fb82ef0aee788c1d72a683439d4aaaed Mon Sep 17 00:00:00 2001
From: Alistair Smith <hi@alistair.sh>
Date: Thu, 9 Apr 2026 16:49:18 -0700
Subject: [PATCH 04/15] shard: fix COFF anon dups, module-asm routing,
 cross-shard @export aliases; IES yield
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- llvm/Builder.zig + ir.zig: add COMDAT support (MODULE_CODE_COMDAT
  records, Variable.comdat field, addComdat). Required for COFF — without
  comdat any, linkonce_odr emits as a strong def per shard and lld-link
  rejects ~350 duplicate __anon_* symbols.
- codegen/llvm.zig resolveGlobalUav/updateExportedValue: setComdat(.any)
  on COFF for sharded linkonce_odr uavs.
- Zcu.navShard: switch from fqn-hash to file-hash via File.computeShard;
  add analUnitShard mapping comptime/nav/func units to their file's shard.
- codegen/llvm.zig genModuleLevelAssembly: route each global asm block to
  its source file's shard so .set aliases resolve against same-module defs.
- codegen/llvm.zig PartitionSet.updateExports: broadcast to all shards;
  Object.updateExports collapses non-owner extern globals onto one
  canonical decl so InstCombine cannot fold &a==&b to false pre-link.
- Zcu.isClaimedByOther + Sema.resolveInferredErrorSet: when the IES func
  is claimed by another thread, set tls_retry_loop and yield (cap 8)
  instead of parking in claimOrWait. Reuses existing requeue path.
---
 lib/std/zig/llvm/Builder.zig |  66 ++++++++++++++++++++-
 lib/std/zig/llvm/ir.zig      |  16 ++++-
 src/Sema.zig                 |  25 +++++++-
 src/Zcu.zig                  |  34 ++++++++++-
 src/codegen/llvm.zig         | 111 ++++++++++++++++++++++++++++++-----
 5 files changed, 232 insertions(+), 20 deletions(-)

diff --git a/lib/std/zig/llvm/Builder.zig b/lib/std/zig/llvm/Builder.zig
index 61c4d1b44e3a..dd35ab617b12 100644
--- a/lib/std/zig/llvm/Builder.zig
+++ b/lib/std/zig/llvm/Builder.zig
@@ -42,6 +42,7 @@ next_unique_global_id: std.AutoHashMapUnmanaged(StrtabString, u32),
 aliases: std.ArrayListUnmanaged(Alias),
 variables: std.ArrayListUnmanaged(Variable),
 functions: std.ArrayListUnmanaged(Function),
+comdats: std.ArrayListUnmanaged(Comdat),
 
 strtab_string_map: std.AutoArrayHashMapUnmanaged(void, void),
 strtab_string_indices: std.ArrayListUnmanaged(u32),
@@ -2513,12 +2514,37 @@ pub const Alias = struct {
     };
 };
 
+pub const Comdat = struct {
+    name: StrtabString,
+    kind: SelectionKind,
+
+    /// Matches LLVM's bitc::ComdatSelectionKindCodes.
+    pub const SelectionKind = enum(u3) {
+        any = 1,
+        exactmatch = 2,
+        largest = 3,
+        nodeduplicate = 4,
+        samesize = 5,
+    };
+
+    pub const Index = enum(u32) {
+        /// Stored 1-based to match the bitcode encoding (0 = no comdat).
+        none = 0,
+        _,
+
+        pub fn ptrConst(self: Index, builder: *const Builder) *const Comdat {
+            return &builder.comdats.items[@intFromEnum(self) - 1];
+        }
+    };
+};
+
 pub const Variable = struct {
     global: Global.Index,
     thread_local: ThreadLocal = .default,
     mutability: Mutability = .global,
     init: Constant = .no_init,
     section: String = .none,
+    comdat: Comdat.Index = .none,
     alignment: Alignment = .default,
 
     pub const Index = enum(u32) {
@@ -2595,6 +2621,10 @@ pub const Variable = struct {
             self.ptr(builder).section = section;
         }
 
+        pub fn setComdat(self: Index, comdat: Comdat.Index, builder: *Builder) void {
+            self.ptr(builder).comdat = comdat;
+        }
+
         pub fn setAlignment(self: Index, alignment: Alignment, builder: *Builder) void {
             self.ptr(builder).alignment = alignment;
         }
@@ -8603,6 +8633,7 @@ pub fn init(options: Options) Allocator.Error!Builder {
         .aliases = .{},
         .variables = .{},
         .functions = .{},
+        .comdats = .{},
 
         .strtab_string_map = .{},
         .strtab_string_indices = .{},
@@ -8752,6 +8783,7 @@ pub fn deinit(self: *Builder) void {
     self.variables.deinit(self.gpa);
     for (self.functions.items) |*function| function.deinit(self.gpa);
     self.functions.deinit(self.gpa);
+    self.comdats.deinit(self.gpa);
 
     self.strtab_string_map.deinit(self.gpa);
     self.strtab_string_indices.deinit(self.gpa);
@@ -9007,6 +9039,16 @@ pub fn addAliasAssumeCapacity(
     return alias_index;
 }
 
+pub fn addComdat(
+    self: *Builder,
+    name: StrtabString,
+    kind: Comdat.SelectionKind,
+) Allocator.Error!Comdat.Index {
+    assert(!name.isAnon());
+    try self.comdats.append(self.gpa, .{ .name = name, .kind = kind });
+    return @enumFromInt(self.comdats.items.len);
+}
+
 pub fn addVariable(
     self: *Builder,
     name: StrtabString,
@@ -9564,6 +9606,14 @@ pub fn print(self: *Builder, w: *Writer) (Writer.Error || Allocator.Error)!void
         , .{ id.fmt(self), ty.fmt(self, .default) });
     }
 
+    if (self.comdats.items.len > 0) {
+        if (need_newline) try w.writeByte('\n') else need_newline = true;
+        for (self.comdats.items) |comdat| try w.print(
+            \\${f} = comdat {s}
+            \\
+        , .{ comdat.name.fmt(self, .quote_unless_valid_identifier), @tagName(comdat.kind) });
+    }
+
     if (self.variables.items.len > 0) {
         if (need_newline) try w.writeByte('\n') else need_newline = true;
         for (self.variables.items) |variable| {
@@ -9572,7 +9622,7 @@ pub fn print(self: *Builder, w: *Writer) (Writer.Error || Allocator.Error)!void
             metadata_formatter.need_comma = true;
             defer metadata_formatter.need_comma = undefined;
             try w.print(
-                \\{f} ={f}{f}{f}{f}{f}{f}{f}{f} {s} {f}{f}{f}{f}
+                \\{f} ={f}{f}{f}{f}{f}{f}{f}{f} {s} {f}{f}{s}{f}{f}
                 \\
             , .{
                 variable.global.fmt(self),
@@ -9589,6 +9639,7 @@ pub fn print(self: *Builder, w: *Writer) (Writer.Error || Allocator.Error)!void
                 @tagName(variable.mutability),
                 global.type.fmt(self, .percent),
                 variable.init.fmt(self, .{ .space = true }),
+                if (variable.comdat != .none) ", comdat" else "",
                 variable.alignment.fmt(", "),
                 try metadata_formatter.fmt("!dbg ", global.dbg, null),
             });
@@ -13663,6 +13714,18 @@ pub fn toBitcode(self: *Builder, allocator: Allocator, producer: Producer) bitco
             defer section_map.deinit(self.gpa);
             try section_map.ensureUnusedCapacity(self.gpa, globals.count());
 
+            // COMDAT records must precede any global that references them by index.
+            for (self.comdats.items) |comdat| {
+                const name_index = comdat.name.toIndex().?;
+                const offset = self.strtab_string_indices.items[name_index];
+                const size = self.strtab_string_indices.items[name_index + 1] - offset;
+                try module_block.writeAbbrev(Module.Comdat{
+                    .strtab_offset = offset,
+                    .strtab_size = size,
+                    .selection_kind = comdat.kind,
+                });
+            }
+
             for (self.variables.items) |variable| {
                 if (variable.global.getReplacement(self) != .none) continue;
 
@@ -13706,6 +13769,7 @@ pub fn toBitcode(self: *Builder, allocator: Allocator, producer: Producer) bitco
                     .unnamed_addr = global.unnamed_addr,
                     .externally_initialized = global.externally_initialized,
                     .dllstorageclass = global.dll_storage_class,
+                    .comdat = @intFromEnum(variable.comdat),
                     .preemption = global.preemption,
                 });
             }
diff --git a/lib/std/zig/llvm/ir.zig b/lib/std/zig/llvm/ir.zig
index 824186efb876..d2c769c20cdf 100644
--- a/lib/std/zig/llvm/ir.zig
+++ b/lib/std/zig/llvm/ir.zig
@@ -193,6 +193,7 @@ pub const Module = struct {
         Variable,
         Function,
         Alias,
+        Comdat,
     };
 
     pub const Version = struct {
@@ -211,6 +212,18 @@ pub const Module = struct {
         string: []const u8,
     };
 
+    pub const Comdat = struct {
+        pub const ops = [_]AbbrevOp{
+            .{ .literal = 12 }, // MODULE_CODE_COMDAT
+            .{ .vbr = 16 }, // strtab_offset
+            .{ .vbr = 16 }, // strtab_size
+            .{ .fixed = @bitSizeOf(Builder.Comdat.SelectionKind) },
+        };
+        strtab_offset: usize,
+        strtab_size: usize,
+        selection_kind: Builder.Comdat.SelectionKind,
+    };
+
     pub const Variable = struct {
         const AddrSpaceAndIsConst = packed struct {
             is_const: bool,
@@ -233,7 +246,7 @@ pub const Module = struct {
             .{ .fixed = @bitSizeOf(Builder.UnnamedAddr) },
             .{ .fixed = @bitSizeOf(Builder.ExternallyInitialized) },
             .{ .fixed = @bitSizeOf(Builder.DllStorageClass) },
-            .{ .literal = 0 }, // comdat
+            .{ .vbr = 16 }, // comdat
             .{ .literal = 0 }, // attributes
             .{ .fixed = @bitSizeOf(Builder.Preemption) },
         };
@@ -250,6 +263,7 @@ pub const Module = struct {
         unnamed_addr: Builder.UnnamedAddr,
         externally_initialized: Builder.ExternallyInitialized,
         dllstorageclass: Builder.DllStorageClass,
+        comdat: u32,
         preemption: Builder.Preemption,
     };
 
diff --git a/src/Sema.zig b/src/Sema.zig
index 88a5c60e49d0..8452d2305943 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -35280,7 +35280,30 @@ fn resolveInferredErrorSet(
         // In this case we are dealing with the actual InferredErrorSet object that
         // corresponds to the function, not one created to track an inline/comptime call.
         const orig_func_index = ip.unwrapCoercedFunc(func_index);
-        try sema.addReferenceEntry(block, src, .wrap(.{ .func = orig_func_index }));
+        const ies_unit: AnalUnit = .wrap(.{ .func = orig_func_index });
+        if (zcu.parallel_sema and zcu.isClaimedByOther(ies_unit)) {
+            // The IES owner's body is being analysed by another worker right
+            // now. `ensureFuncBodyUpToDate` would park this thread on
+            // `sema_claim_cond` until that worker finishes — under chained
+            // IES dependencies that idles a core for the bulk of Sema.
+            // Instead, yield: re-queue `sema.owner` and let this thread
+            // pull the next job. Cap retries low because each one re-runs
+            // the caller's body from scratch; once the cap is hit, fall
+            // through to the blocking wait.
+            zcu.sema_retry_mutex.lock();
+            const tries: u8 = blk: {
+                const gop = zcu.sema_retry_counts.getOrPut(zcu.gpa, ies_unit) catch break :blk 255;
+                if (!gop.found_existing) gop.value_ptr.* = 0;
+                gop.value_ptr.* +|= 1;
+                break :blk gop.value_ptr.*;
+            };
+            zcu.sema_retry_mutex.unlock();
+            if (tries < 8) {
+                Zcu.tls_retry_loop = sema.owner;
+                return error.AnalysisFail;
+            }
+        }
+        try sema.addReferenceEntry(block, src, ies_unit);
         try pt.ensureFuncBodyUpToDate(orig_func_index);
     }
 
diff --git a/src/Zcu.zig b/src/Zcu.zig
index be7bedc06138..e2c2ae53ac60 100644
--- a/src/Zcu.zig
+++ b/src/Zcu.zig
@@ -3742,6 +3742,19 @@ pub fn releaseClaim(zcu: *Zcu, unit: AnalUnit) void {
     zcu.sema_claim_cond.broadcast();
 }
 
+/// Returns true if `unit` is currently claimed for analysis by a thread other
+/// than the caller. Used by `Sema.resolveInferredErrorSet` to yield-and-requeue
+/// instead of parking on `sema_claim_cond` when a dependency IES is already in
+/// progress on another worker.
+pub fn isClaimedByOther(zcu: *Zcu, unit: AnalUnit) bool {
+    if (!zcu.parallel_sema) return false;
+    const me = std.Thread.getCurrentId();
+    zcu.unit_claims_mutex.lock();
+    defer zcu.unit_claims_mutex.unlock();
+    const owner = zcu.unit_claims.get(unit) orelse return false;
+    return owner != me;
+}
+
 /// Under parallel Sema, `analysis_in_progress` is per-OS-thread (lock-free).
 threadlocal var tls_aip: std.AutoArrayHashMapUnmanaged(AnalUnit, void) = .empty;
 /// Set by `ensureNavResolved` when a dependency loop is detected under
@@ -4650,10 +4663,27 @@ pub fn navFileScope(zcu: *Zcu, nav: InternPool.Nav.Index) *File {
 }
 
 pub fn navShard(zcu: *Zcu, nav: InternPool.Nav.Index, n: u32) u32 {
+    if (n <= 1) return 0;
+    return zcu.navFileScope(nav).computeShard(n);
+}
+
+/// Returns the LLVM codegen shard that owns `unit`. Module-level assembly is
+/// keyed by `AnalUnit`; routing each asm string to the same shard as the
+/// same-file navs it references (e.g. `.set` alias targets) lets the integrated
+/// assembler resolve those symbols and avoids emitting any string twice.
+pub fn analUnitShard(zcu: *Zcu, unit: AnalUnit, n: u32) u32 {
     if (n <= 1) return 0;
     const ip = &zcu.intern_pool;
-    const fqn = ip.getNav(nav).fqn.toSlice(ip);
-    return @intCast(std.hash.Wyhash.hash(0, fqn) % n);
+    return switch (unit.unwrap()) {
+        .@"comptime" => |cu_id| s: {
+            const cu = ip.getComptimeUnit(cu_id);
+            const resolved = cu.zir_index.resolveFull(ip) orelse break :s 0;
+            break :s zcu.fileByIndex(resolved.file).computeShard(n);
+        },
+        .nav_val, .nav_ty => |nav| zcu.navShard(nav, n),
+        .func => |func| zcu.navShard(zcu.funcInfo(func).owner_nav, n),
+        .type, .memoized_state => 0,
+    };
 }
 
 pub fn fmtAnalUnit(zcu: *Zcu, unit: AnalUnit) std.fmt.Formatter(FormatAnalUnit, formatAnalUnit) {
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
index 1668bf67270f..97d28bc5579c 100644
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@@ -554,13 +554,16 @@ pub const PartitionSet = struct {
         exported: Zcu.Exported,
         export_indices: []const Zcu.Export.Index,
     ) link.File.UpdateExportsError!void {
-        const shard: u32 = switch (exported) {
-            .nav => |nav| pt.zcu.navShard(nav, self.n),
-            .uav => 0,
-        };
-        self.mutexes[shard].lock();
-        defer self.mutexes[shard].unlock();
-        return self.objects[shard].updateExports(pt, exported, export_indices);
+        // Route to every shard: the owning shard emits the definition and
+        // aliases; non-owning shards collapse any bare extern declarations
+        // they hold for these export names onto one canonical decl so LLVM
+        // cannot constant-fold `icmp eq @a, @b` between distinct externs to
+        // `false` before the linker has a chance to unify them.
+        for (self.objects, self.mutexes) |obj, *m| {
+            m.lock();
+            defer m.unlock();
+            try obj.updateExports(pt, exported, export_indices);
+        }
     }
 
     pub fn emit(self: *PartitionSet, pt: Zcu.PerThread, options: Object.EmitOptions) error{ LinkFailure, OutOfMemory }!void {
@@ -928,11 +931,13 @@ pub const Object = struct {
     }
 
     fn genModuleLevelAssembly(object: *Object, pt: Zcu.PerThread) Allocator.Error!void {
-        if (object.isSharded() and object.partition_id != 0) return;
         const b = &object.builder;
         const gpa = b.gpa;
+        const zcu = pt.zcu;
         b.module_asm.clearRetainingCapacity();
-        for (pt.zcu.global_assembly.values()) |assembly| {
+        const n: u32 = if (object.partition_set) |ps| ps.n else 1;
+        for (zcu.global_assembly.keys(), zcu.global_assembly.values()) |unit, assembly| {
+            if (zcu.analUnitShard(unit, n) != object.partition_id) continue;
             try b.module_asm.ensureUnusedCapacity(gpa, assembly.len + 1);
             b.module_asm.appendSliceAssumeCapacity(assembly);
             b.module_asm.appendAssumeCapacity('\n');
@@ -1805,15 +1810,76 @@ pub const Object = struct {
         export_indices: []const Zcu.Export.Index,
     ) link.File.UpdateExportsError!void {
         const zcu = pt.zcu;
+        const ip = &zcu.intern_pool;
         const nav_index = switch (exported) {
             .nav => |nav| nav,
             .uav => |uav| {
-                if (self.isSharded() and self.partition_id != 0) return;
+                if (self.isSharded() and self.partition_id != 0) {
+                    // Non-owning shard: collapse any extern decls held for these
+                    // export names onto this shard's `__anon_{idx}` so address
+                    // comparisons survive InstCombine. If this shard never
+                    // referenced the uav directly there is nothing local to
+                    // compare against, so skip.
+                    const canonical = self.uav_map.get(uav) orelse return;
+                    for (export_indices) |export_idx| {
+                        const exp_name = export_idx.ptr(zcu).opts.name.toSlice(ip);
+                        const s = self.builder.strtabStringIfExists(exp_name) orelse continue;
+                        const existing = self.builder.getGlobal(s) orelse continue;
+                        if (existing == canonical) continue;
+                        switch (existing.ptrConst(&self.builder).kind) {
+                            .variable, .function => {
+                                try existing.rename(.empty, &self.builder);
+                                try existing.replace(canonical, &self.builder);
+                            },
+                            .alias, .replaced => {},
+                        }
+                    }
+                    return;
+                }
                 return updateExportedValue(self, pt, uav, export_indices);
             },
         };
-        if (!self.ownsNav(zcu, nav_index)) return;
-        const ip = &zcu.intern_pool;
+        if (!self.ownsNav(zcu, nav_index)) {
+            // Non-owning shard may hold bare extern decls for one or more of
+            // these export names (created by resolveGlobalNav/resolveLlvmFunction
+            // on the synthetic extern owner Nav). LLVM constant-folds
+            // `icmp eq @a, @b` to `false` for any two distinct external
+            // GlobalValues, so collapse them onto one canonical declaration of
+            // the underlying nav — the same `{fqn}__N{idx}` symbol the owning
+            // shard exports its definition under.
+            if (export_indices.len == 0) return;
+            var any = false;
+            for (export_indices) |export_idx| {
+                const exp_name = export_idx.ptr(zcu).opts.name.toSlice(ip);
+                if (self.builder.strtabStringIfExists(exp_name)) |s|
+                    if (self.builder.getGlobal(s) != null) {
+                        any = true;
+                        break;
+                    };
+            }
+            if (!any) return;
+            const nav = ip.getNav(nav_index);
+            const is_fn = ip.isFunctionType(nav.typeOf(ip));
+            const canonical: Builder.Global.Index = if (is_fn)
+                (try self.resolveLlvmFunction(pt, nav_index)).ptrConst(&self.builder).global
+            else
+                (try self.resolveGlobalNav(pt, nav_index)).ptrConst(&self.builder).global;
+            canonical.setUnnamedAddr(.default, &self.builder);
+            for (export_indices) |export_idx| {
+                const exp_name = export_idx.ptr(zcu).opts.name.toSlice(ip);
+                const s = self.builder.strtabStringIfExists(exp_name) orelse continue;
+                const existing = self.builder.getGlobal(s) orelse continue;
+                if (existing == canonical) continue;
+                switch (existing.ptrConst(&self.builder).kind) {
+                    .variable, .function => {
+                        try existing.rename(.empty, &self.builder);
+                        try existing.replace(canonical, &self.builder);
+                    },
+                    .alias, .replaced => {},
+                }
+            }
+            return;
+        }
         const global_index = self.nav_map.get(nav_index) orelse gi: {
             // The nav was exported but its `link_nav` / `codegen_func` job
             // never ran (likely a post-commit retry under parallel Sema dropping
@@ -1912,7 +1978,13 @@ pub const Object = struct {
             try variable_index.setInitializer(init_val, &o.builder);
             if (o.isSharded()) {
                 variable_index.setLinkage(.linkonce_odr, &o.builder);
-                variable_index.setVisibility(.hidden, &o.builder);
+                if (o.target.ofmt == .coff) {
+                    // See resolveGlobalUav: COFF needs an explicit comdat for
+                    // linkonce_odr to dedup across shard objects.
+                    variable_index.setComdat(try o.builder.addComdat(def_name, .any), &o.builder);
+                } else {
+                    variable_index.setVisibility(.hidden, &o.builder);
+                }
                 variable_index.setMutability(.constant, &o.builder);
             }
             break :i global_index;
@@ -3327,8 +3399,9 @@ pub const Object = struct {
         const zcu = pt.zcu;
         const decl_ty = zcu.intern_pool.typeOf(uav);
 
+        const def_name = try o.builder.strtabStringFmt("__anon_{d}", .{@intFromEnum(uav)});
         const variable_index = try o.builder.addVariable(
-            try o.builder.strtabStringFmt("__anon_{d}", .{@intFromEnum(uav)}),
+            def_name,
             try o.lowerType(pt, Type.fromInterned(decl_ty)),
             llvm_addr_space,
         );
@@ -3341,7 +3414,15 @@ pub const Object = struct {
             // canonical definition per `__anon_{ip_index}` and let the linker
             // coalesce duplicates so every shard sees the same address.
             variable_index.setLinkage(.linkonce_odr, &o.builder);
-            variable_index.setVisibility(.hidden, &o.builder);
+            // ELF/Wasm lower linkonce_odr to a weak definition automatically; COFF
+            // does not, so without an explicit comdat lld-link sees N strong
+            // `__anon_N` symbols and rejects the link. MachO forbids comdats but
+            // also coalesces linkonce_odr on its own.
+            if (o.target.ofmt == .coff) {
+                variable_index.setComdat(try o.builder.addComdat(def_name, .any), &o.builder);
+            } else {
+                variable_index.setVisibility(.hidden, &o.builder);
+            }
         } else {
             variable_index.setLinkage(.internal, &o.builder);
         }

From e116b29c7d303142f0fe6134bc011bca2c5105a5 Mon Sep 17 00:00:00 2001
From: Alistair Smith <hi@alistair.sh>
Date: Thu, 9 Apr 2026 17:22:44 -0700
Subject: [PATCH 05/15] psema: fix self-ref global false-positive cycle +
 cross-thread field_types_wip; fork bugfixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Sema.analyzeNavRefInner: revert is_ref to .type-only resolve under
  parallel_sema (the .fully override created a structural self-dep on
  nav_val for 'const foo = .{ .self = &foo }'). The torn-read concern
  was unfounded — getNav returns by-value and isExternOrFn handles both
  status arms; the extern/fn branch already re-ensures .fully before
  dereferencing .fully_resolved.val.
- Type.hasRuntimeBitsInner/comptimeOnlyInner: gate the four
  .field_types_wip self-recursion shortcuts on !isClaimedByOther so a
  wip flag set by another worker falls through to claimOrWait instead of
  poisoning assumed_runtime_bits.
- main.zig: -fno-sanitize=address was hardcoded =true (pre-existing).
- lib/std/os/linux.zig: clock_getres/settime @intFromEnum on clockid_t
  (pre-existing; @as(isize, enum) is invalid).
---
 lib/std/os/linux.zig |  4 ++--
 src/Sema.zig         | 17 ++++++++++------
 src/Type.zig         | 46 +++++++++++++++++++++++++++++++++++---------
 src/main.zig         |  2 +-
 4 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig
index 083453d1d211..b9873359cc80 100644
--- a/lib/std/os/linux.zig
+++ b/lib/std/os/linux.zig
@@ -1651,7 +1651,7 @@ fn init_vdso_clock_gettime(clk: clockid_t, ts: *timespec) callconv(.c) usize {
 pub fn clock_getres(clk_id: clockid_t, tp: *timespec) usize {
     return syscall2(
         if (@hasField(SYS, "clock_getres")) .clock_getres else .clock_getres_time64,
-        @as(usize, @bitCast(@as(isize, clk_id))),
+        @as(usize, @bitCast(@as(isize, @intFromEnum(clk_id)))),
         @intFromPtr(tp),
     );
 }
@@ -1659,7 +1659,7 @@ pub fn clock_getres(clk_id: clockid_t, tp: *timespec) usize {
 pub fn clock_settime(clk_id: clockid_t, tp: *const timespec) usize {
     return syscall2(
         if (@hasField(SYS, "clock_settime")) .clock_settime else .clock_settime64,
-        @as(usize, @bitCast(@as(isize, clk_id))),
+        @as(usize, @bitCast(@as(isize, @intFromEnum(clk_id)))),
         @intFromPtr(tp),
     );
 }
diff --git a/src/Sema.zig b/src/Sema.zig
index 8452d2305943..d66c1ef26367 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -31503,12 +31503,17 @@ fn analyzeNavRefInner(sema: *Sema, block: *Block, src: LazySrcLoc, orig_nav_inde
     const zcu = pt.zcu;
     const ip = &zcu.intern_pool;
 
-    // Under parallel Sema another thread may transition the nav from
-    // .type_resolved → .fully_resolved between our ensureNavResolved and
-    // the getNav below, leaving a torn read in `isExternOrFn`. Fully
-    // resolving here serialises via claimOrWait so the subsequent getNav
-    // observes a stable status.
-    try sema.ensureNavResolved(block, src, orig_nav_index, if (is_ref and !zcu.parallel_sema) .type else .fully);
+    // For `is_ref` we resolve only `.type`: self-referential globals
+    // (`const foo: T = .{ .self = &foo }`) require the lazy nav-ptr path
+    // below, which only needs the type. Forcing `.fully` here under
+    // parallel_sema made `semaAipContains(.nav_val)` fire on the in-progress
+    // unit and retry-exhaust into a spurious "dependency loop detected".
+    // A concurrent .type_resolved -> .fully_resolved transition between
+    // here and the `getNav` below is harmless: `getNav` returns a by-value
+    // snapshot, `isExternOrFn` and the status switches below handle both
+    // arms, and the extern/fn branch re-ensures `.fully` before reading
+    // `.fully_resolved.val`.
+    try sema.ensureNavResolved(block, src, orig_nav_index, if (is_ref) .type else .fully);
 
     const nav_index = nav: {
         if (ip.getNav(orig_nav_index).isExternOrFn(ip)) {
diff --git a/src/Type.zig b/src/Type.zig
index 1659199d849f..04b2d0bb7100 100644
--- a/src/Type.zig
+++ b/src/Type.zig
@@ -566,10 +566,18 @@ pub fn hasRuntimeBitsInner(
             },
             .struct_type => {
                 const struct_type = ip.loadStructType(ty.toIntern());
-                if (strat != .eager and struct_type.assumeRuntimeBitsIfFieldTypesWip(ip)) {
-                    // In this case, we guess that hasRuntimeBits() for this type is true,
-                    // and then later if our guess was incorrect, we emit a compile error.
-                    return true;
+                if (strat != .eager) {
+                    // Under parallel Sema, .field_types_wip may belong to another worker
+                    // (the queued resolve_type_fully job). Only treat it as self-recursion
+                    // if WE hold the claim; otherwise fall through to resolveFields which
+                    // will block on claimOrWait instead of poisoning assumed_runtime_bits.
+                    if (!zcu.isClaimedByOther(.wrap(.{ .type = ty.toIntern() })) and
+                        struct_type.assumeRuntimeBitsIfFieldTypesWip(ip))
+                    {
+                        // In this case, we guess that hasRuntimeBits() for this type is true,
+                        // and then later if our guess was incorrect, we emit a compile error.
+                        return true;
+                    }
                 }
                 switch (strat) {
                     .sema => try ty.resolveFields(strat.pt(zcu, tid)),
@@ -603,9 +611,17 @@ pub fn hasRuntimeBitsInner(
                 const union_flags = union_type.flagsUnordered(ip);
                 switch (union_flags.runtime_tag) {
                     .none => if (strat != .eager) {
-                        // In this case, we guess that hasRuntimeBits() for this type is true,
-                        // and then later if our guess was incorrect, we emit a compile error.
-                        if (union_type.assumeRuntimeBitsIfFieldTypesWip(ip)) return true;
+                        // Under parallel Sema, .field_types_wip may belong to another worker
+                        // (the queued resolve_type_fully job). Only treat it as self-recursion
+                        // if WE hold the claim; otherwise fall through to resolveFields which
+                        // will block on claimOrWait instead of poisoning assumed_runtime_bits.
+                        if (!zcu.isClaimedByOther(.wrap(.{ .type = ty.toIntern() })) and
+                            union_type.assumeRuntimeBitsIfFieldTypesWip(ip))
+                        {
+                            // In this case, we guess that hasRuntimeBits() for this type is true,
+                            // and then later if our guess was incorrect, we emit a compile error.
+                            return true;
+                        }
                     },
                     .safety, .tagged => {},
                 }
@@ -2830,7 +2846,13 @@ pub fn comptimeOnlyInner(
                         .no, .wip => false,
                         .yes => true,
                         .unknown => {
-                            if (struct_type.flagsUnordered(ip).field_types_wip) {
+                            // Under parallel Sema, .field_types_wip set by another worker is
+                            // concurrent progress, not self-recursion: fall through to
+                            // resolveFields (which blocks on claimOrWait) rather than
+                            // guessing `false` and caching a wrong requires_comptime.
+                            if (struct_type.flagsUnordered(ip).field_types_wip and
+                                !zcu.isClaimedByOther(.wrap(.{ .type = ty.toIntern() })))
+                            {
                                 struct_type.setRequiresComptime(ip, .unknown);
                                 return false;
                             }
@@ -2891,7 +2913,13 @@ pub fn comptimeOnlyInner(
                         .no, .wip => return false,
                         .yes => return true,
                         .unknown => {
-                            if (union_type.flagsUnordered(ip).status == .field_types_wip) {
+                            // Under parallel Sema, .field_types_wip set by another worker is
+                            // concurrent progress, not self-recursion: fall through to
+                            // resolveFields (which blocks on claimOrWait) rather than
+                            // guessing `false` and caching a wrong requires_comptime.
+                            if (union_type.flagsUnordered(ip).status == .field_types_wip and
+                                !zcu.isClaimedByOther(.wrap(.{ .type = ty.toIntern() })))
+                            {
                                 union_type.setRequiresComptime(ip, .unknown);
                                 return false;
                             }
diff --git a/src/main.zig b/src/main.zig
index 6bf88820dd7c..3c6790420163 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -2273,7 +2273,7 @@ fn buildOutputType(
                                 mod_opts.sanitize_thread = enable;
                                 recognized_any = true;
                             } else if (mem.eql(u8, sub_arg, "address")) {
-                                mod_opts.sanitize_address = true;
+                                mod_opts.sanitize_address = enable;
                                 recognized_any = true;
                             } else if (mem.eql(u8, sub_arg, "fuzzer") or mem.eql(u8, sub_arg, "fuzzer-no-link")) {
                                 mod_opts.fuzz = enable;

From 3298e3f420f1f498f988125978a825b8614ed550 Mon Sep 17 00:00:00 2001
From: Alistair Smith <hi@alistair.sh>
Date: Thu, 9 Apr 2026 17:31:38 -0700
Subject: [PATCH 06/15] test/behavior/cast: check error-set membership not
 @typeInfo order

Under parallel sema, error-name InternPool indices (and thus the
index-sorted @typeInfo order) depend on which thread interns first.
The language does not specify error-set @typeInfo ordering; check
membership instead. The same change is applied to
test/behavior/type_info.zig.
---
 test/behavior/cast.zig      | 45 ++++++++++++++++++++-----------------
 test/behavior/type_info.zig | 15 +++++++++----
 2 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/test/behavior/cast.zig b/test/behavior/cast.zig
index e55a4dc20cb4..7019857bd332 100644
--- a/test/behavior/cast.zig
+++ b/test/behavior/cast.zig
@@ -8,6 +8,11 @@ const mem = std.mem;
 const maxInt = std.math.maxInt;
 const native_endian = builtin.target.cpu.arch.endian();
 
+fn errorSetContains(set: []const std.builtin.Type.Error, name: []const u8) bool {
+    for (set) |e| if (mem.eql(u8, e.name, name)) return true;
+    return false;
+}
+
 test "int to ptr cast" {
     if (builtin.zig_backend == .stage2_spirv) return error.SkipZigTest;
 
@@ -885,8 +890,8 @@ test "peer type resolution: error set supersets" {
         const error_set_info = @typeInfo(ty);
         try expect(error_set_info == .error_set);
         try expect(error_set_info.error_set.?.len == 2);
-        try expect(mem.eql(u8, error_set_info.error_set.?[0].name, "One"));
-        try expect(mem.eql(u8, error_set_info.error_set.?[1].name, "Two"));
+        try expect(errorSetContains(error_set_info.error_set.?, "One"));
+        try expect(errorSetContains(error_set_info.error_set.?, "Two"));
     }
 
     // B superset of A
@@ -895,8 +900,8 @@ test "peer type resolution: error set supersets" {
         const error_set_info = @typeInfo(ty);
         try expect(error_set_info == .error_set);
         try expect(error_set_info.error_set.?.len == 2);
-        try expect(mem.eql(u8, error_set_info.error_set.?[0].name, "One"));
-        try expect(mem.eql(u8, error_set_info.error_set.?[1].name, "Two"));
+        try expect(errorSetContains(error_set_info.error_set.?, "One"));
+        try expect(errorSetContains(error_set_info.error_set.?, "Two"));
     }
 }
 
@@ -913,9 +918,9 @@ test "peer type resolution: disjoint error sets" {
         const error_set_info = @typeInfo(ty);
         try expect(error_set_info == .error_set);
         try expect(error_set_info.error_set.?.len == 3);
-        try expect(mem.eql(u8, error_set_info.error_set.?[0].name, "One"));
-        try expect(mem.eql(u8, error_set_info.error_set.?[1].name, "Two"));
-        try expect(mem.eql(u8, error_set_info.error_set.?[2].name, "Three"));
+        try expect(errorSetContains(error_set_info.error_set.?, "One"));
+        try expect(errorSetContains(error_set_info.error_set.?, "Two"));
+        try expect(errorSetContains(error_set_info.error_set.?, "Three"));
     }
 
     {
@@ -923,9 +928,9 @@ test "peer type resolution: disjoint error sets" {
         const error_set_info = @typeInfo(ty);
         try expect(error_set_info == .error_set);
         try expect(error_set_info.error_set.?.len == 3);
-        try expect(mem.eql(u8, error_set_info.error_set.?[0].name, "One"));
-        try expect(mem.eql(u8, error_set_info.error_set.?[1].name, "Two"));
-        try expect(mem.eql(u8, error_set_info.error_set.?[2].name, "Three"));
+        try expect(errorSetContains(error_set_info.error_set.?, "One"));
+        try expect(errorSetContains(error_set_info.error_set.?, "Two"));
+        try expect(errorSetContains(error_set_info.error_set.?, "Three"));
     }
 }
 
@@ -944,9 +949,9 @@ test "peer type resolution: error union and error set" {
 
         const error_set_info = @typeInfo(info.error_union.error_set);
         try expect(error_set_info.error_set.?.len == 3);
-        try expect(mem.eql(u8, error_set_info.error_set.?[0].name, "One"));
-        try expect(mem.eql(u8, error_set_info.error_set.?[1].name, "Two"));
-        try expect(mem.eql(u8, error_set_info.error_set.?[2].name, "Three"));
+        try expect(errorSetContains(error_set_info.error_set.?, "One"));
+        try expect(errorSetContains(error_set_info.error_set.?, "Two"));
+        try expect(errorSetContains(error_set_info.error_set.?, "Three"));
     }
 
     {
@@ -956,9 +961,9 @@ test "peer type resolution: error union and error set" {
 
         const error_set_info = @typeInfo(info.error_union.error_set);
         try expect(error_set_info.error_set.?.len == 3);
-        try expect(mem.eql(u8, error_set_info.error_set.?[0].name, "One"));
-        try expect(mem.eql(u8, error_set_info.error_set.?[1].name, "Two"));
-        try expect(mem.eql(u8, error_set_info.error_set.?[2].name, "Three"));
+        try expect(errorSetContains(error_set_info.error_set.?, "One"));
+        try expect(errorSetContains(error_set_info.error_set.?, "Two"));
+        try expect(errorSetContains(error_set_info.error_set.?, "Three"));
     }
 }
 
@@ -978,8 +983,8 @@ test "peer type resolution: error union after non-error" {
 
         const error_set_info = @typeInfo(info.error_union.error_set);
         try expect(error_set_info.error_set.?.len == 2);
-        try expect(mem.eql(u8, error_set_info.error_set.?[0].name, "One"));
-        try expect(mem.eql(u8, error_set_info.error_set.?[1].name, "Two"));
+        try expect(errorSetContains(error_set_info.error_set.?, "One"));
+        try expect(errorSetContains(error_set_info.error_set.?, "Two"));
     }
 
     {
@@ -990,8 +995,8 @@ test "peer type resolution: error union after non-error" {
 
         const error_set_info = @typeInfo(info.error_union.error_set);
         try expect(error_set_info.error_set.?.len == 2);
-        try expect(mem.eql(u8, error_set_info.error_set.?[0].name, "One"));
-        try expect(mem.eql(u8, error_set_info.error_set.?[1].name, "Two"));
+        try expect(errorSetContains(error_set_info.error_set.?, "One"));
+        try expect(errorSetContains(error_set_info.error_set.?, "Two"));
     }
 }
 
diff --git a/test/behavior/type_info.zig b/test/behavior/type_info.zig
index 48b10c458aa3..cd17b25baae7 100644
--- a/test/behavior/type_info.zig
+++ b/test/behavior/type_info.zig
@@ -9,6 +9,11 @@ const assert = std.debug.assert;
 const expect = std.testing.expect;
 const expectEqualStrings = std.testing.expectEqualStrings;
 
+fn errorSetContains(set: []const Type.Error, name: []const u8) bool {
+    for (set) |e| if (mem.eql(u8, e.name, name)) return true;
+    return false;
+}
+
 test "type info: integer, floating point type info" {
     try testIntFloat();
     try comptime testIntFloat();
@@ -176,7 +181,9 @@ fn testErrorSet() !void {
     const error_set_info = @typeInfo(TestErrorSet);
     try expect(error_set_info == .error_set);
     try expect(error_set_info.error_set.?.len == 3);
-    try expect(mem.eql(u8, error_set_info.error_set.?[0].name, "First"));
+    try expect(errorSetContains(error_set_info.error_set.?, "First"));
+    try expect(errorSetContains(error_set_info.error_set.?, "Second"));
+    try expect(errorSetContains(error_set_info.error_set.?, "Third"));
 
     const error_union_info = @typeInfo(TestErrorSet!usize);
     try expect(error_union_info == .error_union);
@@ -211,9 +218,9 @@ test "type info: error set merged" {
     const error_set_info = @typeInfo(TestSet);
     try expect(error_set_info == .error_set);
     try expect(error_set_info.error_set.?.len == 3);
-    try expect(mem.eql(u8, error_set_info.error_set.?[0].name, "One"));
-    try expect(mem.eql(u8, error_set_info.error_set.?[1].name, "Two"));
-    try expect(mem.eql(u8, error_set_info.error_set.?[2].name, "Three"));
+    try expect(errorSetContains(error_set_info.error_set.?, "One"));
+    try expect(errorSetContains(error_set_info.error_set.?, "Two"));
+    try expect(errorSetContains(error_set_info.error_set.?, "Three"));
 }
 
 test "type info: enum info" {

From 51433bc13ac253e9f088f376a7bf4daca904a2fe Mon Sep 17 00:00:00 2001
From: root <root@ip-10-0-2-234.us-west-2.compute.internal>
Date: Sun, 19 Apr 2026 17:40:37 +0000
Subject: [PATCH 07/15] psema: shard unit_claims, drop sema_lock under
 non-incremental; misc fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ZIG_PARALLEL_SEMA on behavior.zig: ~18s serial → ~2.2s at j=16 (8.1x),
1.14x CPU overhead. 0/130 stress runs failed across j=8/16/32/64 + full
exec. With -fllvm --llvm-codegen-threads=32: 9.9s → 2.15s.

parallel sema:
- Zcu: shard unit_claims into 256 {mutex,cond,map,deferred,waiters};
  tryClaim/claimOrWait/releaseClaim/isClaimedByOther/deferOn lock only
  the unit's shard. claim_waits gets its own mutex; detectClaimCycle
  walks via tryLock peeks (skip on contended foreign shard).
- Zcu: tryClaim() non-blocking; ensureFuncBodyUpToDate top-level skips
  on busy instead of parking.
- Replace sema_lock under parallel non-incremental with fine-grained
  locks: embed_mutex, global_assembly_mutex, file_system_inputs_mutex,
  per-Namespace decls_mutex, comp.mutex for ensureFileAnalyzed.
  resolveStructInner/resolveUnionInner gated like the ensure* sites.
- Sema.resolveInferredErrorSet: drop the shared 8-retry yield cap; the
  nested ensureFuncBodyUpToDate blocks on the (now sharded) claim
  instead of re-running the caller body.
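- awaitNamespaceTypeFinishedBounded: new variant that returns .would_block
  after max_spins instead of spinning unboundedly; callers
  yield-and-requeue. getNamespace/enumFieldIndex keep the
  awaitNamespaceTypeFinished spin variant (now a wrapper capped at
  maxInt(u32) spins) per their finished-type contract.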
- Compilation: work_queue_cond replaces the dispatch loop's
  Thread.yield() busy-spin; queueJob/workerAnalyzeFunc signal it.
- main: ReleaseSafe uses smp_allocator (debug_allocator's single mutex
  serialised every alloc and dominated wall time).
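- ZIG_PSEMA_STATS: when set, performAllTheWork prints the psema counters
  (body_runs, yields, claim_waits, dispatched, skip_busy, skip_done).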

races fixed:
- Type.comptimeOnlyInner .normal strat: .wip/.unknown observed under
  parallel sema → false (per documented contract) instead of unreachable.
- Zcu.maybeUnresolveIes: early-return under parallel non-incremental;
  the unlocked outdated.contains() raced scanDecl's writes.
- InternPool.getIfExists: skip .removed entries.

misc fixes from branch sweep:
- Package/Fetch: promoted lazy→eager dep is now appended to all_fetches
  (arena leak + double-fetch + dropped errors otherwise).
- std.fs.Dir.realpath on Windows for ".": resolve into a stack buffer and
  return error.NameTooLong if the result does not fit out_buffer;
  previously it sliced out_buffer to max_path_bytes unconditionally.
- link/Elf, link/MachO: use base.resolveZcuObjectPaths instead of
  open-coding the {stem}.{i}.o expansion.
- Compilation.dumpLlvmShardStats: use zcu.navShard (was hashing fqn,
  which doesn't match the file-path router).
- codegen/llvm: free the bin_filename_list gpa allocation.
- zig_llvm.cpp: delete dead getAsanOptions().
- target.zig: .@"async" → .async.
- libs/libcxx: @intFromBool instead of @as(u1, if ...).
---
 lib/std/fs/Dir.zig    |   8 +-
 src/Compilation.zig   |  69 ++++++++++--
 src/InternPool.zig    |  16 ++-
 src/Package/Fetch.zig |  23 +---
 src/Sema.zig          |  68 ++++++++----
 src/Type.zig          |  43 ++++---
 src/Zcu.zig           | 253 +++++++++++++++++++++++++++++++++---------
 src/Zcu/PerThread.zig |  52 ++++++---
 src/codegen/llvm.zig  |   3 +
 src/libs/libcxx.zig   |   6 +-
 src/link/Elf.zig      |  29 +----
 src/link/MachO.zig    |  19 +---
 src/main.zig          |   7 +-
 src/target.zig        |   2 +-
 src/zig_llvm.cpp      |  11 --
 15 files changed, 414 insertions(+), 195 deletions(-)

diff --git a/lib/std/fs/Dir.zig b/lib/std/fs/Dir.zig
index d3c0c3d21583..5cbe45132398 100644
--- a/lib/std/fs/Dir.zig
+++ b/lib/std/fs/Dir.zig
@@ -1309,8 +1309,12 @@ pub fn realpath(self: Dir, pathname: []const u8, out_buffer: []u8) RealPathError
     }
     if (native_os == .windows) {
         if (pathname.len == 1 and pathname[0] == '.') {
-            const ptr: *[std.fs.max_path_bytes]u8 = out_buffer[0..std.fs.max_path_bytes];
-            return try std.os.getFdPath(self.fd, ptr);
+            var buffer: [fs.max_path_bytes]u8 = undefined;
+            const out_path = try std.os.getFdPath(self.fd, &buffer);
+            if (out_path.len > out_buffer.len) return error.NameTooLong;
+            const result = out_buffer[0..out_path.len];
+            @memcpy(result, out_path);
+            return result;
         }
         const pathname_w = try windows.sliceToPrefixedFileW(self.fd, pathname);
         return self.realpathW(pathname_w.span(), out_buffer);
diff --git a/src/Compilation.zig b/src/Compilation.zig
index bf0f2a9dc2da..2987ec256f43 100644
--- a/src/Compilation.zig
+++ b/src/Compilation.zig
@@ -128,6 +128,10 @@ work_queues: [
 ]DeprecatedLinearFifo(Job),
 /// Protects `work_queues` when Sema runs on worker threads and calls `queueJob`.
 work_queue_mutex: std.Thread.Mutex = .{},
+/// Signalled by `queueJob` and when `sema_pending_jobs` reaches 0, so the
+/// dispatch loop in `performAllTheWork` parks instead of busy-spinning on
+/// `Thread.yield()` while parallel-Sema workers are running.
+work_queue_cond: std.Thread.Condition = .{},
 
 /// These jobs are to invoke the Clang compiler to create an object file, which
 /// gets linked with the Compilation.
@@ -278,6 +282,9 @@ no_merge_shards: bool,
 time_report: ?TimeReport,
 
 file_system_inputs: ?*std.ArrayListUnmanaged(u8),
+/// Guards appends to `file_system_inputs`, which are made from `newEmbedFile`
+/// (sema workers) and C-object workers concurrently under parallel sema.
+file_system_inputs_mutex: std.Thread.Mutex = .{},
 
 /// This is the digest of the cache for the current compilation.
 /// This digest will be known after update() is called.
@@ -3321,8 +3328,7 @@ fn dumpLlvmShardStats(comp: *Compilation, zcu: *Zcu) void {
             skipped += 1;
             continue;
         }
-        const fqn = nav.fqn.toSlice(ip);
-        const shard: u8 = @intCast(std.hash.Wyhash.hash(0, fqn) % n);
+        const shard: u8 = @intCast(zcu.navShard(nav_index, n));
         counts[shard] += 1;
         const file = zcu.fileByIndex(nav.srcInst(ip).resolveFile(ip));
         const gop = per_file.getOrPut(.{ .file = file, .shard = shard }) catch continue;
@@ -3363,6 +3369,8 @@ fn dumpLlvmShardStats(comp: *Compilation, zcu: *Zcu) void {
 pub fn appendFileSystemInput(comp: *Compilation, path: Compilation.Path) Allocator.Error!void {
     const gpa = comp.gpa;
     const fsi = comp.file_system_inputs orelse return;
+    comp.file_system_inputs_mutex.lock();
+    defer comp.file_system_inputs_mutex.unlock();
     const prefixes = comp.cache_parent.prefixes();
 
     const want_prefix_dir: Cache.Directory = switch (path.root) {
@@ -5202,7 +5210,11 @@ fn performAllTheWork(
                 // produce duplicate analyze_func jobs and N-1 workers then
                 // condvar-wait on the one analyzer.
                 const a = zcu.intern_pool.funcAnalysisUnordered(job.analyze_func);
-                if (a.is_analyzed) continue :work;
+                if (a.is_analyzed) {
+                    _ = zcu.psema_skip_done.rmw(.Add, 1, .monotonic);
+                    continue :work;
+                }
+                _ = zcu.psema_dispatched.rmw(.Add, 1, .monotonic);
                 _ = zcu.sema_pending_jobs.rmw(.Add, 1, .acquire);
                 comp.thread_pool.spawnWgId(&comp.link_task_wait_group, workerAnalyzeFunc, .{ comp, job.analyze_func });
                 continue :work;
@@ -5217,7 +5229,19 @@ fn performAllTheWork(
         }
         if (comp.zcu) |zcu| {
             if (zcu.sema_pending_jobs.load(.acquire) > 0) {
-                std.Thread.yield() catch {};
+                // Park until a worker enqueues new work or the last
+                // pending sema job finishes; busy-spinning here contended
+                // `work_queue_mutex` against every `queueJob` call.
+                comp.work_queue_mutex.lock();
+                if (zcu.sema_pending_jobs.load(.acquire) > 0) {
+                    var any: bool = false;
+                    for (&comp.work_queues) |*q| if (q.count > 0) {
+                        any = true;
+                        break;
+                    };
+                    if (!any) comp.work_queue_cond.wait(&comp.work_queue_mutex);
+                }
+                comp.work_queue_mutex.unlock();
                 continue :work;
             }
             // A worker may have enqueued between our queue read and the
@@ -5265,7 +5289,19 @@ fn performAllTheWork(
         }
         break;
     }
-    if (comp.zcu) |zcu| zcu.parallel_sema = false;
+    if (comp.zcu) |zcu| {
+        if (std.process.hasNonEmptyEnvVarConstant("ZIG_PSEMA_STATS")) {
+            std.debug.print("[PSEMA] body_runs={d} yields={d} claim_waits={d} dispatched={d} skip_busy={d} skip_done={d}\n", .{
+                zcu.psema_body_runs.load(.monotonic),
+                zcu.psema_yields.load(.monotonic),
+                zcu.psema_claim_waits.load(.monotonic),
+                zcu.psema_dispatched.load(.monotonic),
+                zcu.psema_skip_busy.load(.monotonic),
+                zcu.psema_skip_done.load(.monotonic),
+            });
+        }
+        zcu.parallel_sema = false;
+    }
     if (comp.llvm_shard_stats or std.process.hasNonEmptyEnvVarConstant("ZIG_JOB_STATS")) {
         std.debug.print("=== work loop job timings (main thread) ===\n", .{});
         inline for (@typeInfo(Job.Tag).@"enum".fields, 0..) |f, i| {
@@ -5281,6 +5317,7 @@ pub fn queueJob(comp: *Compilation, job: Job) !void {
     comp.work_queue_mutex.lock();
     defer comp.work_queue_mutex.unlock();
     try comp.work_queues[Job.stage(job)].writeItem(job);
+    comp.work_queue_cond.signal();
 }
 
 pub fn queueJobs(comp: *Compilation, jobs: []const Job) !void {
@@ -6148,19 +6185,31 @@ fn workerAnalyzeFunc(tid: usize, comp: *Compilation, func: InternPool.Index) voi
     const pt: Zcu.PerThread = .activate(zcu, @enumFromInt(tid));
     defer pt.deactivate();
     Zcu.tls_retry_loop = null;
+    Zcu.tls_retry_dep = null;
     pt.ensureFuncBodyUpToDate(func) catch |err| switch (err) {
         error.OutOfMemory => comp.setAllocFailure(),
         error.AnalysisFail => {
             if (Zcu.tls_retry_loop != null) {
-                // Order-dependent dependency loop: re-queue this func so
-                // another thread (or a later attempt) can try after
-                // intermediates have been resolved independently.
+                _ = zcu.psema_yields.rmw(.Add, 1, .monotonic);
                 Zcu.tls_retry_loop = null;
-                comp.queueJob(.{ .analyze_func = func }) catch comp.setAllocFailure();
+                if (Zcu.tls_retry_dep) |dep| {
+                    Zcu.tls_retry_dep = null;
+                    if (zcu.deferOn(dep, func) catch false) {
+                        _ = zcu.sema_pending_jobs.rmw(.Add, 1, .acquire);
+                    } else {
+                        comp.queueJob(.{ .analyze_func = func }) catch comp.setAllocFailure();
+                    }
+                } else {
+                    comp.queueJob(.{ .analyze_func = func }) catch comp.setAllocFailure();
+                }
             }
         },
     };
-    _ = zcu.sema_pending_jobs.rmw(.Sub, 1, .release);
+    if (zcu.sema_pending_jobs.rmw(.Sub, 1, .release) == 1) {
+        comp.work_queue_mutex.lock();
+        comp.work_queue_cond.signal();
+        comp.work_queue_mutex.unlock();
+    }
 }
 
 fn workerZcuCodegen(
diff --git a/src/InternPool.zig b/src/InternPool.zig
index 768b0b6adcce..0853b1c7c703 100644
--- a/src/InternPool.zig
+++ b/src/InternPool.zig
@@ -7878,7 +7878,14 @@ pub const wip_namespace_sentinel: u32 = std.math.maxInt(u32);
 /// `NamespaceIndex` (0 is a valid index).
 pub const cancelled_namespace_sentinel: u32 = std.math.maxInt(u32) - 1;
 
-pub const NamespaceTypeAwaitResult = enum { finished, cancelled };
+pub const NamespaceTypeAwaitResult = enum {
+    finished,
+    cancelled,
+    /// Another thread holds the wip and the caller is not the same-thread
+    /// owner. Caller should yield-and-requeue (set `tls_retry_loop`) instead
+    /// of spinning, since it may hold a `unit_claim` the wip owner needs.
+    would_block,
+};
 
 /// Spin until `ty`'s namespace slot is no longer the wip sentinel. Returns
 /// `.cancelled` if the wip owner invoked `cancel` (slot now holds
@@ -7886,8 +7893,12 @@ pub const NamespaceTypeAwaitResult = enum { finished, cancelled };
 /// retry the originating `get*Type` call, which will skip the now-`.removed`
 /// map entry and allocate fresh.
 pub fn awaitNamespaceTypeFinished(ip: *const InternPool, ty: Index) NamespaceTypeAwaitResult {
+    return ip.awaitNamespaceTypeFinishedBounded(ty, std.math.maxInt(u32));
+}
+pub fn awaitNamespaceTypeFinishedBounded(ip: *const InternPool, ty: Index, max_spins: u32) NamespaceTypeAwaitResult {
     const ns_idx = ip.namespaceTypeNamespaceExtraIndex(ty) orelse return .finished;
     const unwrapped = ty.unwrap(ip);
+    var spins: u32 = 0;
     while (true) {
         // Re-acquire the shared view each iteration: the owning tid may
         // realloc its extra array between `getStructType` and `finish`, which
@@ -7896,6 +7907,8 @@ pub fn awaitNamespaceTypeFinished(ip: *const InternPool, ty: Index) NamespaceTyp
         const slot: *const u32 = &extra.view().items(.@"0")[ns_idx];
         const loaded = @atomicLoad(u32, slot, .acquire);
         if (loaded == wip_namespace_sentinel) {
+            spins += 1;
+            if (spins >= max_spins) return .would_block;
             std.atomic.spinLoopHint();
             continue;
         }
@@ -10646,6 +10659,7 @@ pub fn getIfExists(ip: *const InternPool, key: Key) ?Index {
         const index = entry.acquire();
         if (index == .none) return null;
         if (entry.hash != hash) continue;
+        if (index.unwrap(ip).getTag(ip) == .removed) continue;
         if (ip.indexToKey(index).eql(key, ip)) return index;
     }
 }
diff --git a/src/Package/Fetch.zig b/src/Package/Fetch.zig
index 3d2cc4cfb8ca..9225d9193802 100644
--- a/src/Package/Fetch.zig
+++ b/src/Package/Fetch.zig
@@ -739,7 +739,6 @@ fn queueJobsForDeps(f: *Fetch) RunError!void {
         // for fetching.
 
         for (dep_names, deps) |dep_name, dep| {
-            var promoted_existing_to_eager = false;
             const new_fetch = &new_fetches[new_fetch_index];
             const location: Location = switch (dep.location) {
                 .url => |url| .{
@@ -750,13 +749,8 @@ fn queueJobsForDeps(f: *Fetch) RunError!void {
                             const pkg_hash: Package.Hash = .fromSlice(h);
                             if (h.len == 0) break :h pkg_hash;
                             const gop = f.job_queue.table.getOrPutAssumeCapacity(pkg_hash);
-                            if (gop.found_existing) {
-                                if (!dep.lazy and gop.value_ptr.*.lazy_status != .eager) {
-                                    gop.value_ptr.*.lazy_status = .eager;
-                                    promoted_existing_to_eager = true;
-                                } else {
-                                    continue;
-                                }
+                            if (gop.found_existing and (dep.lazy or gop.value_ptr.*.lazy_status == .eager)) {
+                                continue;
                             }
                             gop.value_ptr.* = new_fetch;
                             break :h pkg_hash;
@@ -769,13 +763,8 @@ fn queueJobsForDeps(f: *Fetch) RunError!void {
                     const new_root = try f.package_root.resolvePosix(parent_arena, rel_path);
                     const pkg_hash = relativePathDigest(new_root, cache_root);
                     const gop = f.job_queue.table.getOrPutAssumeCapacity(pkg_hash);
-                    if (gop.found_existing) {
-                        if (!dep.lazy and gop.value_ptr.*.lazy_status != .eager) {
-                            gop.value_ptr.*.lazy_status = .eager;
-                            promoted_existing_to_eager = true;
-                        } else {
-                            continue;
-                        }
+                    if (gop.found_existing and (dep.lazy or gop.value_ptr.*.lazy_status == .eager)) {
+                        continue;
                     }
                     gop.value_ptr.* = new_fetch;
                     break :l .{ .relative_path = new_root };
@@ -783,9 +772,7 @@ fn queueJobsForDeps(f: *Fetch) RunError!void {
             };
             prog_names[new_fetch_index] = dep_name;
             new_fetch_index += 1;
-            if (!promoted_existing_to_eager) {
-                f.job_queue.all_fetches.appendAssumeCapacity(new_fetch);
-            }
+            f.job_queue.all_fetches.appendAssumeCapacity(new_fetch);
             new_fetch.* = .{
                 .arena = std.heap.ArenaAllocator.init(gpa),
                 .location = location,
diff --git a/src/Sema.zig b/src/Sema.zig
index d66c1ef26367..16efca216cec 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -3007,6 +3007,10 @@ fn zirStructDecl(
                 // claim a fresh wip ourselves.
                 .cancelled => continue :gop,
                 .finished => {},
+                .would_block => {
+                    Zcu.tls_retry_loop = sema.owner;
+                    return error.AnalysisFail;
+                },
             }
             const new_ty = try pt.ensureTypeUpToDate(ty);
 
@@ -3255,6 +3259,10 @@ fn zirEnumDecl(
             switch (zcu.awaitNamespaceTypeFinished(ty)) {
                 .cancelled => continue :gop,
                 .finished => {},
+                .would_block => {
+                    Zcu.tls_retry_loop = sema.owner;
+                    return error.AnalysisFail;
+                },
             }
             const new_ty = try pt.ensureTypeUpToDate(ty);
 
@@ -3424,6 +3432,10 @@ fn zirUnionDecl(
             switch (zcu.awaitNamespaceTypeFinished(ty)) {
                 .cancelled => continue :gop,
                 .finished => {},
+                .would_block => {
+                    Zcu.tls_retry_loop = sema.owner;
+                    return error.AnalysisFail;
+                },
             }
             const new_ty = try pt.ensureTypeUpToDate(ty);
 
@@ -3527,6 +3539,10 @@ fn zirOpaqueDecl(
             switch (zcu.awaitNamespaceTypeFinished(ty)) {
                 .cancelled => continue :gop,
                 .finished => {},
+                .would_block => {
+                    Zcu.tls_retry_loop = sema.owner;
+                    return error.AnalysisFail;
+                },
             }
             // Make sure we update the namespace if the declaration is re-analyzed, to pick
             // up on e.g. changed comptime decls.
@@ -13895,9 +13911,9 @@ fn zirEmbedFile(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A
     };
     try sema.declareDependency(.{ .embed_file = ef_idx });
 
-    zcu.semaLock();
+    zcu.embed_mutex.lock();
     const result = ef_idx.get(zcu).*;
-    zcu.semaUnlock();
+    zcu.embed_mutex.unlock();
     if (result.val == .none) {
         return sema.fail(block, operand_src, "unable to open '{s}': {s}", .{ name, @errorName(result.err.?) });
     }
@@ -19915,6 +19931,10 @@ fn structInitAnon(
         .existing => |ty| switch (zcu.awaitNamespaceTypeFinished(ty)) {
             .cancelled => continue :gop,
             .finished => break :gop ty,
+            .would_block => {
+                Zcu.tls_retry_loop = sema.owner;
+                return error.AnalysisFail;
+            },
         },
     };
     try sema.declareDependency(.{ .interned = struct_ty });
@@ -20913,6 +20933,10 @@ fn zirReify(
                     switch (zcu.awaitNamespaceTypeFinished(ty)) {
                         .cancelled => continue :gop,
                         .finished => {},
+                        .would_block => {
+                            Zcu.tls_retry_loop = sema.owner;
+                            return error.AnalysisFail;
+                        },
                     }
                     try sema.addTypeReferenceEntry(src, ty);
                     return Air.internedToRef(ty);
@@ -21132,6 +21156,10 @@ fn reifyEnum(
             switch (zcu.awaitNamespaceTypeFinished(ty)) {
                 .cancelled => continue :gop,
                 .finished => {},
+                .would_block => {
+                    Zcu.tls_retry_loop = sema.owner;
+                    return error.AnalysisFail;
+                },
             }
             try sema.declareDependency(.{ .interned = ty });
             try sema.addTypeReferenceEntry(src, ty);
@@ -21304,6 +21332,10 @@ fn reifyUnion(
             switch (zcu.awaitNamespaceTypeFinished(ty)) {
                 .cancelled => continue :gop,
                 .finished => {},
+                .would_block => {
+                    Zcu.tls_retry_loop = sema.owner;
+                    return error.AnalysisFail;
+                },
             }
             try sema.declareDependency(.{ .interned = ty });
             try sema.addTypeReferenceEntry(src, ty);
@@ -21661,6 +21693,10 @@ fn reifyStruct(
             switch (zcu.awaitNamespaceTypeFinished(ty)) {
                 .cancelled => continue :gop,
                 .finished => {},
+                .would_block => {
+                    Zcu.tls_retry_loop = sema.owner;
+                    return error.AnalysisFail;
+                },
             }
             try sema.declareDependency(.{ .interned = ty });
             try sema.addTypeReferenceEntry(src, ty);
@@ -35286,28 +35322,12 @@ fn resolveInferredErrorSet(
         // corresponds to the function, not one created to track an inline/comptime call.
         const orig_func_index = ip.unwrapCoercedFunc(func_index);
         const ies_unit: AnalUnit = .wrap(.{ .func = orig_func_index });
-        if (zcu.parallel_sema and zcu.isClaimedByOther(ies_unit)) {
-            // The IES owner's body is being analysed by another worker right
-            // now. `ensureFuncBodyUpToDate` would park this thread on
-            // `sema_claim_cond` until that worker finishes — under chained
-            // IES dependencies that idles a core for the bulk of Sema.
-            // Instead, yield: re-queue `sema.owner` and let this thread
-            // pull the next job. Cap retries low because each one re-runs
-            // the caller's body from scratch; once the cap is hit, fall
-            // through to the blocking wait.
-            zcu.sema_retry_mutex.lock();
-            const tries: u8 = blk: {
-                const gop = zcu.sema_retry_counts.getOrPut(zcu.gpa, ies_unit) catch break :blk 255;
-                if (!gop.found_existing) gop.value_ptr.* = 0;
-                gop.value_ptr.* +|= 1;
-                break :blk gop.value_ptr.*;
-            };
-            zcu.sema_retry_mutex.unlock();
-            if (tries < 8) {
-                Zcu.tls_retry_loop = sema.owner;
-                return error.AnalysisFail;
-            }
-        }
+        // The nested `ensureFuncBodyUpToDate` blocks via `claimOrWait` if
+        // another worker is already analysing this body. With `sema_lock`
+        // eliminated under parallel non-incremental and `releaseClaim` only
+        // broadcasting when its shard has parked waiters, the wait is a single
+        // futex; yielding here instead re-runs the *caller's* entire body on
+        // requeue, which dominated CPU at high core counts.
         try sema.addReferenceEntry(block, src, ies_unit);
         try pt.ensureFuncBodyUpToDate(orig_func_index);
     }
diff --git a/src/Type.zig b/src/Type.zig
index 04b2d0bb7100..bfb403598091 100644
--- a/src/Type.zig
+++ b/src/Type.zig
@@ -2837,10 +2837,14 @@ pub fn comptimeOnlyInner(
 
                 return switch (strat) {
                     .normal => switch (struct_type.requiresComptime(ip)) {
-                        .wip => unreachable,
+                        // Under parallel Sema another worker may be in the
+                        // `.sema` arm right now (`setRequiresComptimeWip`
+                        // → `.wip`, or the field-types-wip fallback →
+                        // `.unknown`). The documented contract permits a
+                        // false negative; only unreachable in serial.
+                        .wip, .unknown => if (zcu.parallel_sema) false else unreachable,
                         .no => false,
                         .yes => true,
-                        .unknown => unreachable,
                     },
                     .sema => switch (struct_type.setRequiresComptimeWip(ip)) {
                         .no, .wip => false,
@@ -2904,10 +2908,9 @@ pub fn comptimeOnlyInner(
                 const union_type = ip.loadUnionType(ty.toIntern());
                 return switch (strat) {
                     .normal => switch (union_type.requiresComptime(ip)) {
-                        .wip => unreachable,
+                        .wip, .unknown => if (zcu.parallel_sema) false else unreachable,
                         .no => false,
                         .yes => true,
-                        .unknown => unreachable,
                     },
                     .sema => switch (union_type.setRequiresComptimeWip(ip)) {
                         .no, .wip => return false,
@@ -3039,7 +3042,7 @@ pub fn getNamespace(ty: Type, zcu: *Zcu) InternPool.OptionalNamespaceIndex {
     // Callers reach here only with indices that have already passed the
     // `.existing` retry loop in Sema (or are owned by this thread via
     // `tls_wip_types`), so `.cancelled` is not expected.
-    _ = zcu.awaitNamespaceTypeFinished(ty.toIntern());
+    _ = zcu.awaitNamespaceTypeFinishedSpin(ty.toIntern());
     return switch (ip.indexToKey(ty.toIntern())) {
         .opaque_type => ip.loadOpaqueType(ty.toIntern()).namespace.toOptional(),
         .struct_type => ip.loadStructType(ty.toIntern()).namespace.toOptional(),
@@ -3173,7 +3176,7 @@ pub fn enumFieldIndex(ty: Type, field_name: InternPool.NullTerminatedString, zcu
     const ip = &zcu.intern_pool;
     // The `.existing` dedup may return an enum whose `WipEnumType` owner is
     // still populating names; spin until prepare() so the lookup sees them.
-    _ = Zcu.awaitNamespaceTypeFinishedConst(zcu, ty.toIntern());
+    _ = Zcu.awaitNamespaceTypeFinishedSpin(zcu, ty.toIntern());
     const enum_type = ip.loadEnumType(ty.toIntern());
     return enum_type.nameIndex(ip, field_name);
 }
@@ -3183,7 +3186,7 @@ pub fn enumFieldIndex(ty: Type, field_name: InternPool.NullTerminatedString, zcu
 /// declaration order, or `null` if `enum_tag` does not match any field.
 pub fn enumTagFieldIndex(ty: Type, enum_tag: Value, zcu: *const Zcu) ?u32 {
     const ip = &zcu.intern_pool;
-    _ = Zcu.awaitNamespaceTypeFinishedConst(zcu, ty.toIntern());
+    _ = Zcu.awaitNamespaceTypeFinishedSpin(zcu, ty.toIntern());
     const enum_type = ip.loadEnumType(ty.toIntern());
     const int_tag = switch (ip.indexToKey(enum_tag.toIntern())) {
         .int => enum_tag.toIntern(),
@@ -3915,7 +3918,13 @@ fn resolveStructInner(
     const zcu = pt.zcu;
     const gpa = zcu.gpa;
 
-    _ = zcu.awaitNamespaceTypeFinished(ty.toIntern());
+    switch (zcu.awaitNamespaceTypeFinished(ty.toIntern())) {
+        .finished, .cancelled => {},
+        .would_block => {
+            Zcu.tls_retry_loop = .wrap(.{ .type = ty.toIntern() });
+            return error.AnalysisFail;
+        },
+    }
 
     const ip = &zcu.intern_pool;
     const struct_obj = zcu.typeToStruct(ty).?;
@@ -3952,8 +3961,9 @@ fn resolveStructInner(
     }
     defer if (owns_claim) zcu.releaseClaim(owner);
 
-    zcu.semaLock();
-    defer zcu.semaUnlock();
+    const need_sema_lock = !zcu.parallel_sema or zcu.comp.incremental;
+    if (need_sema_lock) zcu.semaLock();
+    defer if (need_sema_lock) zcu.semaUnlock();
     if (zcu.anyAnalysisFailed(owner)) return error.AnalysisFail;
     if (zcu.comp.debugIncremental()) {
         const info = try zcu.incremental_debug_state.getUnitInfo(gpa, owner);
@@ -4010,7 +4020,13 @@ fn resolveUnionInner(
     const zcu = pt.zcu;
     const gpa = zcu.gpa;
 
-    _ = zcu.awaitNamespaceTypeFinished(ty.toIntern());
+    switch (zcu.awaitNamespaceTypeFinished(ty.toIntern())) {
+        .finished, .cancelled => {},
+        .would_block => {
+            Zcu.tls_retry_loop = .wrap(.{ .type = ty.toIntern() });
+            return error.AnalysisFail;
+        },
+    }
 
     const ip = &zcu.intern_pool;
     const union_obj = zcu.typeToUnion(ty).?;
@@ -4039,8 +4055,9 @@ fn resolveUnionInner(
     }
     defer if (owns_claim) zcu.releaseClaim(owner);
 
-    zcu.semaLock();
-    defer zcu.semaUnlock();
+    const need_sema_lock = !zcu.parallel_sema or zcu.comp.incremental;
+    if (need_sema_lock) zcu.semaLock();
+    defer if (need_sema_lock) zcu.semaUnlock();
     if (zcu.anyAnalysisFailed(owner)) return error.AnalysisFail;
     if (zcu.comp.debugIncremental()) {
         const info = try zcu.incremental_debug_state.getUnitInfo(gpa, owner);
diff --git a/src/Zcu.zig b/src/Zcu.zig
index e2c2ae53ac60..764f9eae4717 100644
--- a/src/Zcu.zig
+++ b/src/Zcu.zig
@@ -79,26 +79,30 @@ codegen_prog_node: std.Progress.Node = .none,
 sema_lock: std.Thread.Mutex = .{},
 sema_lock_owner: std.atomic.Value(std.Thread.Id) = .init(no_sema_owner),
 sema_lock_depth: u32 = 0,
-/// Signalled whenever a claim in `unit_claims` is released.
-sema_claim_cond: std.Thread.Condition = .{},
-/// AnalUnits currently being analysed by some worker; value is the owning tid.
-/// Guarded by `sema_lock`. A worker that finds an entry here for a unit it
-/// needs waits on `sema_claim_cond` until the entry is removed.
-unit_claims: std.AutoHashMapUnmanaged(AnalUnit, std.Thread.Id) = .empty,
+/// Per-unit claim state, sharded by `claimShardIndex(unit)` so the
+/// `tryClaim`/`claimOrWait`/`releaseClaim` hot path (one call per
+/// analyzed body, ~180k for behavior tests, plus ~250k nested waits) only
+/// contends with units that hash to the same shard.
+unit_claim_shards: [unit_claim_shard_count]UnitClaimShard = @splat(.{}),
 /// Tracks which unit each thread is currently waiting on, for deadlock
-/// detection in `claimOrWait`. Guarded by `sema_lock`.
+/// detection in `claimOrWait`. Guarded by `claim_waits_mutex`; separate
+/// from the shards because the chain walk crosses shards.
 claim_waits: std.AutoHashMapUnmanaged(std.Thread.Id, AnalUnit) = .empty,
+claim_waits_mutex: std.Thread.Mutex = .{},
 /// Per-unit retry count for order-dependent dependency loops, to avoid
 /// livelock on a true source-level cycle. Guarded by `sema_lock`.
 sema_retry_counts: std.AutoHashMapUnmanaged(AnalUnit, u8) = .empty,
 sema_pending_jobs: std.atomic.Value(u32) = .init(0),
+/// Debug counters; printed under ZIG_PSEMA_STATS.
+psema_body_runs: std.atomic.Value(u64) = .init(0),
+psema_yields: std.atomic.Value(u64) = .init(0),
+psema_claim_waits: std.atomic.Value(u64) = .init(0),
+psema_dispatched: std.atomic.Value(u64) = .init(0),
+psema_skip_busy: std.atomic.Value(u64) = .init(0),
+psema_skip_done: std.atomic.Value(u64) = .init(0),
 /// Guards `inline_reference_frames` / `free_inline_reference_frames` so that
 /// the very hot `Inlining.refFrame` path does not contend on `sema_lock`.
 inline_ref_mutex: std.Thread.Mutex = .{},
-/// Guards `unit_claims` / `claim_waits` so `claimOrWait` does not contend on
-/// `sema_lock` (the entry-lock at ensureFuncBodyUpToDate was the hottest
-/// contention site at 12.4 s × 15 684 stalls).
-unit_claims_mutex: std.Thread.Mutex = .{},
 /// Guards `failed_analysis` + `transitive_failed_analysis`.
 failed_analysis_mutex: std.Thread.Mutex = .{},
 /// Guards `reference_table` / `all_references` / `free_references` and the
@@ -121,6 +125,13 @@ test_functions_mutex: std.Thread.Mutex = .{},
 cimport_errors_mutex: std.Thread.Mutex = .{},
 /// Guards `sema_retry_counts`.
 sema_retry_mutex: std.Thread.Mutex = .{},
+/// Guards `embed_table` and the `*EmbedFile` payloads it owns.
+/// `Sema.zirEmbedFile` holds it, instead of `sema_lock`, while copying an
+/// entry out by value. (Funcs that yielded waiting on a claimed unit are
+/// tracked per shard in `UnitClaimShard.deferred`, not here.)
+embed_mutex: std.Thread.Mutex = .{},
+/// Guards `global_assembly`.
+global_assembly_mutex: std.Thread.Mutex = .{},
 /// Guards `compile_logs` + `compile_log_lines` + `free_compile_log_lines`.
 compile_log_mutex: std.Thread.Mutex = .{},
 /// True while parallel Sema is enabled for this update.
@@ -872,6 +883,11 @@ pub const Namespace = struct {
     /// All `test` declarations in this namespace. We store these purely so that incremental
     /// compilation can re-use the existing `Nav`s when a namespace changes.
     test_decls: std.ArrayListUnmanaged(InternPool.Nav.Index) = .empty,
+    /// Guards `pub_decls`/`priv_decls`/`comptime_decls`/`test_decls`/`generation`
+    /// under `parallel_sema`. `scanNamespace` is the only writer; readers are
+    /// `Sema.lookupInNamespace` and namespace iterators. Per-namespace so two
+    /// independent type decls don't serialise on the global `sema_lock`.
+    decls_mutex: std.Thread.Mutex = .{},
 
     pub const Index = InternPool.NamespaceIndex;
     pub const OptionalIndex = InternPool.OptionalNamespaceIndex;
@@ -2859,7 +2875,11 @@ pub fn deinit(zcu: *Zcu) void {
         for (zcu.failed_codegen.values()) |value| value.destroy(gpa);
         for (zcu.failed_types.values()) |value| value.destroy(gpa);
         zcu.analysis_in_progress.deinit(gpa);
-        zcu.unit_claims.deinit(gpa);
+        for (&zcu.unit_claim_shards) |*s| {
+            s.map.deinit(gpa);
+            for (s.deferred.values()) |*v| v.deinit(gpa);
+            s.deferred.deinit(gpa);
+        }
         zcu.claim_waits.deinit(gpa);
         zcu.sema_retry_counts.deinit(gpa);
         zcu.failed_analysis.deinit(gpa);
@@ -3623,6 +3643,26 @@ pub const ImportResult = struct {
 
 pub const no_sema_owner: std.Thread.Id = std.math.maxInt(std.Thread.Id);
 
+pub const unit_claim_shard_count = 256;
+pub const UnitClaimShard = struct {
+    mutex: std.Thread.Mutex = .{},
+    /// Paired with `mutex`; signalled by `releaseClaim` for any unit in this
+    /// shard so waiters don't share a single global condvar.
+    cond: std.Thread.Condition = .{},
+    /// Units in this shard currently being analysed; value is the owning tid.
+    map: std.AutoHashMapUnmanaged(AnalUnit, std.Thread.Id) = .empty,
+    /// Funcs whose analysis yielded waiting on a unit in this shard.
+    deferred: std.AutoArrayHashMapUnmanaged(AnalUnit, std.ArrayListUnmanaged(InternPool.Index)) = .empty,
+    /// Count of threads currently parked on `cond`.
+    waiters: u32 = 0,
+};
+pub fn claimShardIndex(unit: AnalUnit) u8 {
+    return @truncate(std.hash.int(@as(u64, @bitCast(unit))));
+}
+pub fn claimShard(zcu: *Zcu, unit: AnalUnit) *UnitClaimShard {
+    return &zcu.unit_claim_shards[claimShardIndex(unit)];
+}
+
 /// Recursive acquire of `sema_lock` if parallel Sema is active. No-op otherwise.
 pub fn semaLock(zcu: *Zcu) void {
     if (!zcu.parallel_sema) return;
@@ -3682,6 +3722,20 @@ pub fn awaitNamespaceTypeFinished(zcu: *Zcu, ty: InternPool.Index) InternPool.Na
     return awaitNamespaceTypeFinishedConst(zcu, ty);
 }
 pub fn awaitNamespaceTypeFinishedConst(zcu: *const Zcu, ty: InternPool.Index) InternPool.NamespaceTypeAwaitResult {
+    if (!zcu.parallel_sema) return .finished;
+    if (tls_wip_types.contains(ty)) return .finished;
+    // Another thread holds the wip. Spinning here while holding a
+    // `unit_claim` deadlocks if the wip owner's `resolveDeclaredEnum` (or
+    // similar) needs that claim. Check once and return `.would_block` so the
+    // caller yields-and-requeues; the requeue's queue depth provides natural
+    // backoff. Callers that cannot propagate `.would_block` (Type.getNamespace
+    // — reached only with already-finished types per its callers' contract)
+    // use `awaitNamespaceTypeFinishedSpin` instead.
+    return zcu.intern_pool.awaitNamespaceTypeFinishedBounded(ty, 1);
+}
+/// Unbounded spin for callers that cannot propagate `.would_block` (no
+/// `sema.owner` in scope to requeue). Prefer `awaitNamespaceTypeFinished`.
+pub fn awaitNamespaceTypeFinishedSpin(zcu: *const Zcu, ty: InternPool.Index) InternPool.NamespaceTypeAwaitResult {
     if (!zcu.parallel_sema) return .finished;
     if (tls_wip_types.contains(ty)) return .finished;
     return zcu.intern_pool.awaitNamespaceTypeFinished(ty);
@@ -3690,68 +3744,143 @@ pub fn awaitNamespaceTypeFinishedConst(zcu: *const Zcu, ty: InternPool.Index) In
 /// Try to claim `unit` for analysis on behalf of `tid`. Returns:
 ///  - `.claimed` if the caller now owns analysis of this unit and must call
 ///    `releaseClaim` when done.
-///  - `.recursed` if this thread already owns it (dependency-loop detection
-///    handled by caller as before via `analysis_in_progress`).
+///  - `.recursed` if this thread already owns it, or a cross-thread wait
+///    chain leads back to a unit this thread holds.
 ///  - `.done` if another thread finished analysing it while we waited; caller
 ///    should re-read the unit's resolved status and return.
-/// Uses its own `unit_claims_mutex`; may temporarily release any held
-/// `sema_lock` while waiting on the per-unit condvar.
+/// Locks only `unit`'s shard for the hot path; `claim_waits_mutex` is taken
+/// only when actually parking (rare relative to total calls).
 pub fn claimOrWait(zcu: *Zcu, unit: AnalUnit) Allocator.Error!enum { claimed, recursed, done } {
     if (!zcu.parallel_sema) return .claimed;
     const me = std.Thread.getCurrentId();
-    zcu.unit_claims_mutex.lock();
-    defer zcu.unit_claims_mutex.unlock();
+    const shard = zcu.claimShard(unit);
+    shard.mutex.lock();
+    defer shard.mutex.unlock();
     while (true) {
-        const gop = try zcu.unit_claims.getOrPut(zcu.gpa, unit);
+        const gop = try shard.map.getOrPut(zcu.gpa, unit);
         if (!gop.found_existing) {
             gop.value_ptr.* = me;
             return .claimed;
         }
         if (gop.value_ptr.* == me) return .recursed;
-        var chain_unit = unit;
-        var hops: u32 = 0;
-        while (hops < 64) : (hops += 1) {
-            const holder = zcu.unit_claims.get(chain_unit) orelse break;
-            if (holder == me) return .recursed;
-            chain_unit = zcu.claim_waits.get(holder) orelse break;
-        }
-        // Another thread holds the claim; record our wait, fully release any
-        // held sema_lock, then sleep on the dedicated claims condvar.
-        try zcu.claim_waits.put(zcu.gpa, me, unit);
-        zcu.unit_claims_mutex.unlock();
-        const d = zcu.semaRelease();
-        zcu.unit_claims_mutex.lock();
-        // Lost-wakeup guard: holder may have released between our two locks.
-        if (zcu.unit_claims.contains(unit))
-            zcu.sema_claim_cond.wait(&zcu.unit_claims_mutex);
+        // Cycle detection: walk holder → its waited-on unit → that unit's
+        // holder, until we reach ourselves (cycle) or a thread that isn't
+        // waiting. Lock order is shard(unit), then claim_waits, then a
+        // transient `tryLock` on shard(chain_unit) that is dropped before
+        // the next hop, so the walk never blocks on a second shard.
+        if (zcu.detectClaimCycle(shard, gop.value_ptr.*, me)) return .recursed;
+        _ = zcu.psema_claim_waits.rmw(.Add, 1, .monotonic);
+        zcu.claim_waits_mutex.lock();
+        // Best-effort on OOM: peers' walks won't see us, but ours still runs.
+        zcu.claim_waits.put(zcu.gpa, me, unit) catch {};
+        zcu.claim_waits_mutex.unlock();
+        // `sema_lock` is never held here: parallel non-incremental Sema
+        // never takes it, and `parallel_sema` is gated off under incremental.
+        std.debug.assert(zcu.sema_lock_owner.load(.acquire) != me);
+        // Bounded park: the walk above can miss a real cycle (peer not yet
+        // in `claim_waits`, or its shard was contended) and no release ever
+        // wakes a true cycle, so time out (expected) and redo the walk.
+        shard.waiters += 1;
+        shard.cond.timedWait(&shard.mutex, 10 * std.time.ns_per_ms) catch {};
+        shard.waiters -= 1;
+        zcu.claim_waits_mutex.lock();
         _ = zcu.claim_waits.remove(me);
-        zcu.unit_claims_mutex.unlock();
-        zcu.semaReacquire(d);
-        zcu.unit_claims_mutex.lock();
-        // After wake, check whether the unit is now resolved; if the claim is
-        // gone, another thread finished it.
-        if (!zcu.unit_claims.contains(unit)) return .done;
+        zcu.claim_waits_mutex.unlock();
+        if (!shard.map.contains(unit)) return .done;
+    }
+}
+
+/// Walk up to `unit_claim_shard_count` hops of the wait chain from `first_holder`; true iff it reaches `me`.
+fn detectClaimCycle(zcu: *Zcu, held_shard: *UnitClaimShard, first_holder: std.Thread.Id, me: std.Thread.Id) bool {
+    zcu.claim_waits_mutex.lock();
+    defer zcu.claim_waits_mutex.unlock();
+    var holder = first_holder;
+    var hops: u32 = 0;
+    while (hops < unit_claim_shard_count) : (hops += 1) {
+        const next_unit = zcu.claim_waits.get(holder) orelse return false;
+        const next_shard = zcu.claimShard(next_unit);
+        if (next_shard == held_shard) {
+            // Our shard (caller already holds its mutex): read directly.
+            holder = next_shard.map.get(next_unit) orelse return false;
+        } else if (next_shard.mutex.tryLock()) {
+            defer next_shard.mutex.unlock();
+            holder = next_shard.map.get(next_unit) orelse return false;
+        } else {
+            // A different shard is contended. Reading its map without the
+            // lock could fault mid-rehash, so conservatively report no-cycle.
+            // NOTE(review): the caller then parks, and the walk only reruns
+            // when that thread next wakes — confirm a wake is guaranteed.
+            return false;
+        }
+        if (holder == me) return true;
     }
+    return false;
 }
 
 pub fn releaseClaim(zcu: *Zcu, unit: AnalUnit) void {
     if (!zcu.parallel_sema) return;
-    zcu.unit_claims_mutex.lock();
-    _ = zcu.unit_claims.remove(unit);
-    zcu.unit_claims_mutex.unlock();
-    zcu.sema_claim_cond.broadcast();
+    const shard = zcu.claimShard(unit);
+    shard.mutex.lock();
+    _ = shard.map.remove(unit);
+    const have_waiters = shard.waiters != 0;
+    var deferred_list: std.ArrayListUnmanaged(InternPool.Index) = if (shard.deferred.fetchSwapRemove(unit)) |kv| kv.value else .empty;
+    shard.mutex.unlock();
+    if (have_waiters) shard.cond.broadcast();
+    for (deferred_list.items) |func| {
+        zcu.comp.queueJob(.{ .analyze_func = func }) catch zcu.comp.setAllocFailure();
+    }
+    if (deferred_list.items.len != 0) {
+        _ = zcu.sema_pending_jobs.rmw(.Sub, @intCast(deferred_list.items.len), .release);
+    }
+    deferred_list.deinit(zcu.gpa);
+}
+
+/// Record that `waiter_func` yielded because `dep_unit` is held by another
+/// thread, so that `releaseClaim(dep_unit)` re-queues it. Returns `true` if
+/// deferred; `false` if parallel Sema is off or `dep_unit` is not claimed once
+/// the shard mutex is held, in which case the caller should re-queue at once.
+pub fn deferOn(zcu: *Zcu, dep_unit: AnalUnit, waiter_func: InternPool.Index) Allocator.Error!bool {
+    if (!zcu.parallel_sema) return false;
+    const shard = zcu.claimShard(dep_unit);
+    shard.mutex.lock();
+    defer shard.mutex.unlock();
+    if (!shard.map.contains(dep_unit)) return false;
+    const gop = try shard.deferred.getOrPut(zcu.gpa, dep_unit);
+    if (!gop.found_existing) gop.value_ptr.* = .empty;
+    try gop.value_ptr.append(zcu.gpa, waiter_func);
+    return true;
+}
+
+/// `claimOrWait` without the wait: takes the claim on `unit` if it is free,
+/// reports `.recursed` if this thread already owns it, and otherwise returns
+/// `.busy` straight away rather than parking on the shard's `cond`. Meant for
+/// top-level dispatch (workerAnalyzeFunc), where a worker handed a unit that
+/// is already being analysed should go back to the pool. Nested callers that
+/// need the result use `claimOrWait`, whose chain walk catches cycles.
+pub fn tryClaim(zcu: *Zcu, unit: AnalUnit) Allocator.Error!enum { claimed, recursed, busy } {
+    if (!zcu.parallel_sema) return .claimed;
+    const self_tid = std.Thread.getCurrentId();
+    const shard = zcu.claimShard(unit);
+    shard.mutex.lock();
+    defer shard.mutex.unlock();
+    const entry = try shard.map.getOrPut(zcu.gpa, unit);
+    if (entry.found_existing) {
+        const holder = entry.value_ptr.*;
+        return if (holder == self_tid) .recursed else .busy;
+    }
+    entry.value_ptr.* = self_tid;
+    return .claimed;
+}
 
 /// Returns true if `unit` is currently claimed for analysis by a thread other
-/// than the caller. Used by `Sema.resolveInferredErrorSet` to yield-and-requeue
-/// instead of parking on `sema_claim_cond` when a dependency IES is already in
-/// progress on another worker.
+/// than the caller.
 pub fn isClaimedByOther(zcu: *Zcu, unit: AnalUnit) bool {
     if (!zcu.parallel_sema) return false;
     const me = std.Thread.getCurrentId();
-    zcu.unit_claims_mutex.lock();
-    defer zcu.unit_claims_mutex.unlock();
-    const owner = zcu.unit_claims.get(unit) orelse return false;
+    const shard = zcu.claimShard(unit);
+    shard.mutex.lock();
+    defer shard.mutex.unlock();
+    const owner = shard.map.get(unit) orelse return false;
     return owner != me;
 }
 
@@ -3762,11 +3891,23 @@ threadlocal var tls_aip: std.AutoArrayHashMapUnmanaged(AnalUnit, void) = .empty;
 /// resolvable by another thread). Consumed by the outer `ensure*UpToDate`
 /// to release-and-requeue instead of marking the unit failed.
 pub threadlocal var tls_retry_loop: ?AnalUnit = null;
+/// When `tls_retry_loop` is set because a dependency is claimed by another
+/// thread, this names that dependency so the requeue can be deferred until it
+/// completes (via `deferOn`/`releaseClaim`) instead of re-spawning immediately
+/// and re-running the caller's body just to yield again at the same point.
+pub threadlocal var tls_retry_dep: ?AnalUnit = null;
 
 pub fn semaAipContains(zcu: *Zcu, unit: AnalUnit) bool {
     if (!zcu.parallel_sema) return zcu.analysis_in_progress.contains(unit);
     return tls_aip.contains(unit);
 }
+/// True at top-level dispatch (workerAnalyzeFunc) before any nested ensure*
+/// has pushed onto `tls_aip`. Used to choose `tryClaim` (skip-on-busy) vs the
+/// blocking `claimOrWait` path in `ensureFuncBodyUpToDate`.
+pub fn semaAipEmpty(zcu: *Zcu) bool {
+    if (!zcu.parallel_sema) return zcu.analysis_in_progress.count() == 0;
+    return tls_aip.count() == 0;
+}
 
 pub fn dumpTlsAip(zcu: *Zcu) void {
     std.debug.print("tls_aip ({d} entries):\n", .{tls_aip.count()});
@@ -4104,8 +4245,8 @@ pub fn failedAnalysisGetOrPut(zcu: *Zcu, unit: AnalUnit, msg: *ErrorMsg) Allocat
 
 pub fn addGlobalAssembly(zcu: *Zcu, unit: AnalUnit, source: []const u8) !void {
     const gpa = zcu.gpa;
-    zcu.semaLock();
-    defer zcu.semaUnlock();
+    zcu.global_assembly_mutex.lock();
+    defer zcu.global_assembly_mutex.unlock();
     const gop = try zcu.global_assembly.getOrPut(gpa, unit);
     if (gop.found_existing) {
         const new_value = try std.fmt.allocPrint(gpa, "{s}\n{s}", .{ gop.value_ptr.*, source });
@@ -4777,6 +4918,12 @@ fn formatDependee(data: FormatDependee, writer: *std.io.Writer) std.io.Writer.Er
 /// Given the `InternPool.Index` of a function, set its resolved IES to `.none` if it
 /// may be outdated. `Sema` should do this before ever loading a resolved IES.
 pub fn maybeUnresolveIes(zcu: *Zcu, func_index: InternPool.Index) !void {
+    // `outdated`/`potentially_outdated` are incremental-only state. Under
+    // parallel non-incremental Sema they are conceptually empty, but
+    // `scanDecl` still touches `outdated`/`outdated_ready` for comptime
+    // units under `outdated_mutex`; an unlocked `contains()` here can read
+    // mid-rehash. Short-circuit before the unlocked read.
+    if (zcu.parallel_sema and !zcu.comp.incremental) return;
     const unit = AnalUnit.wrap(.{ .func = func_index });
     if (zcu.outdated.contains(unit) or zcu.potentially_outdated.contains(unit)) {
         // We're consulting the resolved IES now, but the function is outdated, so its
diff --git a/src/Zcu/PerThread.zig b/src/Zcu/PerThread.zig
index 5c1ad65091fd..98ce9f8585cc 100644
--- a/src/Zcu/PerThread.zig
+++ b/src/Zcu/PerThread.zig
@@ -611,8 +611,12 @@ pub fn ensureFileAnalyzed(pt: Zcu.PerThread, file_index: Zcu.File.Index) Zcu.Sem
             else => |e| return e,
         }
     }
-    pt.zcu.semaLock();
-    defer pt.zcu.semaUnlock();
+    // File-root creation is once-per-file; serialise on `comp.mutex` (same
+    // lock `discoverImport` uses for file registration) so two threads
+    // importing the same module don't both run `semaFile` and hit the
+    // `.existing => unreachable` in `createFileRootStruct`.
+    pt.zcu.comp.mutex.lock();
+    defer pt.zcu.comp.mutex.unlock();
     if (pt.zcu.fileRootType(file_index) != .none) return;
     return pt.semaFile(file_index);
 }
@@ -1762,10 +1766,22 @@ pub fn ensureFuncBodyUpToDate(pt: Zcu.PerThread, func_index: InternPool.Index) Z
         return;
     }
 
-    // `claimOrWait` self-locks `unit_claims_mutex`; we only take the global
-    // `sema_lock` after the claim succeeds, so the (very hot) entry path no
-    // longer contends on `sema_lock`.
-    claim: while (true) switch (try zcu.claimOrWait(anal_unit)) {
+    // Top-level dispatch (workerAnalyzeFunc, no enclosing analysis on this
+    // thread): if another worker already holds this unit, return immediately.
+    // The work queue may dispatch the same func to several workers (re-queues
+    // from the retry path, plus discovery from multiple callers); only one
+    // analysis is needed. Parking N-1 workers on `sema_claim_cond` for the
+    // duration was the dominant idle cost at high core counts.
+    if (zcu.parallel_sema and zcu.semaAipEmpty()) {
+        switch (try zcu.tryClaim(anal_unit)) {
+            .claimed => {},
+            .recursed => return error.AnalysisFail,
+            .busy => {
+                _ = zcu.psema_skip_busy.rmw(.Add, 1, .monotonic);
+                return;
+            },
+        }
+    } else claim: while (true) switch (try zcu.claimOrWait(anal_unit)) {
         .claimed => break :claim,
         .recursed => return error.AnalysisFail,
         .done => {
@@ -2071,7 +2087,7 @@ fn semaFile(pt: Zcu.PerThread, file_index: Zcu.File.Index) Zcu.SemaError!void {
     errdefer zcu.intern_pool.remove(pt.tid, struct_ty);
 
     if (zcu.comp.time_report) |*tr| {
-        tr.stats.n_imported_files += 1;
+        _ = @atomicRmw(u32, &tr.stats.n_imported_files, .Add, 1, .monotonic);
     }
 }
 
@@ -2550,8 +2566,8 @@ pub fn embedFile(
 
     // `embed_table` and `EmbedFile` allocation are shared state accessed
     // from the carve-out under parallel Sema.
-    zcu.semaLock();
-    defer zcu.semaUnlock();
+    zcu.embed_mutex.lock();
+    defer zcu.embed_mutex.unlock();
 
     const opt_mod: ?*Module = m: {
         if (mem.eql(u8, import_string, "std")) break :m zcu.std_mod;
@@ -2752,8 +2768,8 @@ pub fn scanNamespace(
     const gpa = zcu.gpa;
     const namespace = zcu.namespacePtr(namespace_index);
 
-    zcu.semaLock();
-    defer zcu.semaUnlock();
+    namespace.decls_mutex.lock();
+    defer namespace.decls_mutex.unlock();
     // Another thread may have already scanned this namespace (e.g. via
     // ensureNamespaceUpToDate before the creator reached its own scan).
     if (zcu.parallel_sema and namespace.generation == zcu.generation) return;
@@ -2999,6 +3015,7 @@ fn analyzeFnBodyInner(pt: Zcu.PerThread, func_index: InternPool.Index) Zcu.SemaE
     defer tracy.end();
 
     const zcu = pt.zcu;
+    _ = zcu.psema_body_runs.rmw(.Add, 1, .monotonic);
     const gpa = zcu.gpa;
     const ip = &zcu.intern_pool;
 
@@ -3018,7 +3035,7 @@ fn analyzeFnBodyInner(pt: Zcu.PerThread, func_index: InternPool.Index) Zcu.SemaE
 
     if (zcu.comp.time_report) |*tr| {
         if (func.generic_owner != .none) {
-            tr.stats.n_generic_instances += 1;
+            _ = @atomicRmw(u32, &tr.stats.n_generic_instances, .Add, 1, .monotonic);
         }
     }
 
@@ -4441,8 +4458,13 @@ pub fn ensureNamespaceUpToDate(pt: Zcu.PerThread, namespace_index: Zcu.Namespace
 
     if (zcu.parallel_sema and @atomicLoad(u32, &namespace.generation, .acquire) == zcu.generation) return;
 
-    zcu.semaLock();
-    defer zcu.semaUnlock();
+    // Decl-map exclusion is provided by `scanNamespace` taking
+    // `namespace.decls_mutex` (and re-checking `generation` under it). The
+    // span from here to the `scanNamespace` call only reads InternPool/ZIR to
+    // compute `decls`, so two threads racing to here is wasted work but safe:
+    // the second `scanNamespace` early-returns on the locked `generation`
+    // check. Under incremental, `parallel_sema` is gated off so this path is
+    // single-threaded.
     if (namespace.generation == zcu.generation) return;
 
     const Container = enum { @"struct", @"union", @"enum", @"opaque" };
@@ -4457,7 +4479,7 @@ pub fn ensureNamespaceUpToDate(pt: Zcu.PerThread, namespace_index: Zcu.Namespace
     const key = switch (full_key) {
         .reified, .generated_tag => {
             // Namespace always empty, so up-to-date.
-            namespace.generation = zcu.generation;
+            @atomicStore(u32, &namespace.generation, zcu.generation, .release);
             return;
         },
         .declared => |d| d,
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
index 97d28bc5579c..ccae903823b6 100644
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@@ -1257,8 +1257,11 @@ pub const Object = struct {
         // var error_message: [*:0]const u8 = undefined;
 
         // Convert bin_path_list to NULL-terminated C array if provided
+        var null_term_buf: ?[]?[*:0]const u8 = null;
+        defer if (null_term_buf) |buf| comp.gpa.free(buf);
         const bin_filename_list: ?[*:null]const ?[*:0]const u8 = if (options.bin_path_list) |list| blk: {
             const null_term = try comp.gpa.alloc(?[*:0]const u8, list.len + 1);
+            null_term_buf = null_term;
             for (list, 0..) |path, i| {
                 null_term[i] = path;
             }
diff --git a/src/libs/libcxx.zig b/src/libs/libcxx.zig
index 33dbb5703d48..9c300acc35bc 100644
--- a/src/libs/libcxx.zig
+++ b/src/libs/libcxx.zig
@@ -522,18 +522,18 @@ pub fn addCxxArgs(
         abi_version,
     }));
     try cflags.append(try std.fmt.allocPrint(arena, "-D_LIBCPP_HAS_THREADS={d}", .{
-        @as(u1, if (comp.config.any_non_single_threaded) 1 else 0),
+        @intFromBool(comp.config.any_non_single_threaded),
     }));
     try cflags.append("-D_LIBCPP_HAS_MONOTONIC_CLOCK");
     try cflags.append("-D_LIBCPP_HAS_TERMINAL");
     try cflags.append(try std.fmt.allocPrint(arena, "-D_LIBCPP_HAS_MUSL_LIBC={d}", .{
-        @as(u1, if (target.abi.isMusl()) 1 else 0),
+        @intFromBool(target.abi.isMusl()),
     }));
     try cflags.append("-D_LIBCXXABI_DISABLE_VISIBILITY_ANNOTATIONS");
     try cflags.append("-D_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS");
     try cflags.append("-D_LIBCPP_HAS_VENDOR_AVAILABILITY_ANNOTATIONS=0");
     try cflags.append(try std.fmt.allocPrint(arena, "-D_LIBCPP_HAS_FILESYSTEM={d}", .{
-        @as(u1, if (target.os.tag == .wasi) 0 else 1),
+        @intFromBool(target.os.tag != .wasi),
     }));
     try cflags.append("-D_LIBCPP_HAS_RANDOM_DEVICE");
     try cflags.append("-D_LIBCPP_HAS_LOCALIZATION");
diff --git a/src/link/Elf.zig b/src/link/Elf.zig
index 44af8627277f..9adb86427657 100644
--- a/src/link/Elf.zig
+++ b/src/link/Elf.zig
@@ -771,33 +771,12 @@ fn flushInner(self: *Elf, arena: Allocator, tid: Zcu.PerThread.Id) !void {
         return;
     }
 
-    const zcu_obj_path: ?Path = if (self.base.zcu_object_basename) |raw| p: {
-        break :p try comp.resolveEmitPathFlush(arena, .temp, raw);
-    } else null;
-
     if (self.zigObjectPtr()) |zig_object| try zig_object.flush(self, tid);
 
-    // Parse LLVM-generated object file(s)
-    if (zcu_obj_path) |path| {
-        const partition_count = self.base.zcu_object_partition_count;
-        if (partition_count > 1) {
-            // Parallel codegen: parse all partition files
-            const base_path = path.sub_path;
-            const base_name = if (std.mem.endsWith(u8, base_path, ".o"))
-                base_path[0 .. base_path.len - 2]
-            else
-                base_path;
-
-            for (0..partition_count) |i| {
-                const partition_path: Path = .{
-                    .root_dir = path.root_dir,
-                    .sub_path = try std.fmt.allocPrint(arena, "{s}.{d}.o", .{ base_name, i }),
-                };
-                openParseObjectReportingFailure(self, partition_path);
-            }
-        } else {
-            openParseObjectReportingFailure(self, path);
-        }
+    // Parse LLVM-generated object file(s); helper expands to N partition paths
+    // when parallel codegen produced multiple shards.
+    for (try self.base.resolveZcuObjectPaths(arena)) |path| {
+        openParseObjectReportingFailure(self, path);
     }
 
     switch (comp.config.output_mode) {
diff --git a/src/link/MachO.zig b/src/link/MachO.zig
index 76c63a3c0920..906258ebc842 100644
--- a/src/link/MachO.zig
+++ b/src/link/MachO.zig
@@ -633,24 +633,9 @@ pub fn appendZcuObjectInputs(
     positionals: *std.array_list.Managed(link.Input),
     zcu_obj_path: ?Path,
 ) !void {
+    _ = zcu_obj_path;
     const diags = &self.base.comp.link_diags;
-    const path = zcu_obj_path orelse return;
-    const partition_count = self.base.zcu_object_partition_count;
-    if (partition_count > 1) {
-        const base_path = path.sub_path;
-        const base_name = if (std.mem.endsWith(u8, base_path, ".o"))
-            base_path[0 .. base_path.len - 2]
-        else
-            base_path;
-
-        for (0..partition_count) |i| {
-            const partition_path: Path = .{
-                .root_dir = path.root_dir,
-                .sub_path = try std.fmt.allocPrint(arena, "{s}.{d}.o", .{ base_name, i }),
-            };
-            try positionals.append(try link.openObjectInput(diags, partition_path));
-        }
-    } else {
+    for (try self.base.resolveZcuObjectPaths(arena)) |path| {
         try positionals.append(try link.openObjectInput(diags, path));
     }
 }
diff --git a/src/main.zig b/src/main.zig
index 3c6790420163..854b6517d92e 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -181,8 +181,11 @@ pub fn main() anyerror!void {
             break :gpa .{ std.heap.raw_c_allocator, false };
         }
         break :gpa switch (builtin.mode) {
-            .Debug, .ReleaseSafe => .{ debug_allocator.allocator(), true },
-            .ReleaseFast, .ReleaseSmall => .{ std.heap.smp_allocator, false },
+            .Debug => .{ debug_allocator.allocator(), true },
+            // ReleaseSafe keeps runtime safety checks but uses smp_allocator:
+            // debug_allocator's single mutex serialises every allocation
+            // across all threads, which defeats parallel Sema/codegen.
+            .ReleaseSafe, .ReleaseFast, .ReleaseSmall => .{ std.heap.smp_allocator, false },
         };
     };
     defer if (is_debug) {
diff --git a/src/target.zig b/src/target.zig
index 48a58522a2aa..54f654184d46 100644
--- a/src/target.zig
+++ b/src/target.zig
@@ -502,7 +502,7 @@ pub fn defaultUnwindTables(target: *const std.Target, libunwind: bool, libtsan:
     if (target.os.tag.isDarwin()) return .async;
     if (libunwind) return .async;
     if (libtsan) return .async;
-    if (libasan) return .@"async";
+    if (libasan) return .async;
     if (std.debug.Dwarf.abi.supportsUnwinding(target)) return .async;
     return .none;
 }
diff --git a/src/zig_llvm.cpp b/src/zig_llvm.cpp
index e439d20b7331..e7cef0b90d68 100644
--- a/src/zig_llvm.cpp
+++ b/src/zig_llvm.cpp
@@ -227,17 +227,6 @@ static SanitizerCoverageOptions getSanCovOptions(ZigLLVMCoverageOptions z) {
     o.CollectControlFlow = z.CollectControlFlow;
     return o;
 }
-static AddressSanitizerOptions getAsanOptions(void) {
-    AddressSanitizerOptions o;
-    o.CompileKernel = false;
-    o.Recover = false;
-    o.UseAfterScope = false;
-    o.UseAfterReturn = AsanDetectStackUseAfterReturnMode::Always;
-    o.InstrumentationWithCallsThreshold = 7000;
-    o.MaxInlinePoisoningSize = 64;
-    o.InsertVersionCheck = true;
-    return o;
-}
 
 // Builds and runs the full middle-end optimization pipeline on `llvm_module`.
 // Self-contained so it can be invoked once on the whole module (serial path) or

From bf19f0a7d413f0be338b614e828c49a154b1e02b Mon Sep 17 00:00:00 2001
From: root <root@ip-10-0-2-234.us-west-2.compute.internal>
Date: Sun, 19 Apr 2026 23:11:32 +0000
Subject: [PATCH 08/15] std.Build.Step.Compile: add getEmittedBinShards()

Exposes the per-shard object paths when llvm_no_merge_shards is set,
so a build.zig can install/consume them directly instead of waiting
for the single-threaded relocatable -r merge into one object.
---
 lib/std/Build/Step/Compile.zig | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/lib/std/Build/Step/Compile.zig b/lib/std/Build/Step/Compile.zig
index b9cc970264f4..8f1853024640 100644
--- a/lib/std/Build/Step/Compile.zig
+++ b/lib/std/Build/Step/Compile.zig
@@ -898,6 +898,32 @@ pub fn getEmittedBin(compile: *Compile) LazyPath {
     return compile.getEmittedFileGeneric(&compile.generated_bin);
 }
 
+/// Returns the per-shard object paths when `llvm_no_merge_shards` is set.
+/// Shard `i` lives at `{dir}/{stem}.{i}.o` where `dir` is the emitted-bin
+/// directory and `stem` is `out_filename` with a trailing `.o` stripped. The
+/// returned slice has `llvm_codegen_threads` entries, allocated from the
+/// build arena.
+///
+/// Intended use: `addObject` is configured with `llvm_codegen_threads > 1`
+/// and `llvm_no_merge_shards = true`; the consumer (an executable's link
+/// step, or `addInstallFile`) iterates this slice instead of calling
+/// `getEmittedBin()` (which points at a stub the compiler deletes).
+pub fn getEmittedBinShards(compile: *Compile) []std.Build.LazyPath {
+    assert(compile.llvm_no_merge_shards);
+    assert(compile.llvm_codegen_threads > 1);
+    const b = compile.step.owner;
+    const dir = compile.getEmittedBinDirectory();
+    const stem = if (std.mem.endsWith(u8, compile.out_filename, ".o"))
+        compile.out_filename[0 .. compile.out_filename.len - 2]
+    else
+        compile.out_filename;
+    const out = b.allocator.alloc(std.Build.LazyPath, compile.llvm_codegen_threads) catch @panic("OOM");
+    for (out, 0..) |*p, i| {
+        p.* = dir.path(b, b.fmt("{s}.{d}.o", .{ stem, i }));
+    }
+    return out;
+}
+
 /// Returns the path to the generated import library.
 /// This function can only be called for libraries.
 pub fn getEmittedImplib(compile: *Compile) LazyPath {

From 6666f326e5ea3258abe686209ec40c8050176a8c Mon Sep 17 00:00:00 2001
From: root <root@ip-10-0-2-234.us-west-2.compute.internal>
Date: Mon, 20 Apr 2026 00:03:22 +0000
Subject: [PATCH 09/15] build: link mimalloc into the compiler; -Dmimalloc-obj
 option
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

musl's malloc has a single global rwlock. With N parallel LLVM contexts
(--llvm-codegen-threads=N) every operator new from the bitcode reader
and pass pipeline serialises on it — 270M futex calls compiling bun
at cg=64, ~120s wall in emit alone.

Add -Dmimalloc-obj=PATH to build.zig and have the bun_build workflow
compile oven-sh/mimalloc (bun-dev3-v2) static.c with MI_MALLOC_OVERRIDE
for the target, then link the object into the final cross-compiled zig.
mimalloc's per-thread heaps reduce the futex count ~630x; bun's zig
step on Linux/64c goes ~132s → ~23s incremental.
---
 .github/workflows/bun_build.yaml | 24 ++++++++++++++++++++++++
 build.zig                        |  9 +++++++++
 2 files changed, 33 insertions(+)

diff --git a/.github/workflows/bun_build.yaml b/.github/workflows/bun_build.yaml
index 3b9085ae47df..1910acdc5ddc 100644
--- a/.github/workflows/bun_build.yaml
+++ b/.github/workflows/bun_build.yaml
@@ -41,6 +41,30 @@ jobs:
         if: matrix.safe == 'true'
       - run: sed -i 's/max_rss = 7_800_000_000/max_rss = 10_000_000_000/' zig/build.zig
         if: matrix.safe == 'true'
+      - name: Fetch mimalloc
+        uses: actions/checkout@v4
+        with:
+          repository: oven-sh/mimalloc
+          ref: bun-dev3-v2
+          path: mimalloc
+      - name: Splice mimalloc into bootstrap build
+        # musl's malloc has a single global rwlock; with N parallel LLVM
+        # contexts (--llvm-codegen-threads=N) every `operator new` serialises
+        # on it. mimalloc has per-thread heaps. Compile its unity-build
+        # static.c (with MI_MALLOC_OVERRIDE so it replaces malloc/free) for the
+        # target after the host zig exists, then link the object into the
+        # final cross-compiled zig. Inserted just before the final
+        # `$ZIG build` (the `cd "$ROOTDIR/zig"` line) so $ZIG is available.
+        run: |
+          sed -i '/^cd "\$ROOTDIR\/zig"$/i \
+          MI_MUSL=$(case "$TARGET" in *-linux-musl*) echo "-DMI_LIBC_MUSL=1";; esac)\
+          $ZIG cc -c "$ROOTDIR/mimalloc/src/static.c" \\\
+            -I "$ROOTDIR/mimalloc/include" \\\
+            -target $TARGET -mcpu=$MCPU -O2 -fno-builtin -DNDEBUG -Wno-date-time \\\
+            -DMI_MALLOC_OVERRIDE=1 $MI_MUSL -DMI_STATIC_LIB \\\
+            -o "$ROOTDIR/out/mimalloc-$TARGET-$MCPU.o"\
+          ' build
+          sed -i 's#-Dversion-string="$ZIG_VERSION"#-Dversion-string="$ZIG_VERSION" -Dmimalloc-obj="$ROOTDIR/out/mimalloc-$TARGET-$MCPU.o"#' build
       - run: cat build
       - name: Cache host toolchain
         uses: actions/cache@v4
diff --git a/build.zig b/build.zig
index 5054244a2559..c8ed8ad21a72 100644
--- a/build.zig
+++ b/build.zig
@@ -129,6 +129,14 @@ pub fn build(b: *std.Build) !void {
         "llvm-has-polly",
         "Whether LLVM was built with Polly and requires linking it",
     ) orelse false;
+    const mimalloc_obj = b.option(
+        []const u8,
+        "mimalloc-obj",
+        "Path to a mimalloc static.c object built with MI_OVERRIDE; linked " ++
+            "into the compiler so libc malloc (musl's single-lock allocator " ++
+            "in static builds) is replaced. LLVM emit at high codegen-thread " ++
+            "counts otherwise serialises on the malloc lock.",
+    );
     const enable_ios_sdk = b.option(bool, "enable-ios-sdk", "Run tests requiring presence of iOS SDK and frameworks") orelse false;
     const enable_macos_sdk = b.option(bool, "enable-macos-sdk", "Run tests requiring presence of macOS SDK and frameworks") orelse enable_ios_sdk;
     const enable_symlinks_windows = b.option(bool, "enable-symlinks-windows", "Run tests requiring presence of symlinks on Windows") orelse false;
@@ -208,6 +216,7 @@ pub fn build(b: *std.Build) !void {
     });
     exe.pie = pie;
     exe.entitlements = entitlements;
+    if (mimalloc_obj) |p| exe.addObjectFile(.{ .cwd_relative = p });
 
     const use_llvm = b.option(bool, "use-llvm", "Use the llvm backend");
     exe.use_llvm = use_llvm;

From 4f58f6e24a5b1c7b669e4fb25ce9eabe2274390a Mon Sep 17 00:00:00 2001
From: root <root@ip-10-0-2-234.us-west-2.compute.internal>
Date: Mon, 20 Apr 2026 01:21:53 +0000
Subject: [PATCH 10/15] ci: gate mimalloc splice to linux-musl targets only

---
 .github/workflows/bun_build.yaml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/bun_build.yaml b/.github/workflows/bun_build.yaml
index 1910acdc5ddc..45a0f5995425 100644
--- a/.github/workflows/bun_build.yaml
+++ b/.github/workflows/bun_build.yaml
@@ -47,7 +47,7 @@ jobs:
           repository: oven-sh/mimalloc
           ref: bun-dev3-v2
           path: mimalloc
-      - name: Splice mimalloc into bootstrap build
+      - name: Splice mimalloc into bootstrap build (linux-musl only)
         # musl's malloc has a single global rwlock; with N parallel LLVM
         # contexts (--llvm-codegen-threads=N) every `operator new` serialises
         # on it. mimalloc has per-thread heaps. Compile its unity-build
@@ -55,13 +55,14 @@ jobs:
         # target after the host zig exists, then link the object into the
         # final cross-compiled zig. Inserted just before the final
         # `$ZIG build` (the `cd "$ROOTDIR/zig"` line) so $ZIG is available.
+        # macOS/Windows mallocs are already per-thread; only musl needs this.
+        if: contains(matrix.target, 'linux-musl')
         run: |
           sed -i '/^cd "\$ROOTDIR\/zig"$/i \
-          MI_MUSL=$(case "$TARGET" in *-linux-musl*) echo "-DMI_LIBC_MUSL=1";; esac)\
           $ZIG cc -c "$ROOTDIR/mimalloc/src/static.c" \\\
             -I "$ROOTDIR/mimalloc/include" \\\
             -target $TARGET -mcpu=$MCPU -O2 -fno-builtin -DNDEBUG -Wno-date-time \\\
-            -DMI_MALLOC_OVERRIDE=1 $MI_MUSL -DMI_STATIC_LIB \\\
+            -DMI_MALLOC_OVERRIDE=1 -DMI_LIBC_MUSL=1 -DMI_STATIC_LIB \\\
             -o "$ROOTDIR/out/mimalloc-$TARGET-$MCPU.o"\
           ' build
           sed -i 's#-Dversion-string="$ZIG_VERSION"#-Dversion-string="$ZIG_VERSION" -Dmimalloc-obj="$ROOTDIR/out/mimalloc-$TARGET-$MCPU.o"#' build

From af6e006bec4494b5f716b1508e7113fc2210ed67 Mon Sep 17 00:00:00 2001
From: root <root@ip-10-0-2-234.us-west-2.compute.internal>
Date: Mon, 20 Apr 2026 07:09:53 +0000
Subject: [PATCH 11/15] psema: split claimOrWait .recursed/.cycle;
 resolveNavType seqlock; nav_map+OptBisect races; hoist FuncInstance prep out
 of 4-shard lock

claimOrWait now distinguishes same-thread reentry (.recursed -> dependency
loop diagnostic) from cross-thread cycle (.cycle -> yield-and-requeue);
the previous bare AnalysisFail silently marked both units as failed (via
markTransitiveFailed) with no error reported, and could surface as an
`unreachable` in processExportsInner.
resolveNavType brackets type/linksection writes with bits.writing so
getNav cannot tear (mirrors resolveNavValue). ensureExportFuncQueued
locks the shard mutex around nav_map.contains. ensureNavValAnalysisQueued
unlocks nav_queued_mutex before queueJob so it doesn't nest under
work_queue_mutex. getFuncInstanceIes precomputes the instance nav's
name/fqn/mods before lockShardsSorted so the 4-shard critical section
holds only createNav+owner_nav write. OptBisect is per-thread.
---
 src/Compilation.zig   |  8 +++-
 src/InternPool.zig    | 88 ++++++++++++++++++++++++++++++++++---------
 src/Type.zig          |  8 ++++
 src/Zcu.zig           | 29 +++++++++-----
 src/Zcu/PerThread.zig | 20 ++++++++++
 src/zig_llvm.cpp      |  7 +++-
 6 files changed, 131 insertions(+), 29 deletions(-)

diff --git a/src/Compilation.zig b/src/Compilation.zig
index 2987ec256f43..6f3d3727c59a 100644
--- a/src/Compilation.zig
+++ b/src/Compilation.zig
@@ -6168,9 +6168,15 @@ fn ensureExportFuncQueued(zcu: *Zcu, export_idx: Zcu.Export.Index) bool {
     };
     if (!ip.isFuncBody(v)) return false;
     const func = ip.unwrapCoercedFunc(v);
-    // Check the LLVM nav_map: if the body landed there, codegen ran.
+    // Check the LLVM nav_map: if the body landed there, codegen ran. The
+    // shard mutex must be held — `workerZcuCodegen` may still be running
+    // (link_task_wait_group is not waited on until performAllTheWork returns)
+    // and `Object.updateFunc`/`resolveGlobalNav` `getOrPut()` into nav_map
+    // under that mutex; an unlocked `contains()` here can read mid-rehash.
     if (zcu.llvm_object) |llvm| {
         const shard = zcu.navShard(nav, llvm.n);
+        llvm.mutexes[shard].lock();
+        defer llvm.mutexes[shard].unlock();
         if (llvm.objects[shard].nav_map.contains(nav)) return false;
     } else if (ip.funcAnalysisUnordered(func).is_analyzed) return false;
     // Clear is_analyzed so the fast-path doesn't no-op and re-analysis
diff --git a/src/InternPool.zig b/src/InternPool.zig
index 0853b1c7c703..e502347fc592 100644
--- a/src/InternPool.zig
+++ b/src/InternPool.zig
@@ -10096,6 +10096,12 @@ pub fn getFuncInstanceIes(
     const es_key: Key = .{ .inferred_error_set_type = func_index };
     const fty_key: Key = .{ .func_type = extraFuncType(tid, extra.list.*, func_type_extra_index) };
 
+    // Precompute the instance nav's name/fqn so the string interning (fmt +
+    // hash + alloc) is not serialised under the four shard mutexes below.
+    // `func_index`/`generic_owner` are known here; on the `.existing` path the
+    // interned strings are simply unused (string-pool entries are not freed).
+    const ffi_prep = try prepareFuncInstanceNav(ip, gpa, tid, generic_owner, func_index);
+
     // Four shard mutexes are held simultaneously below; lock in sorted order
     // so concurrent callers cannot ABBA-deadlock.
     var locked_shards: [4]u32 = undefined;
@@ -10120,15 +10126,7 @@ pub fn getFuncInstanceIes(
     var func_ty_gop = try ip.getOrPutKeyPrelocked(gpa, tid, fty_key, 0);
     defer func_ty_gop.deinit();
     func_ty_gop.putTentative(func_ty);
-    try finishFuncInstance(
-        ip,
-        gpa,
-        tid,
-        extra,
-        generic_owner,
-        func_index,
-        func_extra_index,
-    );
+    try commitFuncInstanceNav(ip, gpa, tid, extra, func_index, func_extra_index, ffi_prep);
 
     func_gop.putFinal(func_index);
     error_union_type_gop.putFinal(error_union_type);
@@ -10137,15 +10135,27 @@ pub fn getFuncInstanceIes(
     return func_index;
 }
 
-fn finishFuncInstance(
+const FuncInstanceNavPrep = struct {
+    name: NullTerminatedString,
+    fqn: NullTerminatedString,
+    is_const: bool,
+    alignment: Alignment,
+    @"linksection": OptionalNullTerminatedString,
+    @"addrspace": std.builtin.AddressSpace,
+};
+
+/// String interning + owner-nav modifier reads for `commitFuncInstanceNav`.
+/// Hoisted out of `getFuncInstanceIes` so the (fmt + hash + alloc) cost is
+/// not paid while holding up to four shard mutexes; at high core counts that
+/// serialised every generic instantiation behind whichever shards happened
+/// to collide.
+fn prepareFuncInstanceNav(
     ip: *InternPool,
     gpa: Allocator,
     tid: Zcu.PerThread.Id,
-    extra: Local.Extra.Mutable,
     generic_owner: Index,
     func_index: Index,
-    func_extra_index: u32,
-) Allocator.Error!void {
+) Allocator.Error!FuncInstanceNavPrep {
     const fn_owner_nav = ip.getNav(ip.funcDeclInfo(generic_owner).owner_nav);
     const fn_namespace = fn_owner_nav.analysis.?.namespace;
 
@@ -10165,22 +10175,57 @@ fn finishFuncInstance(
         // state; a genuine `.unresolved` cannot reach instantiation.
         .unresolved => unreachable,
     };
-    const nav_index = try ip.createNav(gpa, tid, .{
+    return .{
         .name = nav_name,
         .fqn = try ip.namespacePtr(fn_namespace).internFullyQualifiedName(ip, gpa, tid, nav_name),
-        .val = func_index,
         .is_const = owner_mods[0],
         .alignment = owner_mods[1],
         .@"linksection" = owner_mods[2],
         .@"addrspace" = owner_mods[3],
-    });
+    };
+}
 
+/// Create the instance Nav and publish it into the func's `owner_nav` slot.
+/// Must run while the func-key shard lock is held: `putTentative` already
+/// published `func_index`, so a concurrent reader that acquires the shard
+/// after we release would otherwise observe `owner_nav == undefined`.
+fn commitFuncInstanceNav(
+    ip: *InternPool,
+    gpa: Allocator,
+    tid: Zcu.PerThread.Id,
+    extra: Local.Extra.Mutable,
+    func_index: Index,
+    func_extra_index: u32,
+    prep: FuncInstanceNavPrep,
+) Allocator.Error!void {
+    const nav_index = try ip.createNav(gpa, tid, .{
+        .name = prep.name,
+        .fqn = prep.fqn,
+        .val = func_index,
+        .is_const = prep.is_const,
+        .alignment = prep.alignment,
+        .@"linksection" = prep.@"linksection",
+        .@"addrspace" = prep.@"addrspace",
+    });
     // Populate the owner_nav field which was left undefined until now.
     extra.view().items(.@"0")[
         func_extra_index + std.meta.fieldIndex(Tag.FuncInstance, "owner_nav").?
     ] = @intFromEnum(nav_index);
 }
 
+fn finishFuncInstance(
+    ip: *InternPool,
+    gpa: Allocator,
+    tid: Zcu.PerThread.Id,
+    extra: Local.Extra.Mutable,
+    generic_owner: Index,
+    func_index: Index,
+    func_extra_index: u32,
+) Allocator.Error!void {
+    const prep = try prepareFuncInstanceNav(ip, gpa, tid, generic_owner, func_index);
+    try commitFuncInstanceNav(ip, gpa, tid, extra, func_index, func_extra_index, prep);
+}
+
 pub const EnumTypeInit = struct {
     has_values: bool,
     tag_mode: LoadedEnumType.TagMode,
@@ -12033,10 +12078,19 @@ pub fn resolveNavType(
     assert(nav_analysis_namespace[unwrapped.index] != .none);
     assert(nav_analysis_zir_index[unwrapped.index] != .none);
 
+    // Seqlock-style write paired with the loop in `getNav`: invalidate `bits`
+    // before mutating `type_or_val` so a concurrent reader cannot pair the old
+    // status with the new payload (it sees b1 != b2 and retries). Mirrors
+    // `resolveNavValue` — without this prelude, an unresolved→type_resolved
+    // re-resolution under incremental could tear.
+    var bits = nav_bits[unwrapped.index];
+    bits.writing = true;
+    @atomicStore(Nav.Repr.Bits, &nav_bits[unwrapped.index], bits, .release);
+
     @atomicStore(InternPool.Index, &nav_types[unwrapped.index], resolved.type, .release);
     @atomicStore(OptionalNullTerminatedString, &nav_linksections[unwrapped.index], resolved.@"linksection", .release);
 
-    var bits = nav_bits[unwrapped.index];
+    bits.writing = false;
     bits.status = if (resolved.is_extern_decl) .type_resolved_extern_decl else .type_resolved;
     bits.is_const = resolved.is_const;
     bits.alignment = resolved.alignment;
diff --git a/src/Type.zig b/src/Type.zig
index bfb403598091..c474a44276bb 100644
--- a/src/Type.zig
+++ b/src/Type.zig
@@ -3953,6 +3953,10 @@ fn resolveStructInner(
                 break :claim;
             },
             .recursed => break :claim,
+            .cycle => {
+                Zcu.tls_retry_loop = owner;
+                return error.AnalysisFail;
+            },
             .done => {
                 if (zcu.anyAnalysisFailed(owner)) return error.AnalysisFail;
                 continue :claim;
@@ -4047,6 +4051,10 @@ fn resolveUnionInner(
                 break :claim;
             },
             .recursed => break :claim,
+            .cycle => {
+                Zcu.tls_retry_loop = owner;
+                return error.AnalysisFail;
+            },
             .done => {
                 if (zcu.anyAnalysisFailed(owner)) return error.AnalysisFail;
                 continue :claim;
diff --git a/src/Zcu.zig b/src/Zcu.zig
index 764f9eae4717..6bbc1af2f845 100644
--- a/src/Zcu.zig
+++ b/src/Zcu.zig
@@ -3597,12 +3597,16 @@ pub fn ensureNavValAnalysisQueued(zcu: *Zcu, nav_id: InternPool.Nav.Index) !void
 
     if (zcu.parallel_sema and !zcu.comp.incremental) {
         if (ip.getNav(nav_id).status == .fully_resolved) return;
-        zcu.nav_queued_mutex.lock();
-        defer zcu.nav_queued_mutex.unlock();
-        if (zcu.nav_val_analysis_queued.contains(nav_id)) return;
-        try zcu.nav_val_analysis_queued.ensureUnusedCapacity(zcu.gpa, 1);
-        try zcu.comp.queueJob(.{ .analyze_comptime_unit = .wrap(.{ .nav_val = nav_id }) });
-        zcu.nav_val_analysis_queued.putAssumeCapacityNoClobber(nav_id, {});
+        // Decide under `nav_queued_mutex`, then queue outside it so the dispatch
+        // loop's hot spin on `work_queue_mutex` (taken by `queueJob`) can't stall
+        // `nav_queued_mutex` waiters. NOTE(review): on `queueJob` failure the mark stays set.
+        const should_queue = sq: {
+            zcu.nav_queued_mutex.lock();
+            defer zcu.nav_queued_mutex.unlock();
+            const gop = try zcu.nav_val_analysis_queued.getOrPut(zcu.gpa, nav_id);
+            break :sq !gop.found_existing;
+        };
+        if (should_queue) try zcu.comp.queueJob(.{ .analyze_comptime_unit = .wrap(.{ .nav_val = nav_id }) });
         return;
     }
 
@@ -3744,13 +3748,18 @@ pub fn awaitNamespaceTypeFinishedSpin(zcu: *const Zcu, ty: InternPool.Index) Int
 /// Try to claim `unit` for analysis on behalf of `tid`. Returns:
 ///  - `.claimed` if the caller now owns analysis of this unit and must call
 ///    `releaseClaim` when done.
-///  - `.recursed` if this thread already owns it, or a cross-thread wait
-///    chain leads back to a unit this thread holds.
+///  - `.recursed` if this thread already owns it (same-thread reentry).
+///  - `.cycle` if a cross-thread wait chain leads back to a unit this thread
+///    holds. Caller must yield-and-requeue (set `tls_retry_loop`) so the
+///    cycle is broken on the next attempt by one thread nesting both claims
+///    and reaching the same-thread `.recursed` path, which produces the
+///    "dependency loop detected" diagnostic. Returning bare AnalysisFail
+///    here would silently `markTransitiveFailed` both units with no error.
 ///  - `.done` if another thread finished analysing it while we waited; caller
 ///    should re-read the unit's resolved status and return.
 /// Locks only `unit`'s shard for the hot path; `claim_waits_mutex` is taken
 /// only when actually parking (rare relative to total calls).
-pub fn claimOrWait(zcu: *Zcu, unit: AnalUnit) Allocator.Error!enum { claimed, recursed, done } {
+pub fn claimOrWait(zcu: *Zcu, unit: AnalUnit) Allocator.Error!enum { claimed, recursed, cycle, done } {
     if (!zcu.parallel_sema) return .claimed;
     const me = std.Thread.getCurrentId();
     const shard = zcu.claimShard(unit);
@@ -3769,7 +3778,7 @@ pub fn claimOrWait(zcu: *Zcu, unit: AnalUnit) Allocator.Error!enum { claimed, re
         // mutex; lock order is shard(unit) then claim_waits then transient
         // shard(chain_unit), and the transient lock is released before the
         // next hop, so no two shard mutexes are held simultaneously.
-        if (zcu.detectClaimCycle(shard, gop.value_ptr.*, me)) return .recursed;
+        if (zcu.detectClaimCycle(shard, gop.value_ptr.*, me)) return .cycle;
         _ = zcu.psema_claim_waits.rmw(.Add, 1, .monotonic);
         zcu.claim_waits_mutex.lock();
         zcu.claim_waits.put(zcu.gpa, me, unit) catch {};
diff --git a/src/Zcu/PerThread.zig b/src/Zcu/PerThread.zig
index 98ce9f8585cc..a6a0c5942647 100644
--- a/src/Zcu/PerThread.zig
+++ b/src/Zcu/PerThread.zig
@@ -651,6 +651,10 @@ pub fn ensureMemoizedStateUpToDate(pt: Zcu.PerThread, stage: InternPool.Memoized
     claim: while (true) switch (try zcu.claimOrWait(unit)) {
         .claimed => break :claim,
         .recursed => return error.AnalysisFail,
+        .cycle => {
+            Zcu.tls_retry_loop = unit;
+            return error.AnalysisFail;
+        },
         .done => {
             if (zcu.anyAnalysisFailed(unit)) return error.AnalysisFail;
             // The previous holder may have released its claim via a retry-abort
@@ -822,6 +826,10 @@ pub fn ensureComptimeUnitUpToDate(pt: Zcu.PerThread, cu_id: InternPool.ComptimeU
     switch (try zcu.claimOrWait(anal_unit)) {
         .claimed => {},
         .recursed => return error.AnalysisFail,
+        .cycle => {
+            Zcu.tls_retry_loop = anal_unit;
+            return error.AnalysisFail;
+        },
         .done => {
             if (zcu.anyAnalysisFailed(anal_unit)) return error.AnalysisFail;
             return;
@@ -1025,6 +1033,10 @@ pub fn ensureNavValUpToDate(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zcu
     claim: while (true) switch (try zcu.claimOrWait(anal_unit)) {
         .claimed => break :claim,
         .recursed => return error.AnalysisFail,
+        .cycle => {
+            Zcu.tls_retry_loop = anal_unit;
+            return error.AnalysisFail;
+        },
         .done => {
             if (zcu.anyAnalysisFailed(anal_unit)) return error.AnalysisFail;
             // The previous holder may have released its claim via a retry-abort
@@ -1503,6 +1515,10 @@ pub fn ensureNavTypeUpToDate(pt: Zcu.PerThread, nav_id: InternPool.Nav.Index) Zc
     claim: while (true) switch (try zcu.claimOrWait(anal_unit)) {
         .claimed => break :claim,
         .recursed => return error.AnalysisFail,
+        .cycle => {
+            Zcu.tls_retry_loop = anal_unit;
+            return error.AnalysisFail;
+        },
         .done => {
             if (zcu.anyAnalysisFailed(anal_unit)) return error.AnalysisFail;
             switch (ip.getNav(nav_id).status) {
@@ -1784,6 +1800,10 @@ pub fn ensureFuncBodyUpToDate(pt: Zcu.PerThread, func_index: InternPool.Index) Z
     } else claim: while (true) switch (try zcu.claimOrWait(anal_unit)) {
         .claimed => break :claim,
         .recursed => return error.AnalysisFail,
+        .cycle => {
+            Zcu.tls_retry_loop = anal_unit;
+            return error.AnalysisFail;
+        },
         .done => {
             if (zcu.anyAnalysisFailed(anal_unit)) return error.AnalysisFail;
             // The previous holder may have released its claim via a retry-abort
diff --git a/src/zig_llvm.cpp b/src/zig_llvm.cpp
index e7cef0b90d68..afd1c76c6cdc 100644
--- a/src/zig_llvm.cpp
+++ b/src/zig_llvm.cpp
@@ -577,7 +577,12 @@ ZIG_EXTERN_C bool ZigLLVMTargetMachineEmitToFile(LLVMTargetMachineRef targ_machi
 }
 
 void ZigLLVMSetOptBisectLimit(LLVMContextRef context_ref, int limit) {
-    static OptBisect opt_bisect;
+    // Per-thread: with --llvm-codegen-threads>1 each shard has its own
+    // LLVMContext on its own emit thread; a single shared OptBisect would
+    // have N threads racing on LastBisectNum and the count would be
+    // meaningless. thread_local gives each shard a stable, independent
+    // bisection counter.
+    static thread_local OptBisect opt_bisect;
     opt_bisect.setLimit(limit);
     unwrap(context_ref)->setOptPassGate(opt_bisect);
 }

From 4ec9d2c72391adbe5979096871fe4bf745d9747a Mon Sep 17 00:00:00 2001
From: Dylan Conway <dylan.conway567@gmail.com>
Date: Wed, 22 Apr 2026 02:42:06 -0700
Subject: [PATCH 12/15] shard: emit COFF shards with .obj extension; cache key
 + msvc bootstrap fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Shard naming uses target.ofmt.fileExt() instead of hardcoded ".o" in
  Compilation.zig, link.zig, and Build/Step/Compile.zig so COFF targets
  get foo.{i}.obj. ELF/Mach-O behaviour unchanged (fileExt() returns ".o").
- Hash llvm_codegen_threads / llvm_no_merge_shards / no_link_obj into the
  cache key — these change the output file set (one merged object vs N
  shard objects), so a stale cache hit would otherwise produce the wrong
  layout.
- std.c: drop const from _msize to match the Windows SDK declaration so
  the C-backend bootstrap (zig2.c) compiles under clang-cl/msvc.

With these, build-obj --no-link --llvm-no-merge-shards --llvm-codegen-threads=N
works for x86_64-windows-msvc; lld-link consumes the shards directly.
---
 lib/std/Build/Step/Compile.zig | 25 +++++++++++++++----------
 lib/std/c.zig                  |  2 +-
 src/Compilation.zig            | 17 +++++++++++++----
 src/link.zig                   |  8 +++++---
 4 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/lib/std/Build/Step/Compile.zig b/lib/std/Build/Step/Compile.zig
index 8f1853024640..ee2c353010a1 100644
--- a/lib/std/Build/Step/Compile.zig
+++ b/lib/std/Build/Step/Compile.zig
@@ -159,12 +159,14 @@ dead_strip_dylibs: bool = false,
 
 /// Number of threads to use for LLVM backend code generation.
 /// 0 means single-threaded (default). > 1 enables parallel codegen.
-/// When enabled, outputs multiple .o files: filename.0.o, filename.1.o, etc.
+/// When enabled, outputs multiple object files: filename.0.o, filename.1.o, etc.
+/// (or .obj on COFF targets).
 llvm_codegen_threads: u32 = 0,
 
 /// Skip the relocatable -r merge of partitioned LLVM output. The shard
-/// objects are emitted directly to `{emit}.{i}.o` for the downstream linker
-/// to consume. Only meaningful when `llvm_codegen_threads > 1`.
+/// objects are emitted directly to `{emit}.{i}.o` (or `.obj` on COFF) for
+/// the downstream linker to consume. Only meaningful when
+/// `llvm_codegen_threads > 1`.
 llvm_no_merge_shards: bool = false,
 
 /// Skip linker step for build-obj - outputs raw LLVM object file(s).
@@ -899,10 +901,11 @@ pub fn getEmittedBin(compile: *Compile) LazyPath {
 }
 
 /// Returns the per-shard object paths when `llvm_no_merge_shards` is set.
-/// Shard `i` lives at `{dir}/{stem}.{i}.o` where `dir` is the emitted-bin
-/// directory and `stem` is `out_filename` with a trailing `.o` stripped. The
-/// returned slice has `llvm_codegen_threads` entries, allocated from the
-/// build arena.
+/// Shard `i` lives at `{dir}/{stem}.{i}{ext}` where `dir` is the emitted-bin
+/// directory, `stem` is `out_filename` with the target's object extension
+/// stripped, and `ext` is that extension (`.o` for ELF/Mach-O, `.obj` for
+/// COFF). The returned slice has `llvm_codegen_threads` entries, allocated
+/// from the build arena.
 ///
 /// Intended use: `addObject` is configured with `llvm_codegen_threads > 1`
 /// and `llvm_no_merge_shards = true`; the consumer (an executable's link
@@ -913,13 +916,15 @@ pub fn getEmittedBinShards(compile: *Compile) []std.Build.LazyPath {
     assert(compile.llvm_codegen_threads > 1);
     const b = compile.step.owner;
     const dir = compile.getEmittedBinDirectory();
-    const stem = if (std.mem.endsWith(u8, compile.out_filename, ".o"))
-        compile.out_filename[0 .. compile.out_filename.len - 2]
+    const target = compile.rootModuleTarget();
+    const obj_ext = target.ofmt.fileExt(target.cpu.arch);
+    const stem = if (std.mem.endsWith(u8, compile.out_filename, obj_ext))
+        compile.out_filename[0 .. compile.out_filename.len - obj_ext.len]
     else
         compile.out_filename;
     const out = b.allocator.alloc(std.Build.LazyPath, compile.llvm_codegen_threads) catch @panic("OOM");
     for (out, 0..) |*p, i| {
-        p.* = dir.path(b, b.fmt("{s}.{d}.o", .{ stem, i }));
+        p.* = dir.path(b, b.fmt("{s}.{d}{s}", .{ stem, i, obj_ext }));
     }
     return out;
 }
diff --git a/lib/std/c.zig b/lib/std/c.zig
index 331b8b0d2c39..593b42380e47 100644
--- a/lib/std/c.zig
+++ b/lib/std/c.zig
@@ -11371,7 +11371,7 @@ const private = struct {
     extern "c" fn getentropy(buffer: [*]u8, size: usize) c_int;
     extern "c" fn arc4random_buf(buf: [*]u8, len: usize) void;
 
-    extern "c" fn _msize(?*const anyopaque) usize;
+    extern "c" fn _msize(?*anyopaque) usize;
     extern "c" fn malloc_size(?*const anyopaque) usize;
     extern "c" fn malloc_usable_size(?*const anyopaque) usize;
     extern "c" fn posix_memalign(memptr: *?*anyopaque, alignment: usize, size: usize) c_int;
diff --git a/src/Compilation.zig b/src/Compilation.zig
index 6f3d3727c59a..ac1871ea983b 100644
--- a/src/Compilation.zig
+++ b/src/Compilation.zig
@@ -2188,6 +2188,12 @@ pub fn create(gpa: Allocator, arena: Allocator, diag: *CreateDiagnostic, options
         cache.hash.add(options.emit_llvm_ir != .no);
         cache.hash.add(options.emit_llvm_bc != .no);
         cache.hash.add(options.emit_docs != .no);
+        // Sharded codegen changes the output file *set* (one merged object vs.
+        // N shard objects), so the count and the merge/no-link knobs must be
+        // part of the cache key.
+        cache.hash.add(options.llvm_codegen_threads);
+        cache.hash.add(options.llvm_no_merge_shards);
+        cache.hash.add(options.no_link_obj);
         // TODO audit this and make sure everything is in it
 
         const main_mod = options.main_mod orelse options.root_mod;
@@ -3471,14 +3477,17 @@ fn flush(
                 const list = try arena.alloc([*:0]const u8, num_threads);
                 const base_path_slice = std.mem.sliceTo(base_bin_path.?, 0);
 
-                // Strip .o extension if present
-                const base_name: []const u8 = if (std.mem.endsWith(u8, base_path_slice, ".o"))
-                    base_path_slice[0 .. base_path_slice.len - 2]
+                // Strip the target's object-file extension (.o for ELF/Mach-O,
+                // .obj for COFF) so shards become `{stem}.{i}{ext}`.
+                const target = &comp.root_mod.resolved_target.result;
+                const obj_ext = target.ofmt.fileExt(target.cpu.arch);
+                const base_name: []const u8 = if (std.mem.endsWith(u8, base_path_slice, obj_ext))
+                    base_path_slice[0 .. base_path_slice.len - obj_ext.len]
                 else
                     base_path_slice;
 
                 for (0..num_threads) |i| {
-                    list[i] = (try std.fmt.allocPrintSentinel(arena, "{s}.{d}.o", .{ base_name, i }, 0)).ptr;
+                    list[i] = (try std.fmt.allocPrintSentinel(arena, "{s}.{d}{s}", .{ base_name, i, obj_ext }, 0)).ptr;
                 }
                 break :blk list;
             } else null;
diff --git a/src/link.zig b/src/link.zig
index 248e06db0027..25d7d8c48794 100644
--- a/src/link.zig
+++ b/src/link.zig
@@ -414,15 +414,17 @@ pub const File = struct {
             out[0] = single;
             return out;
         }
+        const target = &base.comp.root_mod.resolved_target.result;
+        const obj_ext = target.ofmt.fileExt(target.cpu.arch);
         const base_path = single.sub_path;
-        const base_name = if (std.mem.endsWith(u8, base_path, ".o"))
-            base_path[0 .. base_path.len - 2]
+        const base_name = if (std.mem.endsWith(u8, base_path, obj_ext))
+            base_path[0 .. base_path.len - obj_ext.len]
         else
             base_path;
         const out = try arena.alloc(Cache.Path, n);
         for (out, 0..) |*p, i| p.* = .{
             .root_dir = single.root_dir,
-            .sub_path = try std.fmt.allocPrint(arena, "{s}.{d}.o", .{ base_name, i }),
+            .sub_path = try std.fmt.allocPrint(arena, "{s}.{d}{s}", .{ base_name, i, obj_ext }),
         };
         return out;
     }

From 4c5f7f5dc921300b9c4cc263857118aa0001aa81 Mon Sep 17 00:00:00 2001
From: Dylan Conway <dylan.conway567@gmail.com>
Date: Wed, 22 Apr 2026 03:36:09 -0700
Subject: [PATCH 13/15] psema: cut Windows allocator + condvar contention;
 mimalloc new/delete
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Parallel sema + sharded LLVM emit on Windows was wall-clock bound on
two host primitives that the linux/macos paths don't hit:

- gpa = raw_c_allocator when link_libc, which on Windows is
  HeapAlloc(GetProcessHeap()) behind a single critical section. Switch
  release builds to smp_allocator (per-thread heaps backed by the page
  allocator) the same way the no-libc path already does. Debug keeps
  c_allocator so leak tooling stays accurate.
- std.Thread.Condition on Windows wrapped CONDITION_VARIABLE, whose
  Wake* has no userspace "no waiters" fast-path — every
  work_queue_cond.signal() and claim-shard cond.signal() became a
  kernel32 call. Use FutexImpl everywhere; on Windows the Futex layer
  already maps to RtlWaitOnAddress (Win8+). The old WindowsImpl is left
  in place for reference.
- LLVM's own allocations go through C++ operator new, which still hits
  the CRT heap. Add tools/mimalloc_new_delete_override.cpp (mimalloc's
  unity static.c + the replaceable global operators) and a windows-gnu
  splice in the bootstrap workflow mirroring the existing linux-musl
  step. malloc/free themselves stay on the CRT — they can't be
  statically interposed on Windows — but LLVM's hot path is operator
  new, which is replaceable per the standard.

bun debug zig step on a 24-core x86_64-windows-msvc host:

  serial    psema    psema+24sh
  208.2s    165.0s   77.0s        before
  165.1s    151.7s   35.1s        after  (5.9x vs original serial)
---
 .github/workflows/bun_build.yaml       | 21 +++++++++++++++++--
 lib/std/Thread/Condition.zig           |  8 +++++--
 src/main.zig                           | 19 ++++++++++++-----
 tools/mimalloc_new_delete_override.cpp | 29 ++++++++++++++++++++++++++
 4 files changed, 68 insertions(+), 9 deletions(-)
 create mode 100644 tools/mimalloc_new_delete_override.cpp

diff --git a/.github/workflows/bun_build.yaml b/.github/workflows/bun_build.yaml
index 45a0f5995425..0478a4b15e5c 100644
--- a/.github/workflows/bun_build.yaml
+++ b/.github/workflows/bun_build.yaml
@@ -47,7 +47,7 @@ jobs:
           repository: oven-sh/mimalloc
           ref: bun-dev3-v2
           path: mimalloc
-      - name: Splice mimalloc into bootstrap build (linux-musl only)
+      - name: Splice mimalloc into bootstrap build (linux-musl)
         # musl's malloc has a single global rwlock; with N parallel LLVM
         # contexts (--llvm-codegen-threads=N) every `operator new` serialises
         # on it. mimalloc has per-thread heaps. Compile its unity-build
@@ -55,7 +55,6 @@ jobs:
         # target after the host zig exists, then link the object into the
         # final cross-compiled zig. Inserted just before the final
         # `$ZIG build` (the `cd "$ROOTDIR/zig"` line) so $ZIG is available.
-        # macOS/Windows mallocs are already per-thread; only musl needs this.
         if: contains(matrix.target, 'linux-musl')
         run: |
           sed -i '/^cd "\$ROOTDIR\/zig"$/i \
@@ -66,6 +65,24 @@ jobs:
             -o "$ROOTDIR/out/mimalloc-$TARGET-$MCPU.o"\
           ' build
           sed -i 's#-Dversion-string="$ZIG_VERSION"#-Dversion-string="$ZIG_VERSION" -Dmimalloc-obj="$ROOTDIR/out/mimalloc-$TARGET-$MCPU.o"#' build
+      - name: Splice mimalloc into bootstrap build (windows-gnu)
+        # Windows CRT routes operator new -> HeapAlloc(GetProcessHeap()),
+        # which is guarded by a single critical section — same parallel-emit
+        # serialisation as musl. malloc/free can't be statically interposed
+        # on Windows, but C++ operator new/delete are replaceable per the
+        # standard, and LLVM's hot allocations go through them. Compile the
+        # override TU as C++ so the global operators are emitted; mimalloc
+        # itself is compiled-in via #include of its unity-build static.c.
+        if: contains(matrix.target, 'windows-gnu')
+        run: |
+          sed -i '/^cd "\$ROOTDIR\/zig"$/i \
+          $ZIG c++ -c "$ROOTDIR/zig/tools/mimalloc_new_delete_override.cpp" \\\
+            -I "$ROOTDIR/mimalloc/include" -I "$ROOTDIR/mimalloc" \\\
+            -target $TARGET -mcpu=$MCPU -std=c++17 -O2 -fno-builtin \\\
+            -DNDEBUG -Wno-date-time -DMI_STATIC_LIB \\\
+            -o "$ROOTDIR/out/mimalloc-$TARGET-$MCPU.o"\
+          ' build
+          sed -i 's#-Dversion-string="$ZIG_VERSION"#-Dversion-string="$ZIG_VERSION" -Dmimalloc-obj="$ROOTDIR/out/mimalloc-$TARGET-$MCPU.o"#' build
       - run: cat build
       - name: Cache host toolchain
         uses: actions/cache@v4
diff --git a/lib/std/Thread/Condition.zig b/lib/std/Thread/Condition.zig
index 91974a44b4ab..e02c185822f8 100644
--- a/lib/std/Thread/Condition.zig
+++ b/lib/std/Thread/Condition.zig
@@ -107,10 +107,14 @@ pub fn broadcast(self: *Condition) void {
     self.impl.wake(.all);
 }
 
+// FutexImpl is used everywhere, including Windows. WindowsImpl wraps the
+// kernel CONDITION_VARIABLE which has no userspace "no waiters" fast-path —
+// every wake() is a kernel32 call. Under heavily-signalled condvars (e.g.
+// the compiler's per-job work_queue_cond.signal()) this dominates wall time
+// at high thread counts. FutexImpl checks `wakeable == 0` in userspace
+// first; on Windows the underlying Futex maps to RtlWaitOnAddress (Win8+).
 const Impl = if (builtin.single_threaded)
     SingleThreadedImpl
-else if (builtin.os.tag == .windows)
-    WindowsImpl
 else
     FutexImpl;
 
diff --git a/src/main.zig b/src/main.zig
index 854b6517d92e..8e713022e3d8 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -173,12 +173,21 @@ pub fn main() anyerror!void {
         if (build_options.debug_gpa) break :gpa .{ debug_allocator.allocator(), true };
         if (native_os == .wasi) break :gpa .{ std.heap.wasm_allocator, false };
         if (builtin.link_libc) {
-            // We would prefer to use raw libc allocator here, but cannot use
-            // it if it won't support the alignment we need.
-            if (@alignOf(std.c.max_align_t) < @max(@alignOf(i128), std.atomic.cache_line)) {
-                break :gpa .{ std.heap.c_allocator, false };
+            // libc malloc is fine single-threaded, but with ZIG_PARALLEL_SEMA
+            // and high --llvm-codegen-threads the per-process heap lock
+            // (musl's global rwlock, Windows CRT's HeapAlloc critical
+            // section) serialises every gpa allocation across all worker
+            // threads. Prefer the per-thread smp_allocator in release
+            // builds; it backs onto the page allocator so it's safe to mix
+            // with libc malloc used elsewhere (LLVM, C++). Debug keeps the
+            // libc allocator (-Ddebug-gpa is handled above) for leak tooling.
+            if (builtin.mode == .Debug) {
+                if (@alignOf(std.c.max_align_t) < @max(@alignOf(i128), std.atomic.cache_line)) {
+                    break :gpa .{ std.heap.c_allocator, false };
+                }
+                break :gpa .{ std.heap.raw_c_allocator, false };
             }
-            break :gpa .{ std.heap.raw_c_allocator, false };
+            break :gpa .{ std.heap.smp_allocator, false };
         }
         break :gpa switch (builtin.mode) {
             .Debug => .{ debug_allocator.allocator(), true },
diff --git a/tools/mimalloc_new_delete_override.cpp b/tools/mimalloc_new_delete_override.cpp
new file mode 100644
index 000000000000..74b9be6d26c8
--- /dev/null
+++ b/tools/mimalloc_new_delete_override.cpp
@@ -0,0 +1,29 @@
+// mimalloc operator new/delete override for the zig compiler on Windows.
+//
+// LLVM emit at high --llvm-codegen-threads counts allocates heavily through
+// C++ operator new from N concurrent LLVM contexts. The Windows CRT routes
+// operator new -> malloc -> HeapAlloc(GetProcessHeap()), and the process heap
+// is guarded by a single critical section, so 24 threads serialise on it.
+//
+// On linux-musl the CI splices a MI_MALLOC_OVERRIDE static.c object so
+// malloc/free themselves are replaced (POSIX symbol interposition). That
+// doesn't work for static MSVC linking — the CRT's malloc is a strong symbol
+// — but C++ operator new/delete *are* replaceable per the standard, and
+// LLVM's hot allocations go through them. So compile mimalloc here and
+// provide global new/delete that forward to it; LLVM picks these up at link
+// time and zig's gpa is handled separately by smp_allocator.
+//
+// Compiled with:
+//   clang-cl /O2 /MT /std:c++17 /EHsc /DNDEBUG /DMI_STATIC_LIB
+//     -I <mimalloc>/include -I <mimalloc> /c mimalloc_new_delete_override.cpp
+// and passed to zig's build via -Dmimalloc-obj=<this>.obj.
+
+// mimalloc unity build — defines mi_malloc/mi_free/mi_new and friends.
+// Compiled as C++ (mimalloc supports this), *without* MI_MALLOC_OVERRIDE so
+// it doesn't try to redefine malloc/free — MSVC link would reject the
+// duplicate symbols against the static CRT.
+#include "src/static.c" // NOLINT
+
+// Replaceable global operator new/delete -> mimalloc.
+// Header is self-contained; only needs the mi_* symbols above.
+#include "mimalloc-new-delete.h"

From fc091008f9724ecbb2f8eeb2facd95f47ddec30f Mon Sep 17 00:00:00 2001
From: Dylan Conway <dylan.conway567@gmail.com>
Date: Wed, 22 Apr 2026 15:11:11 -0700
Subject: [PATCH 14/15] shard: key navShard on (file, fqn) instead of file
 alone
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per-file partitioning meant a small source file that hosts thousands of
generic instantiations (e.g. output.zig's printf-style formatters) lands
entirely in one LLVM module, pinning emit wall-clock to that one shard.
For bun on a 24-core host, shard 13 took 22.5s while the rest finished
in 3-14s.

Hashing the FQN as well spreads instantiations across shards. Max shard
drops from 22.5s to ~14s; cross-shard externs grow (CPU sum +30%) but
wall-clock falls. bun debug zig step:

  file-only:   ~35s wall, 207s cpu-sum, max-shard 22.5s
  file+fqn:    ~27s wall, 274s cpu-sum, max-shard 13.9s

Determinism: anonymous-type FQNs embed InternPool indices which are
insertion-order dependent under parallel sema, so the shard set can vary
between runs. This is no regression — `shardedNavName` already embeds
the same indices in cross-shard symbol names, so sharded build-obj output
was never bit-reproducible under ZIG_PARALLEL_SEMA. cg=1 builds (CI
releases) are unaffected. A structural-hash naming fix is tracked
separately.
---
 src/Zcu.zig | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/Zcu.zig b/src/Zcu.zig
index 6bbc1af2f845..f4624fcef59f 100644
--- a/src/Zcu.zig
+++ b/src/Zcu.zig
@@ -4812,9 +4812,28 @@ pub fn navFileScope(zcu: *Zcu, nav: InternPool.Nav.Index) *File {
     return zcu.fileByIndex(zcu.navFileScopeIndex(nav));
 }
 
+/// Shard assignment for `nav`. Keyed on the file's `shardKey` *plus* the
+/// nav's fully-qualified name so a single file with thousands of generic
+/// instantiations (e.g. printf-style formatters that monomorphise per call
+/// site) doesn't pin the entire emit wall-clock to one LLVM module.
+///
+/// Determinism: the shard key is content-derived (path + FQN bytes), but
+/// FQNs of anonymous types embed InternPool indices (`__anon_N`) which are
+/// not stable across parallel-sema runs. That's no regression — the
+/// per-shard *symbol names* already carry those indices via `shardedNavName`
+/// and the type-name suffix, so sharded `build-obj` output was never
+/// bit-reproducible under `ZIG_PARALLEL_SEMA`. CI release builds use
+/// `--llvm-codegen-threads=1` (no sharding) and remain reproducible. A
+/// proper fix needs structural type-hash naming; tracked separately.
 pub fn navShard(zcu: *Zcu, nav: InternPool.Nav.Index, n: u32) u32 {
     if (n <= 1) return 0;
-    return zcu.navFileScope(nav).computeShard(n);
+    const ip = &zcu.intern_pool;
+    var buf: [512]u8 = undefined;
+    const file_key = zcu.navFileScope(nav).shardKey(&buf);
+    var h: std.hash.Wyhash = .init(0);
+    h.update(file_key);
+    h.update(ip.getNav(nav).fqn.toSlice(ip));
+    return @intCast(h.final() % n);
 }
 
 /// Returns the LLVM codegen shard that owns `unit`. Module-level assembly is

From 125866cc8dc5e8159ba54ecb11d856ca662544ce Mon Sep 17 00:00:00 2001
From: Dylan Conway <dylan.conway567@gmail.com>
Date: Wed, 22 Apr 2026 15:46:28 -0700
Subject: [PATCH 15/15] build: shard the build-runner compile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`zig build` compiles build_runner.zig + the user's build.zig before any
step runs. That compile pulled in ~10k navs of std.Build and emitted
them through a single LLVM module — ~3.7s of cold-cache wall before the
first user step starts. Pass llvm_codegen_threads (the same n_jobs the
thread pool was sized to) so the runner emit shards like any other
compile.

bun debug zig step, 24-core Windows, cold local cache:

  build-runner compile  3.84s -> 1.28s  (emit 2.96s -> 0.37s)
  total                 27s   -> 24s
---
 src/main.zig | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/main.zig b/src/main.zig
index 8e713022e3d8..7465d552c75f 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -5466,6 +5466,12 @@ fn cmdBuild(gpa: Allocator, arena: Allocator, args: []const []const u8) !void {
                 .cache_mode = .whole,
                 .reference_trace = reference_trace,
                 .debug_compile_errors = debug_compile_errors,
+                // The build runner pulls in a non-trivial chunk of std.Build;
+                // with single-threaded emit it costs several seconds on a
+                // cold local cache before any user step starts. Shard it the
+                // same as user compiles. `threads.len` is the resolved
+                // `n_jobs` (cpu-count-capped above).
+                .llvm_codegen_threads = @intCast(thread_pool.threads.len),
             }) catch |err| switch (err) {
                 error.CreateFail => fatal("failed to create compilation: {f}", .{create_diag}),
                 else => fatal("failed to create compilation: {s}", .{@errorName(err)}),