@@ -132,6 +132,7 @@ proc subgroupProc[A, B, C](wg: WorkGroupContext; numActiveThreads: uint32; barri
132
132
let globalOffsetX = wg.gl_WorkGroupID.x * wg.gl_WorkGroupSize.x
133
133
let globalOffsetY = wg.gl_WorkGroupID.y * wg.gl_WorkGroupSize.y
134
134
let globalOffsetZ = wg.gl_WorkGroupID.z * wg.gl_WorkGroupSize.z
135
+ # Set up thread contexts
135
136
for threadId in 0..<numActiveThreads:
136
137
threadContexts[threadId] = ThreadContext(
137
138
gl_LocalInvocationID: uvec3(x, y, z),
@@ -142,7 +143,6 @@ proc subgroupProc[A, B, C](wg: WorkGroupContext; numActiveThreads: uint32; barri
142
143
),
143
144
gl_SubgroupInvocationID: threadId
144
145
)
145
- threads[threadId] = compute(buffers, shared, args)
146
146
# Update coordinates
147
147
inc x
148
148
if x >= wg.gl_WorkGroupSize.x:
@@ -151,6 +151,9 @@ proc subgroupProc[A, B, C](wg: WorkGroupContext; numActiveThreads: uint32; barri
151
151
if y >= wg.gl_WorkGroupSize.y:
152
152
y = 0
153
153
inc z
154
+ # Allocate all compute closures
155
+ for threadId in 0..<numActiveThreads:
156
+ threads[threadId] = compute(buffers, shared, args)
154
157
# Run threads in lockstep
155
158
runThreads(threads, wg, threadContexts, numActiveThreads, barrier)
156
159
0 commit comments