Skip to content

Commit 9f4651f

Browse files
authored
Add support for AMD AINIC within RCCL default internal network plugin. (#2078)
* Added support for AMD ROCm net-ib alongside vanilla net-ib, with auto-generation to detect conflicts early during NCCL sync and enable future customizations. * Integrated AMD AINIC support in RCCL for out-of-the-box usage, leveraging performance improvements by default, channel pinning for optimal pipeline performance, and extended support for 32B in-line CTS messages. * Implemented internal derivation of AINIC-specific flags when RCCL AINIC environment parameter is set, and checks before initializing AINIC net-ib methods. * Included snapshot of auto-generated ROCm net-ib file (src/transport/net_ib_rocm.cc) for reference. * Fixed typos in RCCL param API (RCCL_AINIC_ROCE) and dlclose. * Updated plugin loading logic: * Load internal ROCmIB plugin only when NCCL_NET_PLUGIN is not set. * Load default internal net-ib only when not AINIC and no external plugin env is set.
1 parent 4f474a7 commit 9f4651f

File tree

12 files changed

+4262
-12
lines changed

12 files changed

+4262
-12
lines changed

CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ endif()
8080
# Determine which GPU architectures to build for
8181
set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
8282

83+
# ROCM NetIB patch
84+
include(cmake/rocmIb.cmake)
85+
8386
# Modify GPU architectures for Address Sanitizer builds by appending "xnack+"
8487
if (BUILD_ADDRESS_SANITIZER)
8588
SET(amdgpu_targets "")
@@ -571,6 +574,9 @@ set(SRC_FILES
571574
src/include/mlx5/mlx5dvcore.h
572575
src/include/mlx5/mlx5dvsymbols.h
573576
src/include/mlx5/mlx5dvwrap.h
577+
src/include/ionic/ionicdvcore.h
578+
src/include/ionic/ionicdvsymbols.h
579+
src/include/ionic/ionicdvwrap.h
574580
src/include/msccl/msccl_lifecycle.h
575581
src/include/msccl/msccl_parser.h
576582
src/include/msccl/msccl_scheduler.h
@@ -647,6 +653,8 @@ set(SRC_FILES
647653
src/misc/ipcsocket.cc
648654
src/misc/mlx5dvsymbols.cc
649655
src/misc/mlx5dvwrap.cc
656+
src/misc/ionicdvsymbols.cc
657+
src/misc/ionicdvwrap.cc
650658
src/misc/npkit.cc
651659
# src/misc/nvmlwrap.cc
652660
src/misc/nvmlwrap_stub.cc
@@ -695,6 +703,7 @@ set(SRC_FILES
695703
src/transport/generic.cc
696704
src/transport/net.cc
697705
src/transport/net_ib.cc
706+
src/transport/net_ib_rocm.cc
698707
src/transport/net_socket.cc
699708
src/transport/nvls.cc
700709
src/transport/p2p.cc
@@ -862,6 +871,7 @@ target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device)
862871
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device/network/unpack)
863872
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include)
864873
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/mlx5)
874+
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/ionic)
865875
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/plugin)
866876
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/gensrc)
867877
target_include_directories(rccl PRIVATE ${HSA_INCLUDE_PATH})

cmake/rocmIb.cmake

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
# MIT License
2+
#
3+
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
4+
#
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
#
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
# Dependencies
24+
25+
# HIP dependency is handled earlier in the project cmake file
26+
# when VerifyCompiler.cmake is included.
27+
28+
# GIT
29+
30+
# Test dependencies
31+
32+
# For downloading, building, and installing required dependencies
33+
include(cmake/DownloadProject.cmake)

message(STATUS "Generating ROCM NetIB... ")

# -------------------------
# Configurable paths
# -------------------------
# Path to RCCL source tree (local clone); used as the working directory for
# the patch/sed invocations below.
set(RCCL_SRC_DIR "${CMAKE_SOURCE_DIR}" CACHE PATH "Path to RCCL source directory")
# Patch file that is applied to produce the ROCm flavour of net-ib.
set(ROCM_NETIB_PATCH_FILE "${CMAKE_SOURCE_DIR}/ext-src/rocm_netib.patch" CACHE FILEPATH "ROCM NETIB Patch file to apply to RCCL")
# Output file written by `patch -o` and then rewritten in place by sed.
set(ROCM_NETIB_FILE "${CMAKE_SOURCE_DIR}/src/transport/net_ib_rocm.cc" CACHE FILEPATH "Generated ROCM NETIB file")

# -------------------------
# Find tools
# -------------------------
find_program(PATCH_EXECUTABLE patch)
find_program(SED_EXECUTABLE sed)

# -------------------------
# Symbol renames
# -------------------------
# sed substitutions applied, in this exact order, to the patched output.
# They are passed to a single sed invocation as consecutive -e expressions;
# sed applies them to each line in order, which gives the same result as
# running one `sed -i` pass per expression.
set(rocm_netib_sed_expressions
  "s/NCCL_PARAM(Ib/NCCL_PARAM(RocmIb/g"
  "s/RCCL_PARAM(Ib/RCCL_PARAM(RocmIb/g"
  "s/ncclParamIb/ncclParamRocmIb/g"
  "s/rcclParamIb/rcclParamRocmIb/g"
  "s/ncclIbMergedDevs/rocmIbMergedDevs/g"
  "s/ncclIbDevs/rocmIbDevs/g"
  "s/ncclIbLock/rocmIbLock/g"
  "s/ibProviderName/rocmIbProviderName/g"
  "s/ncclIbAsyncThread/rocmIbAsyncThread/g"
  "s/ncclIbGdrSupport/rocmIbGdrSupport/g"
  "s/ncclIbDmaBufSupport/rocmIbDmaBufSupport/g"
  "s/ncclIbInitCommDevBase/rocmIbInitCommDevBase/g"
  "s/ncclIbDestroyBase/rocmIbDestroyBase/g"
  "s/ncclIbRtrQp/rocmIbRtrQp/g"
  "s/ncclIbRtsQp/rocmIbRtsQp/g"
  "s/ForceEnableGdrdma/RocmForceEnableGdrdma/g"
  "s/ncclIbCheckVProps/rocmIbCheckVProps/g"
  "s/ncclIbGetRequest/rocmIbGetRequest/g"
  "s/ncclIbFreeRequest/rocmIbFreeRequest/g"
  "s/ncclIbRegMrDmaBufInternal/rocmIbRegMrDmaBufInternal/g"
  "s/ncclIbGetNetCommDevBase/rocmIbGetNetCommDevBase/g"
  "s/ncclIbDeregMrInternal/rocmIbDeregMrInternal/g"
  "s/ncclIbPostFifo/rocmIbPostFifo/g"
  "s/reqTypeStr/rocmIbReqTypeStr/g"
  "s/rcclNetP2pPolicy/rcclRocmNetP2pPolicy/g"
  "s/ncclIbMakeVDeviceInternal/rocmIbMakeVDeviceInternal/g"
  "s/ncclIbMakeVDevice/rocmIbMakeVDevice/g"
  "s/ncclIbInit/rocmIbInit/g"
  "s/ncclIbDevices/rocmIbDevices/g"
  "s/ncclIbGetPhysProperties/rocmIbGetPhysProperties/g"
  "s/ncclIbGetProperties/rocmIbGetProperties/g"
  # ncclIbListen / ncclIbConnect are only renamed when followed by the listed
  # delimiter, exactly as in the original per-expression sed calls.
  "s/ncclIbListen(/rocmIbListen(/g"
  "s/ncclIbListen,/rocmIbListen,/g"
  "s/ncclIbConnect(/rocmIbConnect(/g"
  "s/ncclIbConnect /rocmIbConnect /g"
  "s/ncclIbConnect,/rocmIbConnect,/g"
  "s/ncclIbAccept/rocmIbAccept/g"
  "s/ncclIbTest/rocmIbTest/g"
  "s/ncclIbRegMrDmaBuf/rocmIbRegMrDmaBuf/g"
  "s/ncclIbRegMr/rocmIbRegMr/g"
  "s/ncclIbDeregMr/rocmIbDeregMr/g"
  "s/ncclIbIsend/rocmIbIsend/g"
  "s/ncclIbIrecv/rocmIbIrecv/g"
  "s/ncclIbIflush/rocmIbIflush/g"
  "s/ncclIbCloseSend/rocmIbCloseSend/g"
  "s/ncclIbCloseRecv/rocmIbCloseRecv/g"
  "s/ncclIbCloseListen/rocmIbCloseListen/g"
  "s/ncclNetIb/rocmNetIb/g"
)

# -------------------------
# Generate the ROCm net-ib source
# -------------------------
if(NOT PATCH_EXECUTABLE OR NOT SED_EXECUTABLE)
  # Without both tools the file cannot be regenerated consistently. Leave any
  # existing file untouched: re-running the renames on already-renamed output
  # is not idempotent (e.g. ForceEnableGdrdma would be prefixed a second time).
  # NOTE(review): assumes a previously generated/checked-in copy of
  # ${ROCM_NETIB_FILE} is acceptable as a fallback - confirm.
  message(WARNING
    "ROCM NetIB: 'patch' and/or 'sed' was not found "
    "(PATCH_EXECUTABLE='${PATCH_EXECUTABLE}', SED_EXECUTABLE='${SED_EXECUTABLE}'); "
    "skipping generation of ${ROCM_NETIB_FILE} and leaving any existing file as is.")
else()
  # Emitted with message() rather than as an extra COMMAND: several COMMANDs
  # in one execute_process() form a pipeline, so an echo placed there would be
  # piped into patch's stdin instead of being printed.
  message(STATUS "Applying RCCL ROCM NetIB patch... to ${CMAKE_SOURCE_DIR}")

  # Arguments are passed directly (no shell) so paths containing spaces work.
  execute_process(
    COMMAND "${PATCH_EXECUTABLE}" -p1 -i "${ROCM_NETIB_PATCH_FILE}" -o "${ROCM_NETIB_FILE}"
    WORKING_DIRECTORY "${RCCL_SRC_DIR}"
    RESULT_VARIABLE rocm_netib_patch_result
  )
  if(NOT rocm_netib_patch_result EQUAL 0)
    # Reported as a warning (not fatal) so that configuration proceeds as it
    # did before this check existed; the generated file may be incomplete.
    message(WARNING
      "ROCM NetIB: applying ${ROCM_NETIB_PATCH_FILE} failed "
      "(result: ${rocm_netib_patch_result}); ${ROCM_NETIB_FILE} may be stale or incomplete.")
  endif()

  # Build "-e <expr>" pairs, preserving the order of the expression list.
  set(rocm_netib_sed_args "")
  foreach(rocm_netib_expr IN LISTS rocm_netib_sed_expressions)
    list(APPEND rocm_netib_sed_args -e "${rocm_netib_expr}")
  endforeach()

  execute_process(
    COMMAND "${SED_EXECUTABLE}" -i ${rocm_netib_sed_args} "${ROCM_NETIB_FILE}"
    WORKING_DIRECTORY "${RCCL_SRC_DIR}"
    RESULT_VARIABLE rocm_netib_sed_result
  )
  if(NOT rocm_netib_sed_result EQUAL 0)
    message(WARNING
      "ROCM NetIB: renaming symbols in ${ROCM_NETIB_FILE} failed "
      "(result: ${rocm_netib_sed_result}).")
  endif()
endif()

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

0 commit comments

Comments
 (0)