Skip to content

Commit 6fda8dd

Browse files
[None][infra] Cherry-pick #6836 from main branch and improve SSH connection (#6971)
Signed-off-by: Yanchao Lu <[email protected]> Co-authored-by: Zhanrui Sun <[email protected]>
1 parent 28c30e1 commit 6fda8dd

File tree

3 files changed

+38
-22
lines changed

3 files changed

+38
-22
lines changed

cpp/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,17 @@ if(ENABLE_UCX)
495495
if(NOT ${ucx_FOUND})
496496
set(ENABLE_UCX 0)
497497
else()
498+
# Redirect UCXX's rapids-cmake download to a mirror.
#
# When the GITHUB_MIRROR environment variable is set to a non-empty value and
# ${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake exists, the rapids-cmake base URL
# inside that file is rewritten to point at the mirror.
#
# Notes:
# - $ENV{GITHUB_MIRROR} is read at configure time only.
# - The file is patched in place. After the first successful patch the
#   original URL no longer occurs in it, so later configure runs find nothing
#   to replace; in that case the file is left untouched and no warning is
#   printed.
if(DEFINED ENV{GITHUB_MIRROR} AND NOT "$ENV{GITHUB_MIRROR}" STREQUAL "")
  if(EXISTS "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake")
    file(READ "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake"
         UCXX_FETCH_RAPIDS_ORIGINAL)
    string(
      REPLACE "https://raw.githubusercontent.com/rapidsai/rapids-cmake"
              "$ENV{GITHUB_MIRROR}/rapidsai/rapids-cmake/raw/refs/heads"
              FILE_CONTENTS "${UCXX_FETCH_RAPIDS_ORIGINAL}")
    # Only rewrite the file (and report it) when the replacement actually
    # changed something; otherwise every configure run would touch the
    # third-party file and emit a misleading warning.
    if(NOT "${FILE_CONTENTS}" STREQUAL "${UCXX_FETCH_RAPIDS_ORIGINAL}")
      file(WRITE "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake" "${FILE_CONTENTS}")
      message(WARNING "Replace UCXX fetch_rapids.cmake with internal mirror")
    endif()
  endif()
endif()
498509
# installing ucxx via add_subdirectory results in strange cudart linking
499510
# error, thus using their installation script to isolate the installation
500511
# process until the issue is understood. And always trigger the build so

jenkins/BuildDockerImage.groovy

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ def buildImage(config, imageKeyToTag)
258258
// Step 2: Build the images
259259
stage ("Install packages") {
260260
sh "pwd && ls -alh"
261-
sh "env"
261+
sh "env | sort"
262262
sh "apk add make git"
263263
sh "git config --global --add safe.directory '*'"
264264

@@ -281,12 +281,12 @@ def buildImage(config, imageKeyToTag)
281281
try {
282282
def build_jobs = BUILD_JOBS
283283
// Fix the triton image pull timeout issue
284-
def BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG BASE_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
285-
def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
286-
def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
284+
def BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^ARG BASE_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
285+
def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
286+
def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep '^ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
287287

288288
if (target == "rockylinux8") {
289-
BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'jenkins-rockylinux8_%: BASE_IMAGE =' docker/Makefile | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
289+
BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^jenkins-rockylinux8_%: BASE_IMAGE =' docker/Makefile | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
290290
}
291291

292292
// Replace the base image and triton image with the internal mirror
@@ -295,7 +295,8 @@ def buildImage(config, imageKeyToTag)
295295

296296
if (dependent) {
297297
stage ("make ${dependent.target}_${action} (${arch})") {
298-
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: 300, shortCommondRunTimeMax: 7200)
298+
def randomSleep = (Math.random() * 300 + 300).toInteger()
299+
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, shortCommondRunTimeMax: 7200)
299300
trtllm_utils.llmExecStepWithRetry(this, script: """
300301
cd ${LLM_ROOT} && make -C docker ${dependent.target}_${action} \
301302
BASE_IMAGE=${BASE_IMAGE} \
@@ -304,7 +305,7 @@ def buildImage(config, imageKeyToTag)
304305
IMAGE_WITH_TAG=${dependentImageWithTag} \
305306
STAGE=${dependent.dockerfileStage} \
306307
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
307-
""", sleepInSecs: 300, shortCommondRunTimeMax: 7200)
308+
""", sleepInSecs: randomSleep, numRetries: 3, shortCommondRunTimeMax: 7200)
308309
args += " DEVEL_IMAGE=${dependentImageWithTag}"
309310
if (target == "ngc-release") {
310311
imageKeyToTag["NGC Devel Image ${config.arch}"] = dependentImageWithTag
@@ -322,7 +323,9 @@ def buildImage(config, imageKeyToTag)
322323
}
323324
}
324325
stage ("make ${target}_${action} (${arch})") {
325-
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: 300, shortCommondRunTimeMax: 7200)
326+
sh "env | sort"
327+
def randomSleep = (Math.random() * 300 + 300).toInteger()
328+
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, shortCommondRunTimeMax: 7200)
326329
trtllm_utils.llmExecStepWithRetry(this, script: """
327330
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
328331
BASE_IMAGE=${BASE_IMAGE} \
@@ -331,7 +334,7 @@ def buildImage(config, imageKeyToTag)
331334
IMAGE_WITH_TAG=${imageWithTag} \
332335
STAGE=${dockerfileStage} \
333336
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
334-
""", sleepInSecs: 300, shortCommondRunTimeMax: 7200)
337+
""", sleepInSecs: randomSleep, numRetries: 3, shortCommondRunTimeMax: 7200)
335338
if (target == "ngc-release") {
336339
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
337340
}

jenkins/L0_Test.groovy

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
9999
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
100100
ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false
101101

102+
// Options shared by the ssh/scp invocations in this file.
//   -o StrictHostKeyChecking=no      : do not prompt for / reject unknown host keys
//   -o UserKnownHostsFile=/dev/null  : do not read or persist host keys in known_hosts
// NOTE(review): together these disable host-key verification; presumably acceptable
// for the internal CI hosts this pipeline targets — confirm before reusing elsewhere.
COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
103+
102104
def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
103105
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
104106
def remote = [
@@ -113,7 +115,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
113115
pipeline.stage('Submit Test Results') {
114116
sh "mkdir -p ${stageName}"
115117
def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml"
116-
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
118+
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
117119
def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
118120
if (downloadSucceed) {
119121
sh "ls ${stageName}"
@@ -239,7 +241,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
239241

240242
Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)
241243

242-
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)
244+
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)
243245

244246
Utils.exec(
245247
pipeline,
@@ -327,7 +329,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
327329

328330
stage('Prepare Testing') {
329331
// Create Job Workspace folder in Frontend Node
330-
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh -oStrictHostKeyChecking=no ${remote.user}@${remote.host} 'mkdir ${jobWorkspace}'",)
332+
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host} 'mkdir -p ${jobWorkspace}'",)
331333

332334
// Download and Unzip Tar File
333335
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
@@ -336,11 +338,11 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
336338
// Upload slurm_run.sh to Frontend node
337339
def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
338340
Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
339-
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
341+
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
340342

341343
// Upload waives.txt to Frontend node
342344
def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
343-
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)
345+
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)
344346

345347
// Generate Test List and Upload to Frontend Node
346348
def makoArgs = getMakoArgsFromStageName(stageName, true)
@@ -349,7 +351,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
349351
// if the line cannot be split by "=", just ignore that line.
350352
def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs)
351353
def testListPath = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
352-
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)
354+
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)
353355

354356
// Generate Multi Node Job Launch Script
355357
def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
@@ -393,7 +395,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
393395
""".stripIndent()
394396
pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
395397
Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
396-
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
398+
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
397399
}
398400
stage('Run Test') {
399401
def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
@@ -1089,7 +1091,7 @@ def getSSHConnectionPorts(portConfigFile, stageName)
10891091
usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'),
10901092
string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME')
10911093
]) {
1092-
portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'netstat -tuln'",returnStdout: true)
1094+
portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'netstat -tuln'", returnStdout: true)
10931095
}
10941096
echo "Port Usage: ${portUsage}"
10951097

@@ -1248,7 +1250,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
12481250
def llmRootConfig = "${LLM_ROOT}${config}"
12491251
sh "mkdir ${llmRootConfig}"
12501252

1251-
def llmPath = sh (script: "realpath ${llmRootConfig}",returnStdout: true).trim()
1253+
def llmPath = sh (script: "realpath ${llmRootConfig}", returnStdout: true).trim()
12521254
def llmSrc = "${llmPath}/TensorRT-LLM/src"
12531255
echoNodeAndGpuInfo(pipeline, stageName)
12541256

@@ -1362,9 +1364,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
13621364
usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'),
13631365
string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME')
13641366
]) {
1365-
sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub"
1366-
sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys"
1367-
sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}"
1367+
sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub"
1368+
sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys"
1369+
sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}"
13681370

13691371
def (int userPort, int monitorPort) = getSSHConnectionPorts(portConfigFilePath, stageName)
13701372
if (userPort == 0) {
@@ -1373,7 +1375,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
13731375
return
13741376
}
13751377

1376-
sh "ssh -f -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
1378+
sh "ssh -f ${COMMON_SSH_OPTIONS} -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
13771379
sh "autossh -fNR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
13781380
sh "ps aux | grep ssh"
13791381
try {

0 commit comments

Comments
 (0)