perf: customize cublasLt algo for Llama 3.3 70B TP4 (#6315)

zhenhuaw-me · web-flow · commit 62298bc4730b · 2025-07-24T23:01:15.000+08:00
Signed-off-by: Zhenhua Wang <zhenhuaw@nvidia.com>
diff --git a/.clangd b/.clangd
@@ -29,7 +29,7 @@ CompileFlags:
 # Tweak the clangd parse settings for all files
 CompileFlags:
   Compiler: clang++
-  CompilationDatabase: .
+  CompilationDatabase: cpp/build
   Add:
     # report all errors
     - "-ferror-limit=0"
diff --git a/cpp/tensorrt_llm/thop/cublasScaledMM.cpp b/cpp/tensorrt_llm/thop/cublasScaledMM.cpp
@@ -66,6 +66,9 @@ AlgoListType fp8_algo_list = {
     {{8, 8192, 8192}, {393, 36, 1, 0, 0, 5, 2}},
     // [-algo66 -m_tile10 -m_stages36 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom1 -m_mma0 -m_cga2 -m_scheduling1]
     {{8, 8192, 57344}, {10, 36, 1, 0, 0, 1, 2}},
+    // Llama-3.3-70B TP4 (this is the default algo on B200. Here we aim to use the same algo on GB200.)
+    // [-algo66 -m_tile393 -m_stages36 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom1 -m_mma0 -m_cga4 -m_scheduling1]
+    {{8, 8192, 14336}, {393, 36, 1, 0, 1, 1, 4}},
 };
 
 void set_algo_attr(cublasLtMatmulAlgo_t& algo, std::array<int, 7> const& attr_list)