@@ -372,8 +372,11 @@ template <typename T>
372
372
float CutlassInt8GemmRunner<T>::profileConfig(const tkc::CutlassGemmConfig& config, tk::QuantOption quantOption, int m,
373
373
int n, int k, int8_t * A, int8_t * B, void * C, float * alphaCol, float * alphaRow, char * workspace)
374
374
{
375
- constexpr int warmup = 5 ;
376
- constexpr int runs = 15 ;
375
+ // reduce the number of profiling runs to reduce shared memory usage
376
+ // try to keep the warmup:runs ratio at roughly 1:3
377
+ // different GPUs have different timings
378
+ constexpr int warmup = 3 ;
379
+ constexpr int runs = 10 ;
377
380
378
381
const auto workspaceBytes = getWorkspaceSize (m, n, k);
379
382
@@ -423,19 +426,32 @@ tkc::CutlassGemmConfig CutlassInt8GemmRunner<T>::profileGemm(tk::QuantOption qua
423
426
424
427
float bestTime = std::numeric_limits<float >::max ();
425
428
tkc::CutlassGemmConfig bestConfig;
426
-
429
+ float time = bestTime;
430
+ bool is_ok = false ;
427
431
for (int ii = 0 ; ii < candidateConfigs.size (); ++ii)
428
432
{
429
433
tkc::CutlassGemmConfig candidateConfig = candidateConfigs[ii];
430
- const float time = profileConfig (candidateConfig, quantOption, m, n, k, A, B, C, alphaCol, alphaRow, workspace);
431
- if (time < bestTime)
432
- {
434
+
435
+ try {
436
+ time = profileConfig (candidateConfig, quantOption, m, n, k, A, B, C, alphaCol, alphaRow, workspace);
437
+ is_ok = true ;
438
+ } catch (...) {
439
+ std::ostringstream msg;
440
+ msg << " it seem init failed, because has no enough shared memory." ;
441
+ TLLM_LOG_DEBUG (msg.str ());
442
+ }
443
+ if (time < bestTime) {
433
444
bestConfig = candidateConfig;
434
445
bestTime = time;
435
446
}
436
447
}
437
-
438
- return bestConfig;
448
+ if (is_ok) {
449
+ return bestConfig;
450
+ } else {
451
+ std::ostringstream msg;
452
+ msg << " it seem can't found any good config." ;
453
+ TLLM_LOG_ERROR (msg.str ());
454
+ }
439
455
}
440
456
441
457
template <typename T>
0 commit comments