AMD-AGI · ajassani · May 8, 2025 · May 8, 2025
diff --git a/TraceLens/PerfModel/perf_model.py b/TraceLens/PerfModel/perf_model.py
@@ -104,8 +104,7 @@ def __init__(self, event, arch=None, detail_level=0):
                 if not os.path.exists(os.environ.get('GEMMOLOGIST_PATH')):
                     raise ValueError(f"GEMMOLOGIST_PATH does not exist: {os.environ.get('GEMMOLOGIST_PATH')}")
                 dtype = torch_dtype_map(self.param_details['dtype_A_B'][0])
-                gemmologist_time = self.get_gemmologist_time(arch, self.M, self.N, self.K, dtype)
-                self.gemmologist_time = gemmologist_time
+                self.gemmologist_time, self.gemmologist_cmd = GEMM.get_gemmologist_time(arch, self.M, self.N, self.K, dtype)
             else:
                 # TODO: use naive roofline model
                 pass
@@ -161,32 +160,35 @@ def bytes_bwd(self, bytes_per_element):
 
     @staticmethod
     def get_gemmologist_time(arch, M, N, K, dtype):
-        # Create a unique key for the cache based on dimensions and architecture
-        cache_key = (arch['name'], arch['freq_mhz'], M, N, K, dtype)
-        # Check if the result is already in the cache
-        if cache_key in GEMM.cache_gemm_results:
-            return GEMM.cache_gemm_results[cache_key]
         # assume that gemmologist path is given in the environment variable GEMMOLOGIST_PATH
         gemmologist_path = os.environ.get('GEMMOLOGIST_PATH')
         cmd = [
             "./bin/gemmologist.py",
             "-m", str(M),
             "-n", str(N),
             "-k", str(K),
+            "--dtype", dtype,
             "-d", "1",
             "-a", arch["name"],
             "--freq_mhz", str(arch["freq_mhz"]),
             "--topn", "1"
         ]
+
+        # Check if the result is already in the cache
+        cache_key = tuple(cmd)
+        if cache_key in GEMM.cache_gemm_results:
+            return GEMM.cache_gemm_results[cache_key], " ".join(cmd)
+
+        # Run the command
         result = subprocess.run(cmd, cwd=gemmologist_path, capture_output=True, text=True)
         stdout = result.stdout
         stderr = result.stderr
         log = re.findall(r"Time=\d+\.\d+", stdout)
         if len(log) > 0:
             gemmologist_time = float(re.sub("Time=", "", str(log[0])))
-            # Store the result in the cache
+            # Cache the result
             GEMM.cache_gemm_results[cache_key] = gemmologist_time
-            return gemmologist_time
+            return gemmologist_time, " ".join(cmd)
         else:
             raise AssertionError("Not able to simulate in gemmologist", cmd, stdout, stderr)
 

diff --git a/TraceLens/TreePerf/tree_perf.py b/TraceLens/TreePerf/tree_perf.py
@@ -149,6 +149,7 @@ def compute_perf_metrics(self, event, bwd=False,
         if hasattr(perf_model, "gemmologist_time"):
             dict_metrics['Gemmologist Time (µs)'] = perf_model.gemmologist_time
             dict_metrics['Gemmologist TFLOPS/s'] = (gflops / 1e3) / (perf_model.gemmologist_time / 1e6) if perf_model.gemmologist_time > 0 else float('nan')
+            dict_metrics['Gemmologist cmd'] = perf_model.gemmologist_cmd
 
         for key, value in perf_model.param_details.items():
             dict_metrics[f"param: {key}"] = value
@@ -239,6 +240,7 @@ def summarize_df_perf_metrics(df_perf_metrics, agg_metrics=['mean', 'std']):
             # first since it should be same for the group
             dict_agg['Gemmologist TFLOPS/s'] = 'first'
             dict_agg['Gemmologist Time (µs)'] = 'first'
+            dict_agg['Gemmologist cmd'] = 'first'
         if 'Non-Data-Mov TFLOPS/s' in df_perf_metrics.columns:
             dict_agg['Non-Data-Mov TFLOPS/s'] = agg_metrics
         if 'Non-Data-Mov Kernel Time (µs)' in df_perf_metrics.columns: