Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
0c3b545
snapshot, first version gen perf report + node replay microbench + re…
eppaneamd May 2, 2025
1e28e47
org imports + formatting + rm redundant code
May 3, 2025
8091e18
add naive grouping of triton kernels
lauri9 May 21, 2025
040b5fe
add option to save to user specified xlsx
lauri9 May 21, 2025
ce735d1
add upsample and norm ops
lauri9 May 21, 2025
eaedb6a
fix import
lauri9 May 21, 2025
9c34288
ensure single base dir when custom xlsx path used
eppaneamd May 22, 2025
954572f
node replay, ensure single base dir when custom xlsx path is used + l…
eppaneamd May 22, 2025
31dc620
first readme for generate_perf_report script
May 28, 2025
ab8b59e
perf report readme minor fix
May 28, 2025
46290f4
sort traces by rank
eppaneamd May 28, 2025
129181e
rename var
eppaneamd May 28, 2025
bb73715
fix error on ambiguous truth value of dataframe
lauri9 Jun 3, 2025
730ce95
warn about kernel launchers in group other
lauri9 Jun 12, 2025
4027b93
disable strict metadata checking
lauri9 Jun 19, 2025
a9313c1
add missing upsample ops, aiter FA launchers
lauri9 Jul 29, 2025
e1c2f57
fix: disable strict metadata checking also here
lauri9 Jul 31, 2025
38ec716
add efficient_attention sdpa
lauri9 Jul 31, 2025
47a10dc
add flash_attn_3::fwd to grouping
lauri9 Aug 19, 2025
7c32c56
fix: install packages required for report generation
lauri9 Aug 27, 2025
382622d
add missing aiter attn launcher for grouping
lauri9 Sep 9, 2025
57057b0
Revert "fix: install packages required for report generation"
lauri9 Sep 10, 2025
26ef173
drop psutil dependency
lauri9 Sep 10, 2025
f0bba7a
add openpyxl dependency for report generation
lauri9 Sep 10, 2025
357f05d
fix to adapt to tracelens update
lauri9 Sep 16, 2025
e2ac22e
rm FlashAttnFunc perf model + xFuserRingFlashAttnFunc kernel launcher
eppaneamd Sep 24, 2025
498a5e2
add aiter::wrapper_fmha_v3_fwd to attn sheet
lauri9 Sep 25, 2025
e7daf9e
update node replay description (WIP)
eppaneamd Oct 31, 2025
1d4d8e7
Merge branch 'main' into feat/gen_report_node_replay_bench
eppaneamd Nov 5, 2025
6a016e8
process traces in parallel + rm save all kernels + improve logging wh…
eppaneamd Nov 5, 2025
f82df74
update configs + improve failure log on group perf metrics
eppaneamd Nov 5, 2025
19b3119
fix imports
eppaneamd Nov 5, 2025
2e25749
nccl analyzer load traces in parallel
eppaneamd Nov 5, 2025
aa23222
avoid passing instance methods to worker processes
eppaneamd Nov 5, 2025
65c27ce
quickfix
eppaneamd Nov 5, 2025
3f92cf9
add (un)linked short kernel sheets
eppaneamd Nov 5, 2025
a760786
add configs
eppaneamd Nov 5, 2025
6b48c43
quickfix
eppaneamd Nov 5, 2025
0d512e3
defaultdict
eppaneamd Nov 5, 2025
9c78787
check if unlinked or short cpu ops are found
eppaneamd Nov 5, 2025
b8ae50c
short cpu op counts index true
eppaneamd Nov 5, 2025
fad83c2
refactor
eppaneamd Nov 5, 2025
b9e84f5
update kernel categories
eppaneamd Nov 6, 2025
b821188
cuda events summary pivot table
eppaneamd Nov 6, 2025
dcf2014
collect parent child hieracrhy in perf metrics per group
eppaneamd Nov 6, 2025
e045e7c
quickfixes
eppaneamd Nov 6, 2025
279ffe8
pass rank to group perf metrics
eppaneamd Nov 6, 2025
b2a24e9
standalone node replay process traces in parallel
eppaneamd Nov 6, 2025
d181de3
add node replay deps check
eppaneamd Nov 6, 2025
6752e4c
black format
eppaneamd Nov 6, 2025
28b4c26
isort
eppaneamd Nov 6, 2025
5daad2f
black format
eppaneamd Nov 6, 2025
0d7baa5
black format
eppaneamd Nov 6, 2025
1a30937
quickfix
eppaneamd Nov 6, 2025
f0bff09
rm torch dep from module level
eppaneamd Nov 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 39 additions & 17 deletions TraceLens/NcclAnalyser/nccl_analyser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import gzip
import logging

from concurrent.futures import ProcessPoolExecutor
from ..util import DataLoader


Expand All @@ -21,6 +22,26 @@ def list_to_tuple(obj):
return obj


def nccl_filter_event_fn(event):
    """Return True only for NCCL kernel events that are linked to a CPU op.

    An event qualifies when its category is "kernel", its name contains
    "nccl" (case-insensitive), and it carries an "External id" in args
    (i.e. it can be correlated with a launching CPU operation).
    """
    if event.get("cat") != "kernel":
        return False
    if "nccl" not in event.get("name", "").lower():
        return False
    # Linked means the profiler recorded an External id for correlation.
    return event.get("args", {}).get("External id") is not None


def load_single_trace_fn(args):
    """Worker function to load a single trace file.

    Args:
        args: ``(rank, filepath)`` tuple; packed so the function can be
            mapped over by a process pool with a single argument.

    Returns:
        ``(rank, events_by_index)`` where ``events_by_index`` maps a
        0-based index to each linked NCCL kernel event found in the trace.
    """
    rank, filepath = args
    trace = DataLoader.load_data(filepath)

    # Keep only linked NCCL kernel events, indexed in encounter order.
    kept = filter(nccl_filter_event_fn, trace["traceEvents"])
    return rank, dict(enumerate(kept))


class NcclAnalyser:
def __init__(self, list_profile_filepaths, world_size):
self.logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -74,20 +95,12 @@ def __init__(self, list_profile_filepaths, world_size):
}
self.implicit_sync_cat = {"allreduce", "reducescatter", "allgather", "alltoall"}
# Filter function: keep only kernel events with "nccl" in the name
self.filter_event_fn = self._nccl_filter_event_fn
self.filter_event_fn = nccl_filter_event_fn

# Internal storage
self.rank2trace_data = {} # Stores per-rank data
self.load_trace_data()

def _nccl_filter_event_fn(self, event):
"""Filters NCCL kernel events."""
is_nccl_kernel = (
event.get("cat") == "kernel" and "nccl" in event.get("name", "").lower()
)
is_linked = event.get("args", {}).get("External id") is not None
return is_nccl_kernel and is_linked

def load_trace_data(self):
"""Loads NCCL JSON trace data and extracts relevant events."""
self.logger.warning(
Expand All @@ -97,16 +110,25 @@ def load_trace_data(self):
"Also note that we need all ranks for the analysis. We will add a fallback soon for lesser features for single rank or partial data."
)
self.rank2trace_data.clear()
for rank, filepath in enumerate(self.list_profile_filepaths):
self.logger.info(f"Loading rank {rank} from {filepath}")
raw_data = DataLoader.load_data(filepath)
# Prepare arguments for parallel processing
process_args = [
(rank, filepath)
for rank, filepath in enumerate(self.list_profile_filepaths)
]

nccl_events = [
e for e in raw_data["traceEvents"] if self._nccl_filter_event_fn(e)
]
# Determine number of workers (limit to avoid overwhelming system)
max_workers = min(len(self.list_profile_filepaths), 8)

# Load traces in parallel
self.logger.info(
f"Loading {len(self.list_profile_filepaths)} traces in parallel with {max_workers} workers"
)
with ProcessPoolExecutor(max_workers=max_workers) as executor:
results = list(executor.map(load_single_trace_fn, process_args))

# Build a dictionary with event data
rank_dict = {idx: evt for idx, evt in enumerate(nccl_events)}
# Store results
for rank, rank_dict in results:
self.logger.info(f"Loaded rank {rank} with {len(rank_dict)} NCCL events")
self.rank2trace_data[rank] = rank_dict

# ------------------------------------------------------------------------
Expand Down
19 changes: 19 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Examples

Collection of example workflows when using TraceLens:

- [compare_kernel_launchers.ipynb](./compare_kernel_launchers.ipynb)
- [event_replayer_example.ipynb](./event_replayer_example.ipynb)
- [gemm_dim_eff.ipynb](./gemm_dim_eff.ipynb)
- [generate_perf_report.py](./generate_perf_report.py)
- [nccl_analyser_example.ipynb](./nccl_analyser_example.ipynb)
- [nn_module_view.ipynb](./nn_module_view.ipynb)
- [roofline_plots_example.ipynb](./roofline_plots_example.ipynb)
- [trace_fusion_example.py](./trace_fusion_example.py)
- [tree_perf_example.ipynb](./tree_perf_example.ipynb)

Collection of custom workflows when using TraceLens:

- [fusion_opportunity.ipynb](./custom_workflows/fusion_opportunity.ipynb)
- [generate_perf_report.py](./custom_workflows/generate_perf_report.py)
- [node_replay.py](./custom_workflows/node_replay.py)
99 changes: 99 additions & 0 deletions examples/custom_workflows/generate_perf_report.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Enhanced Performance Report Generator

The [generate_perf_report.py](./generate_perf_report.py) script produces a performance report in Excel format from traces produced by the torch profiler. It works for both single- and multi-GPU traces.

Traces are located recursively and by default, the outputs are saved in the same directory where the traces are located.

## Arguments

```
Flag Type Required Default Description
-b str True Path to base directory which contains profiling experiments as subdirectories
-p str False "rank_" Pattern to use for finding the rank of a trace from filename. Supports <string><sep> where separator can be empty, - or _
-e str False "json" Extension to use for identifying trace files. json and gz are supported
-f list<str> False ["rank_0"] Select files containing given substring(s) in their full filepaths
-r bool False False Run node replay for GEMMs and CONVs that contribute 99pct to group-specific execution time
-d bool False False Dry run for checking if correct trace paths are found
-a bool False False Save all individual kernels from all ranks (sheets kernels_0 ... kernels_n). Produces a lot of data
-o str False None Filepath to save the Excel performance report. Note that this works only with a single base/parent directory containing one set of traces
```

## How to use

The following directory tree is an example from an 8-GPU distributed inference setup where traces have been produced for 4 different configurations.

```
profiles
013_profile_544p_bs1
traces_rank_0_step_8.json
traces_rank_1_step_8.json
...
traces_rank_7_step_8.json
013_profile_720p_bs1
traces_rank_0_step_8.json
traces_rank_1_step_8.json
...
traces_rank_7_step_8.json
014_profile_544p_bs1
traces_rank_0_step_8.json
traces_rank_1_step_8.json
...
traces_rank_7_step_8.json
014_profile_720p_bs1
traces_rank_0_step_8.json
traces_rank_1_step_8.json
...
traces_rank_7_step_8.json
```

Generate performance reports for all configurations in one go using traces from all ranks:

```
python <path to generate_perf_report.py> -b profiles -f step_8
```

Then the following files will be generated:

```
profiles/013_profile_544p_bs1/013_profile_544p_bs1_step_8_performance_report.xlsx
profiles/013_profile_720p_bs1/013_profile_720p_bs1_step_8_performance_report.xlsx
profiles/014_profile_544p_bs1/014_profile_544p_bs1_step_8_performance_report.xlsx
profiles/014_profile_720p_bs1/014_profile_720p_bs1_step_8_performance_report.xlsx
```

Generate performance report for one configuration using traces from all ranks:

```
# User-defined excel paths can be passed via -o flag

# Narrow down search using base path
python <path to generate_perf_report.py> -b profiles/013_profile_544p_bs1 -f step_8
```

Generate performance reports for single ranks:

```
python <path to generate_perf_report.py> -b profiles -f rank_0 step_8
```

Then the following files will be generated:

```
profiles/013_profile_544p_bs1/013_profile_544p_bs1_rank_0_step_8_performance_report.xlsx
profiles/013_profile_720p_bs1/013_profile_720p_bs1_rank_0_step_8_performance_report.xlsx
profiles/014_profile_544p_bs1/014_profile_544p_bs1_rank_0_step_8_performance_report.xlsx
profiles/014_profile_720p_bs1/014_profile_720p_bs1_rank_0_step_8_performance_report.xlsx
```

## Additional use cases


### Node replay

The script also supports the node replay feature for GEMMs and CONVs via `-r` flag, in which case:

- Node replay is used for running high-level microbenchmarking with torch for GEMMs and CONVs that contribute 99pct to the group-specific execution time
- hipblaslt-bench is used for running low-level microbenchmarking for the GEMMs
- MIOpenDriver is used for running low-level microbenchmarking for the CONVs
- The high- and low-level microbenchmarking results are collected and compared against metrics calculated from the actual workload
- The identified and benchmarked GEMMs and CONVs are saved to their individual files for later possible use, e.g. tuning
Loading
Loading