Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
0c3b545
snapshot, first version gen perf report + node replay microbench + re…
eppaneamd May 2, 2025
1e28e47
org imports + formatting + rm redundant code
May 3, 2025
8091e18
add naive grouping of triton kernels
lauri9 May 21, 2025
040b5fe
add option to save to user specified xlsx
lauri9 May 21, 2025
ce735d1
add upsample and norm ops
lauri9 May 21, 2025
eaedb6a
fix import
lauri9 May 21, 2025
9c34288
ensure single base dir when custom xlsx path used
eppaneamd May 22, 2025
954572f
node replay, ensure single base dir when custom xlsx path is used + l…
eppaneamd May 22, 2025
31dc620
first readme for generate_perf_report script
May 28, 2025
ab8b59e
perf report readme minor fix
May 28, 2025
46290f4
sort traces by rank
eppaneamd May 28, 2025
129181e
rename var
eppaneamd May 28, 2025
bb73715
fix error on ambiguous truth value of dataframe
lauri9 Jun 3, 2025
730ce95
warn about kernel launchers in group other
lauri9 Jun 12, 2025
4027b93
disable strict metadata checking
lauri9 Jun 19, 2025
a9313c1
add missing upsample ops, aiter FA launchers
lauri9 Jul 29, 2025
e1c2f57
fix: disable strict metadata checking also here
lauri9 Jul 31, 2025
38ec716
add efficient_attention sdpa
lauri9 Jul 31, 2025
47a10dc
add flash_attn_3::fwd to grouping
lauri9 Aug 19, 2025
7c32c56
fix: install packages required for report generation
lauri9 Aug 27, 2025
382622d
add missing aiter attn launcher for grouping
lauri9 Sep 9, 2025
57057b0
Revert "fix: install packages required for report generation"
lauri9 Sep 10, 2025
26ef173
drop psutil dependency
lauri9 Sep 10, 2025
f0bba7a
add openpyxl dependency for report generation
lauri9 Sep 10, 2025
357f05d
fix to adapt to tracelens update
lauri9 Sep 16, 2025
e2ac22e
rm FlashAttnFunc perf model + xFuserRingFlashAttnFunc kernel launcher
eppaneamd Sep 24, 2025
498a5e2
add aiter::wrapper_fmha_v3_fwd to attn sheet
lauri9 Sep 25, 2025
e7daf9e
update node replay description (WIP)
eppaneamd Oct 31, 2025
1d4d8e7
Merge branch 'main' into feat/gen_report_node_replay_bench
eppaneamd Nov 5, 2025
6a016e8
process traces in parallel + rm save all kernels + improve logging wh…
eppaneamd Nov 5, 2025
f82df74
update configs + improve failure log on group perf metrics
eppaneamd Nov 5, 2025
19b3119
fix imports
eppaneamd Nov 5, 2025
2e25749
nccl analyzer load traces in parallel
eppaneamd Nov 5, 2025
aa23222
avoid passing instance methods to worker processes
eppaneamd Nov 5, 2025
65c27ce
quickfix
eppaneamd Nov 5, 2025
3f92cf9
add (un)linked short kernel sheets
eppaneamd Nov 5, 2025
a760786
add configs
eppaneamd Nov 5, 2025
6b48c43
quickfix
eppaneamd Nov 5, 2025
0d512e3
defaultdict
eppaneamd Nov 5, 2025
9c78787
check if unlinked or short cpu ops are found
eppaneamd Nov 5, 2025
b8ae50c
short cpu op counts index true
eppaneamd Nov 5, 2025
fad83c2
refactor
eppaneamd Nov 5, 2025
b9e84f5
update kernel categories
eppaneamd Nov 6, 2025
b821188
cuda events summary pivot table
eppaneamd Nov 6, 2025
dcf2014
collect parent child hieracrhy in perf metrics per group
eppaneamd Nov 6, 2025
e045e7c
quickfixes
eppaneamd Nov 6, 2025
279ffe8
pass rank to group perf metrics
eppaneamd Nov 6, 2025
b2a24e9
standalone node replay process traces in parallel
eppaneamd Nov 6, 2025
d181de3
add node replay deps check
eppaneamd Nov 6, 2025
6752e4c
black format
eppaneamd Nov 6, 2025
28b4c26
isort
eppaneamd Nov 6, 2025
5daad2f
black format
eppaneamd Nov 6, 2025
0d7baa5
black format
eppaneamd Nov 6, 2025
1a30937
quickfix
eppaneamd Nov 6, 2025
f0bff09
rm torch dep from module level
eppaneamd Nov 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 39 additions & 17 deletions TraceLens/NcclAnalyser/nccl_analyser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import gzip
import logging

from concurrent.futures import ProcessPoolExecutor
from ..util import DataLoader


Expand All @@ -21,6 +22,26 @@ def list_to_tuple(obj):
return obj


def nccl_filter_event_fn(event):
    """Return True only for NCCL kernel events that are linked to a CPU op.

    An event qualifies when its category is "kernel", its name contains
    "nccl" (case-insensitive), and it carries an "External id" in args
    (i.e. it can be correlated with a launching CPU operation).
    """
    if event.get("cat") != "kernel":
        return False
    if "nccl" not in event.get("name", "").lower():
        return False
    # Linked means the profiler recorded an External id for correlation.
    return event.get("args", {}).get("External id") is not None


def load_single_trace_fn(args):
    """Worker function to load a single trace file.

    Args:
        args: ``(rank, filepath)`` tuple; packed so the function can be
            mapped over by a process pool with a single argument.

    Returns:
        ``(rank, events_by_index)`` where ``events_by_index`` maps a
        0-based index to each linked NCCL kernel event found in the trace.
    """
    rank, filepath = args
    trace = DataLoader.load_data(filepath)

    # Keep only linked NCCL kernel events, indexed in encounter order.
    kept = filter(nccl_filter_event_fn, trace["traceEvents"])
    return rank, dict(enumerate(kept))


class NcclAnalyser:
def __init__(self, list_profile_filepaths, world_size):
self.logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -74,20 +95,12 @@ def __init__(self, list_profile_filepaths, world_size):
}
self.implicit_sync_cat = {"allreduce", "reducescatter", "allgather", "alltoall"}
# Filter function: keep only kernel events with "nccl" in the name
self.filter_event_fn = self._nccl_filter_event_fn
self.filter_event_fn = nccl_filter_event_fn

# Internal storage
self.rank2trace_data = {} # Stores per-rank data
self.load_trace_data()

def _nccl_filter_event_fn(self, event):
"""Filters NCCL kernel events."""
is_nccl_kernel = (
event.get("cat") == "kernel" and "nccl" in event.get("name", "").lower()
)
is_linked = event.get("args", {}).get("External id") is not None
return is_nccl_kernel and is_linked

def load_trace_data(self):
"""Loads NCCL JSON trace data and extracts relevant events."""
self.logger.warning(
Expand All @@ -97,16 +110,25 @@ def load_trace_data(self):
"Also note that we need all ranks for the analysis. We will add a fallback soon for lesser features for single rank or partial data."
)
self.rank2trace_data.clear()
for rank, filepath in enumerate(self.list_profile_filepaths):
self.logger.info(f"Loading rank {rank} from {filepath}")
raw_data = DataLoader.load_data(filepath)
# Prepare arguments for parallel processing
process_args = [
(rank, filepath)
for rank, filepath in enumerate(self.list_profile_filepaths)
]

nccl_events = [
e for e in raw_data["traceEvents"] if self._nccl_filter_event_fn(e)
]
# Determine number of workers (limit to avoid overwhelming system)
max_workers = min(len(self.list_profile_filepaths), 8)

# Load traces in parallel
self.logger.info(
f"Loading {len(self.list_profile_filepaths)} traces in parallel with {max_workers} workers"
)
with ProcessPoolExecutor(max_workers=max_workers) as executor:
results = list(executor.map(load_single_trace_fn, process_args))

# Build a dictionary with event data
rank_dict = {idx: evt for idx, evt in enumerate(nccl_events)}
# Store results
for rank, rank_dict in results:
self.logger.info(f"Loaded rank {rank} with {len(rank_dict)} NCCL events")
self.rank2trace_data[rank] = rank_dict

# ------------------------------------------------------------------------
Expand Down
19 changes: 19 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Examples

Collection of example workflows when using TraceLens:

- [compare_kernel_launchers.ipynb](./compare_kernel_launchers.ipynb)
- [event_replayer_example.ipynb](./event_replayer_example.ipynb)
- [gemm_dim_eff.ipynb](./gemm_dim_eff.ipynb)
- [generate_perf_report.py](./generate_perf_report.py)
- [nccl_analyser_example.ipynb](./nccl_analyser_example.ipynb)
- [nn_module_view.ipynb](./nn_module_view.ipynb)
- [roofline_plots_example.ipynb](./roofline_plots_example.ipynb)
- [trace_fusion_example.py](./trace_fusion_example.py)
- [tree_perf_example.ipynb](./tree_perf_example.ipynb)

Collection of custom workflows when using TraceLens:

- [fusion_opportunity.ipynb](./custom_workflows/fusion_opportunity.ipynb)
- [generate_perf_report.py](./custom_workflows/generate_perf_report.py)
- [node_replay.py](./custom_workflows/node_replay.py)
99 changes: 99 additions & 0 deletions examples/custom_workflows/generate_perf_report.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Enhanced Performance Report Generator

The [generate_perf_report.py](./generate_perf_report.py) script produces a performance report in Excel format from traces produced by the torch profiler. It works for both single- and multi-GPU traces.

Traces are located recursively and by default, the outputs are saved in the same directory where the traces are located.

## Arguments

```
Flag Type Required Default Description
-b str True Path to base directory which contains profiling experiments as subdirectories
-p str False "rank_" Pattern to use for finding the rank of a trace from filename. Supports <string><sep> where separator can be empty, - or _
-e str False "json" Extension to use for identifying trace files. json and gz are supported
-f list<str> False ["rank_0"] Select files containing given substring(s) in their full filepaths
-r bool False False Run node replay for GEMMs and CONVs that contribute 99pct to group-specific execution time
-d bool False False Dry run for checking if correct trace paths are found
-a bool False False Save all individual kernels from all ranks (sheets kernels_0 ... kernels_n). Produces a lot of data
-o str False None Filepath to save the Excel performance report. Note that this works only with a single base/parent directory containing one set of traces
```

## How to use

The following directory tree is an example from an 8-GPU distributed inference setup where traces have been produced for 4 different configurations.

```
profiles
013_profile_544p_bs1
traces_rank_0_step_8.json
traces_rank_1_step_8.json
...
traces_rank_7_step_8.json
013_profile_720p_bs1
traces_rank_0_step_8.json
traces_rank_1_step_8.json
...
traces_rank_7_step_8.json
014_profile_544p_bs1
traces_rank_0_step_8.json
traces_rank_1_step_8.json
...
traces_rank_7_step_8.json
014_profile_720p_bs1
traces_rank_0_step_8.json
traces_rank_1_step_8.json
...
traces_rank_7_step_8.json
```

Generate performance reports for all configurations in one go using traces from all ranks:

```
python <path to generate_perf_report.py> -b profiles -f step_8
```

Then the following files will be generated:

```
profiles/013_profile_544p_bs1/013_profile_544p_bs1_step_8_performance_report.xlsx
profiles/013_profile_720p_bs1/013_profile_720p_bs1_step_8_performance_report.xlsx
profiles/014_profile_544p_bs1/014_profile_544p_bs1_step_8_performance_report.xlsx
profiles/014_profile_720p_bs1/014_profile_720p_bs1_step_8_performance_report.xlsx
```

Generate performance report for one configuration using traces from all ranks:

```
# User-defined excel paths can be passed via -o flag

# Narrow down search using base path
python <path to generate_perf_report.py> -b profiles/013_profile_544p_bs1 -f step_8
```

Generate performance reports for single ranks:

```
python <path to generate_perf_report.py> -b profiles -f rank_0 step_8
```

Then the following files will be generated:

```
profiles/013_profile_544p_bs1/013_profile_544p_bs1_rank_0_step_8_performance_report.xlsx
profiles/013_profile_720p_bs1/013_profile_720p_bs1_rank_0_step_8_performance_report.xlsx
profiles/014_profile_544p_bs1/014_profile_544p_bs1_rank_0_step_8_performance_report.xlsx
profiles/014_profile_720p_bs1/014_profile_720p_bs1_rank_0_step_8_performance_report.xlsx
```

## Additional use cases


### Node replay

The script also supports the node replay feature for GEMMs and CONVs via `-r` flag, in which case:

- Node replay is used for running high-level microbenchmarking with torch for GEMMs and CONVs that contribute 99pct to the group-specific execution time
- hipblaslt-bench is used for running low-level microbenchmarking for the GEMMs
- MIOpenDriver is used for running low-level microbenchmarking for the CONVs
- The high- and low-level microbenchmarking results are collected and compared against metrics calculated from the actual workload
- The identified and benchmarked GEMMs and CONVs are saved to their individual files for later possible use, e.g. tuning
Loading
Loading