import argparse from collections import namedtuple import json import pandas as pd try: import hatchet as ht from hatchet.query import NegationQuery except ImportError: raise ImportError("Failed to import hatchet. `pip install llnl-hatchet` to get the correct version.") import numpy as np from triton.profiler.hooks.launch import COMPUTE_METADATA_SCOPE_NAME, LaunchHook from triton.profiler import specs def match_available_metrics(metrics, inclusive_metrics, exclusive_metrics): ret = [] if not isinstance(metrics, list): metrics = [metrics] if metrics: for metric in metrics: metric = metric.lower() for raw_metric in inclusive_metrics + exclusive_metrics: suffix = " (inc)" if raw_metric in inclusive_metrics else "" raw_metric_no_unit = raw_metric.split("(")[0].strip().lower() if metric in (raw_metric, raw_metric_no_unit): ret.append(raw_metric + suffix) break if len(ret) == 0: raise RuntimeError(f"Metric {metric} is not found. Use the --list flag to list available metrics") return ret def remove_frames(database: json): # We first fine frames that match either one of the two conditions: # 1. The frame name is COMPUTE_METADATA_SCOPE_NAME # 2. The frame has no metrics and no children # Then we go up from the located nodes and remove the parents if all children were # metadata nodes def remove_frame_helper(node): if "frame" not in node: return node if node["frame"]["name"] == COMPUTE_METADATA_SCOPE_NAME: return None if len(node["metrics"]) == 0 and len(node["children"]) == 0: return None children = node.get("children", []) new_children = [] for child in children: new_child = remove_frame_helper(child) if new_child is not None: new_children.append(new_child) if len(new_children) > 0 or len(children) == 0: node["children"] = new_children return node return None new_database = [] for node in database: new_node = remove_frame_helper(node) if new_node is not None: new_database.append(new_node) return new_database def get_raw_metrics(file): database = json.load(file) database = remove_frames(database) device_info = database.pop(1) gf = ht.GraphFrame.from_literal(database) inclusive_metrics = gf.show_metric_columns() exclusive_metrics = [metric for metric in gf.dataframe.columns if metric not in inclusive_metrics] return gf, inclusive_metrics, exclusive_metrics, device_info def get_min_time_flops(df, device_info): min_time_flops = pd.DataFrame(0.0, index=df.index, columns=["min_time"]) for device_type in device_info: for device_index in device_info[device_type]: arch = device_info[device_type][device_index]["arch"] num_sms = device_info[device_type][device_index]["num_sms"] clock_rate = device_info[device_type][device_index]["clock_rate"] for width in LaunchHook.flops_width: idx = df["device_id"] == device_index device_frames = df[idx] if f"flops{width}" not in device_frames.columns: continue max_flops = specs.max_flops(device_type, arch, width, num_sms, clock_rate) min_time_flops.loc[idx, "min_time"] += device_frames[f"flops{width}"].fillna(0) / max_flops return min_time_flops def get_min_time_bytes(df, device_info): min_time_bytes = pd.DataFrame(0.0, index=df.index, columns=["min_time"]) for device_type in device_info: for device_index in device_info[device_type]: idx = df["device_id"] == device_index device_frames = df[idx] device = device_info[device_type][device_index] memory_clock_rate = device["memory_clock_rate"] # in khz bus_width = device["bus_width"] # in bits peak_bandwidth = specs.max_bps(device_type, device['arch'], bus_width, memory_clock_rate) min_time_bytes.loc[idx, "min_time"] += device_frames["bytes"] / peak_bandwidth return min_time_bytes FactorDict = namedtuple("FactorDict", ["name", "factor"]) time_factor_dict = FactorDict("time", {"time/s": 1, "time/ms": 1e-3, "time/us": 1e-6, "time/ns": 1e-9}) avg_time_factor_dict = FactorDict("avg_time", {f"avg_{key}": value for key, value in time_factor_dict.factor.items()}) cpu_time_factor_dict = FactorDict("cpu_time", {"cpu_time/s": 1, "cpu_time/ms": 1e-3, "cpu_time/us": 1e-6, "cpu_time/ns": 1e-9}) avg_cpu_time_factor_dict = FactorDict("avg_cpu_time", {f"avg_{key}": value for key, value in cpu_time_factor_dict.factor.items()}) bytes_factor_dict = FactorDict("bytes", {"byte/s": 1, "gbyte/s": 1e9, "tbyte/s": 1e12}) derivable_metrics = { **{key: bytes_factor_dict for key in bytes_factor_dict.factor.keys()}, } # FLOPS have a specific width to their metric default_flop_factor_dict = {"flop/s": 1, "gflop/s": 1e9, "tflop/s": 1e12} derivable_metrics.update( {key: FactorDict("flops", default_flop_factor_dict) for key in default_flop_factor_dict.keys()}) for width in LaunchHook.flops_width: factor_name = f"flops{width}" factor_dict = {f"flop{width}/s": 1, f"gflop{width}/s": 1e9, f"tflop{width}/s": 1e12} derivable_metrics.update({key: FactorDict(factor_name, factor_dict) for key in factor_dict.keys()}) def derive_metrics(gf, metrics, inclusive_metrics, exclusive_metrics, device_info): derived_metrics = [] def get_time_seconds(df, metric, factor_dict): time_metric_name = match_available_metrics(metric, inclusive_metrics, exclusive_metrics)[0] time_unit = factor_dict.name + "/" + time_metric_name.split("(")[1].split(")")[0] return df[time_metric_name] * factor_dict.factor[time_unit] for metric in metrics: if metric == "util": # exclusive min_time_bytes = get_min_time_bytes(gf.dataframe, device_info) min_time_flops = get_min_time_flops(gf.dataframe, device_info) time_sec = get_time_seconds(gf.dataframe, "time", time_factor_dict) internal_frame_indices = gf.dataframe["device_id"].isna() gf.dataframe["util"] = min_time_flops["min_time"].combine(min_time_bytes["min_time"], max) / time_sec gf.dataframe.loc[internal_frame_indices, "util"] = np.nan derived_metrics.append("util") elif metric in derivable_metrics: # flop/s, byte/s, inclusive derivable_metric = derivable_metrics[metric] metric_name = derivable_metric.name metric_factor_dict = derivable_metric.factor matched_metric_name = match_available_metrics(metric_name, inclusive_metrics, exclusive_metrics)[0] gf.dataframe[f"{metric} (inc)"] = (gf.dataframe[matched_metric_name] / (get_time_seconds(gf.dataframe, "time", time_factor_dict)) / metric_factor_dict[metric]) derived_metrics.append(f"{metric} (inc)") elif (metric in time_factor_dict.factor or metric in cpu_time_factor_dict.factor or metric in avg_time_factor_dict.factor or metric in avg_cpu_time_factor_dict.factor): # inclusive is_cpu = metric in cpu_time_factor_dict.factor or metric in avg_cpu_time_factor_dict.factor is_avg = metric in avg_time_factor_dict.factor or metric in avg_cpu_time_factor_dict.factor factor_dict = ((avg_cpu_time_factor_dict if is_avg else cpu_time_factor_dict) if is_cpu else (avg_time_factor_dict if is_avg else time_factor_dict)) metric_name = "cpu_time" if is_cpu else "time" metric_time_unit = factor_dict.name + "/" + metric.split("/")[1] time_value = get_time_seconds(gf.dataframe, metric_name, factor_dict) if is_avg: time_value = time_value / gf.dataframe["count (inc)"] gf.dataframe[f"{metric} (inc)"] = time_value / factor_dict.factor[metric_time_unit] derived_metrics.append(f"{metric} (inc)") else: metric_name_and_unit = metric.split("/") metric_name = metric_name_and_unit[0] if len(metric_name_and_unit) > 1: # percentage, exclusive or inclusive metric_unit = metric_name_and_unit[1] if metric_unit != "%": raise ValueError(f"Unsupported unit {metric_unit}") matched_metric_name = match_available_metrics(metric_name, inclusive_metrics, exclusive_metrics)[0] single_frame = gf.dataframe[matched_metric_name] suffix = "" if "(inc)" in matched_metric_name: suffix = " (inc)" total = gf.dataframe[matched_metric_name].iloc[0] else: total = gf.dataframe[matched_metric_name].sum() gf.dataframe[metric + suffix] = (single_frame / total) * 100.0 derived_metrics.append(metric + suffix) else: matched_metric_name = match_available_metrics(metric_name, inclusive_metrics, exclusive_metrics)[0] derived_metrics.append(matched_metric_name) # Update derived metrics to the graph frame for derived_metric in derived_metrics: if derived_metric.endswith("(inc)"): gf.inc_metrics.append(derived_metric) else: gf.exc_metrics.append(derived_metric) return derived_metrics def format_frames(gf, format): if format == "file_function_line": gf.dataframe["name"] = gf.dataframe["name"].apply(lambda x: x.split("/")[-1]) elif format == "function_line": gf.dataframe["name"] = gf.dataframe["name"].apply(lambda x: x.split(":")[-1]) elif format == "file_function": gf.dataframe["name"] = gf.dataframe["name"].apply( lambda x: f"{x.split('/')[-1].split(':')[0]}@{x.split('@')[-1].split(':')[0]}") return gf def filter_frames(gf, include=None, exclude=None, threshold=None, metric=None): if include: query = f""" MATCH ("*")->(".", p)->("*") WHERE p."name" =~ "{include}" """ gf = gf.filter(query, squash=True) if exclude: inclusion_query = f""" MATCH (".", p)->("*") WHERE p."name" =~ "{exclude}" """ query = NegationQuery(inclusion_query) gf = gf.filter(query, squash=True) if threshold: query = ["*", {metric: f">= {threshold}"}] gf = gf.filter(query, squash=True) return gf def emit_warnings(gf, metrics): if "bytes (inc)" in metrics: byte_values = gf.dataframe["bytes (inc)"].values min_byte_value = np.nanmin(byte_values) if min_byte_value < 0: print("Warning: Negative byte values detected, this is usually the result of a datatype overflow\n") def print_tree(gf, metrics, depth=100, format=None, print_sorted=False): gf = format_frames(gf, format) print(gf.tree(metric_column=metrics, expand_name=True, depth=depth, render_header=False)) if print_sorted: print("Sorted kernels by metric " + metrics[0]) sorted_df = gf.dataframe.sort_values(by=[metrics[0]], ascending=False) for row in range(1, len(sorted_df)): kernel_name = (sorted_df.iloc[row]["name"][:100] + "..." if len(sorted_df.iloc[row]["name"]) > 100 else sorted_df.iloc[row]["name"]) print("{:105} {:.4}".format(kernel_name, sorted_df.iloc[row][metrics[0]])) emit_warnings(gf, metrics) def read(filename): with open(filename, "r") as f: gf, inclusive_metrics, exclusive_metrics, device_info = get_raw_metrics(f) assert len(inclusive_metrics + exclusive_metrics) > 0, "No metrics found in the input file" gf.update_inclusive_columns() return gf, inclusive_metrics, exclusive_metrics, device_info def parse(metrics, filename, include=None, exclude=None, threshold=None): gf, inclusive_metrics, exclusive_metrics, device_info = read(filename) metrics = derive_metrics(gf, metrics, inclusive_metrics, exclusive_metrics, device_info) # TODO: generalize to support multiple metrics, not just the first one gf = filter_frames(gf, include, exclude, threshold, metrics[0]) return gf, metrics def show_metrics(file_name): with open(file_name, "r") as f: _, inclusive_metrics, exclusive_metrics, _ = get_raw_metrics(f) print("Available inclusive metrics:") if inclusive_metrics: for raw_metric in inclusive_metrics: raw_metric_no_unit = raw_metric.split("(")[0].strip().lower() print(f"- {raw_metric_no_unit}") print("Available exclusive metrics:") if exclusive_metrics: for raw_metric in exclusive_metrics: raw_metric_no_unit = raw_metric.split("(")[0].strip().lower() print(f"- {raw_metric_no_unit}") def main(): argparser = argparse.ArgumentParser( description="Performance data viewer for proton profiles.", formatter_class=argparse.RawTextHelpFormatter, ) argparser.add_argument( "-l", "--list", action="store_true", help="""List available metrics. Metric names are case insensitive and ignore units. Derived metrics can be created when source metrics are available. - time/s, time/ms, time/us, time/ns: time - avg_time/s, avg_time/ms, avg_time/us, avg_time/ns: time / count - flop[<8/16/32/64>]/s, gflop[<8/16/32/64>]/s, tflop[<8/16/32/64>]/s: flops / time - byte/s, gbyte/s, tbyte/s: bytes / time - util: max(sum(flops) / peak_flops_time, sum(bytes) / peak_bandwidth_time) - /%%: frame(metric) / sum(metric). Only available for inclusive metrics (e.g. time) """, ) argparser.add_argument( "-m", "--metrics", type=str, default=None, help="""At maximum two metrics can be specified, separated by comma. There are two modes: 1) Choose the output metric to display. It's case insensitive and ignore units. 2) Derive a new metric from existing metrics. """, ) argparser.add_argument( "-i", "--include", type=str, default=None, help= """Find frames that match the given regular expression and return all nodes in the paths that pass through the matching frames. For example, the following command will display all paths that contain frames that contains "test": ``` proton-viewer -i ".*test.*" path/to/file.json ``` """, ) argparser.add_argument( "-e", "--exclude", type=str, default=None, help="""Exclude frames that match the given regular expression and their children. For example, the following command will exclude all paths starting from frames that contains "test": ``` proton-viewer -e ".*test.*" path/to/file.json ``` """, ) argparser.add_argument( "-t", "--threshold", type=float, default=None, help= "Exclude frames(kernels) whose metrics are below the given threshold. This filter only applies on the first metric.", ) argparser.add_argument( "-d", "--depth", type=int, default=100, help="The depth of the tree to display", ) argparser.add_argument( "-f", "--format", type=str, choices=["full", "file_function_line", "function_line", "file_function"], default="full", help="""Formatting the frame name. - full: include the path, file name, function name and line number. - file_function_line: include the file name, function name and line number. - function_line: include the function name and line number. - file_function: include the file name and function name. """, ) argparser.add_argument( "--print-sorted", action="store_true", default=False, help="Sort output by metric value instead of chronologically", ) argparser.add_argument( "--diff-profile", "-diff", type=str, default=None, help="Compare two profiles. When used as 'proton-viewer -m time -diff file1.log file2.log', " "computes the difference: file2['time'] - file1['time']", ) args, target_args = argparser.parse_known_args() assert len(target_args) == 1, "Must specify a file to read" file_name = target_args[0] metrics = args.metrics.split(",") if args.metrics else None include = args.include exclude = args.exclude threshold = args.threshold depth = args.depth format = args.format diff = args.diff_profile print_sorted = args.print_sorted if include and exclude: raise ValueError("Cannot specify both include and exclude") if args.list: show_metrics(file_name) elif metrics: gf, derived_metrics = parse(metrics, file_name, include, exclude, threshold) if diff: gf2, _ = parse(metrics, diff, include, exclude, threshold) gf = gf.sub(gf2) print_tree(gf, derived_metrics, depth, format, print_sorted) if __name__ == "__main__": main()