From 4392eea1522ee6d3d9714e973282f14aad7103e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Mon, 18 Nov 2024 14:06:02 +0100 Subject: [PATCH 1/3] Adds information related to NCCL calls in NVTX payload. --- .vscode/launch.json | 17 +- cfgs/nccl_size.cfg | 125 +++++++++++++++ nsys2prv/NSYSInterface.py | 4 +- nsys2prv/parse_nsys_stats.py | 145 ++++++++++++++---- nsys2prv/scripts/nvtx_pushpop_simple.sql | 52 +++++++ nsys2prv/scripts/nvtx_pushpop_trace.sql | 52 +++++++ .../scripts/nvtx_pushpop_trace_prepare.sql | 130 ++++++++++++++++ nsys2prv/scripts/nvtx_startend_trace.sql | 2 +- nsys2prv/semantics/__init__.py | 4 +- nsys2prv/semantics/nsys_event.py | 5 + nsys2prv/semantics/nvtx_pushpop_semantic.py | 20 +++ .../semantics/nvtx_pushpop_simple_semantic.py | 16 ++ nsys2prv/semantics/nvtx_startend_semantic.py | 6 +- pyproject.toml | 2 +- 14 files changed, 537 insertions(+), 43 deletions(-) create mode 100644 cfgs/nccl_size.cfg create mode 100644 nsys2prv/scripts/nvtx_pushpop_simple.sql create mode 100644 nsys2prv/scripts/nvtx_pushpop_trace.sql create mode 100644 nsys2prv/scripts/nvtx_pushpop_trace_prepare.sql create mode 100644 nsys2prv/semantics/nvtx_pushpop_semantic.py create mode 100644 nsys2prv/semantics/nvtx_pushpop_simple_semantic.py diff --git a/.vscode/launch.json b/.vscode/launch.json index 95c3117..4977ead 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -12,20 +12,15 @@ "console": "integratedTerminal", "args": [ "-t", - "cuda_api_trace,mpi_event_trace,gpu_metrics", + "cuda_api_trace,nvtx_pushpop_trace,nccl", + //"--force-sqlite", "-m", - "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_0.nsys-rep", - "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_1.nsys-rep", - "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_2.nsys-rep", - "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_3.nsys-rep", - "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_4.nsys-rep", - "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_5.nsys-rep", - "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_6.nsys-rep", - "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_7.nsys-rep", - "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_ricardo_metrics_4nodes_more" + "/home/mclasca/Documents/BePPP/heka/profiles/llama-aloebeta-smaller/heka-axolotl-Oct-limitnvtx2-Llama8B0.1-8node_0_-10531139.nsys-rep", + "/home/mclasca/Documents/BePPP/heka/profiles/llama-aloebeta-smaller/heka-axolotl-Oct-limitnvtx2-Llama8B0.1-8node_1_-10531139.nsys-rep", + "/home/mclasca/Documents/BePPP/heka/profiles/llama-aloebeta-smaller/with-nccl-test", ], "env": { - "NSYS_HOME": "/home/mclasca/Apps/nsight-system/2024.5.1" + "NSYS_HOME": "/home/mclasca/Apps/nsight-system/2024.6.1" } } ] diff --git a/cfgs/nccl_size.cfg b/cfgs/nccl_size.cfg new file mode 100644 index 0000000..9b68730 --- /dev/null +++ b/cfgs/nccl_size.cfg @@ -0,0 +1,125 @@ +#ParaverCFG +ConfigFile.Version: 3.4 +ConfigFile.NumWindows: 3 +ConfigFile.BeginDescription + +ConfigFile.EndDescription + +################################################################################ +< NEW DISPLAYING WINDOW Last Collective size > +################################################################################ +window_name Last Collective size +window_type single +window_id 1 +window_position_x 407 +window_position_y 181 +window_width 600 +window_height 195 +window_comm_lines_enabled false +window_flags_enabled false +window_noncolor_mode true +window_color_mode window_in_null_gradient_mode +window_gradient_function gradient_function_linear +window_custom_color_enabled false +window_semantic_scale_min_at_zero false +window_logical_filtered true +window_physical_filtered false +window_comm_fromto true +window_comm_tagsize true +window_comm_typeval true +window_units Microseconds +window_maximum_y 4194304.000000000000 +window_minimum_y 0.000000000000 +window_compute_y_max false +window_level thread +window_scale_relative 1.000000000000 +window_end_time_relative 1.000000000000 +window_object appl { 1, { All } } +window_begin_time_relative 0.000000000000 +window_open false +window_drawmode draw_last +window_drawmode_rows draw_last +window_pixel_size 1 +window_labels_to_draw 1 +window_selected_functions { 14, { {cpu, Active Thd}, {appl, Adding}, {task, Adding}, {thread, Last Evt Val}, {node, Adding}, {system, Adding}, {workload, Adding}, {from_obj, None}, {to_obj, None}, {tag_msg, All}, {size_msg, All}, {bw_msg, All}, {evt_type, =}, {evt_value, All} } } +window_compose_functions { 9, { {compose_cpu, As Is}, {compose_appl, As Is}, {compose_task, As Is}, {compose_thread, As Is}, {compose_node, As Is}, {compose_system, As Is}, {compose_workload, As Is}, {topcompose1, As Is}, {topcompose2, As Is} } } +window_filter_module evt_type 1 9501 +window_filter_module evt_type_label 1 "NCCL Message size [bytes]" + +################################################################################ +< NEW DISPLAYING WINDOW In collective > +################################################################################ +window_name In collective +window_type single +window_id 2 +window_position_x 236 +window_position_y 278 +window_width 600 +window_height 195 +window_comm_lines_enabled false +window_flags_enabled false +window_noncolor_mode true +window_custom_color_enabled false +window_semantic_scale_min_at_zero false +window_logical_filtered true +window_physical_filtered false +window_comm_fromto true +window_comm_tagsize true +window_comm_typeval true +window_units Microseconds +window_maximum_y 1.000000000000 +window_minimum_y 0.000000000000 +window_compute_y_max false +window_level thread +window_scale_relative 1.000000000000 +window_end_time_relative 1.000000000000 +window_object appl { 1, { All } } +window_begin_time_relative 0.000000000000 +window_open false +window_drawmode draw_last +window_drawmode_rows draw_last +window_pixel_size 1 +window_labels_to_draw 1 +window_selected_functions { 14, { {cpu, Active Thd}, {appl, Adding}, {task, Adding}, {thread, Last Evt Val}, {node, Adding}, {system, Adding}, {workload, Adding}, {from_obj, None}, {to_obj, None}, {tag_msg, All}, {size_msg, All}, {bw_msg, All}, {evt_type, =}, {evt_value, All} } } +window_compose_functions { 9, { {compose_cpu, As Is}, {compose_appl, As Is}, {compose_task, As Is}, {compose_thread, Sign}, {compose_node, As Is}, {compose_system, As Is}, {compose_workload, As Is}, {topcompose1, As Is}, {topcompose2, As Is} } } +window_filter_module evt_type 1 63000007 +window_filter_module evt_type_label 1 "NCCL kernel" + +################################################################################ +< NEW DISPLAYING WINDOW NCCL message size > +################################################################################ +window_name NCCL message size +window_type composed +window_id 3 +window_factors 1.000000000000 1.000000000000 +window_shifts 0 0 +window_operation controlled: clear by +window_identifiers 1 2 +window_position_x 413 +window_position_y 379 +window_width 945 +window_height 244 +window_comm_lines_enabled false +window_flags_enabled false +window_noncolor_mode true +window_color_mode window_in_null_gradient_mode +window_gradient_function gradient_function_linear +window_custom_color_enabled false +window_semantic_scale_min_at_zero false +window_units Microseconds +window_maximum_y 32834048.000000000000 +window_minimum_y 262144.000000000000 +window_compute_y_max false +window_level thread +window_scale_relative 0.732426303855 +window_end_time_relative 0.732426303855 +window_object appl { 1, { All } } +window_begin_time_relative 0.480725623583 +window_open true +window_drawmode draw_maximum +window_drawmode_rows draw_maximum +window_pixel_size 1 +window_labels_to_draw 1 +window_selected_functions { 5, { {appl, Adding}, {task, Adding}, {node, Adding}, {system, Adding}, {workload, Adding}, } } +window_compose_functions { 9, { {compose_cpu, As Is}, {compose_appl, As Is}, {compose_task, As Is}, {compose_thread, As Is}, {compose_node, As Is}, {compose_system, As Is}, {compose_workload, As Is}, {topcompose1, As Is}, {topcompose2, As Is} } } + diff --git a/nsys2prv/NSYSInterface.py b/nsys2prv/NSYSInterface.py index 9034cf7..4db3ed5 100644 --- a/nsys2prv/NSYSInterface.py +++ b/nsys2prv/NSYSInterface.py @@ -24,7 +24,7 @@ class NSYSInterface(): def check_export_report(self, rf): if not os.path.exists(f"{os.path.splitext(os.path.basename(rf))[0]}.sqlite") or self.force: #Try exporting first - export_call = self.nsys_binary + ("export", "-t", "sqlite", rf) + export_call = self.nsys_binary + ("export", "-t", "sqlite", "--force-overwrite", "true", "--include-json", "true", rf) try: with subprocess.Popen(export_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p: for line in p.stdout: @@ -42,7 +42,7 @@ class NSYSInterface(): def call_stats(self, report): nsys_call = self.nsys_binary + ("stats", "-r", ",".join(self.types), "--timeunit", "nsec", "-f", "csv", - "--force-overwrite", "true", "-o", ".") + "-o", ".") if self.filter: nsys_call += ("--filter-nvtx="+self.range_nvtx,) diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 88d953f..6f34365 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -8,20 +8,21 @@ import time import subprocess import os import locale +import json from functools import reduce from sqlalchemy import create_engine, text, dialects from sqlalchemy.exc import OperationalError -from .EventWriter import event_writer as ewr -from .NSYSInterface import NSYSInterface -from .semantics.mpi_event_encoding import * -from .semantics import * +from EventWriter import event_writer as ewr +from NSYSInterface import NSYSInterface +from semantics.mpi_event_encoding import * +from semantics import * def main(): locale.setlocale(locale.LC_ALL, '') class ShowVersion(argparse.Action): def __call__(self, parser, namespace, values, option_string): - print("nsys2prv v0.4.0-dev20241107b - prerelease") + print("nsys2prv v0.4.0-dev202411071 - prerelease") print("export SQLite schema version compatibility version 3.11.0") parser.exit() # exits the program with no more arg parsing and checking @@ -32,7 +33,7 @@ def main(): parser.add_argument("-v", "--version", nargs=0, help="Show version and exit.", action=ShowVersion) parser.add_argument("-f", "--filter-nvtx", help="Filter by this NVTX range") - parser.add_argument("-t", "--trace", help="Comma separated names of events to translate: [mpi_event_trace, nvtx_pushpop_trace, nvtx_startend_trace, cuda_api_trace, gpu_metrics, openacc]") + parser.add_argument("-t", "--trace", help="Comma separated names of events to translate: [mpi_event_trace, nvtx_pushpop_trace, nvtx_startend_trace, cuda_api_trace, gpu_metrics, openacc, nccl]") parser.add_argument("-m", "--multi-report", action="store_true", help="Translate multiple reports of the same execution into one trace.") parser.add_argument("--force-sqlite", action="store_true", help="Force Nsight System to export SQLite database") @@ -76,8 +77,11 @@ def main(): t_mpi = False t_metrics = False t_openacc = False + t_nccl = False - if "nvtx_pushpop_trace" in reports: t_nvtx = True + if "nvtx_pushpop_trace" in reports: + t_nvtx = True + reports.remove("nvtx_pushpop_trace") if "cuda_api_trace" in reports: t_apicalls = True if "mpi_event_trace" in reports: t_mpi = True @@ -91,8 +95,12 @@ def main(): if "openacc" in reports: t_openacc = True reports.remove("openacc") + if "nccl" in reports: + t_nccl = True + reports.remove("nccl") event_type_kernels = 63000006 + event_type_nccl_kernels = 63000007 event_type_memcopy_size = 63000002 event_type_api = 63000000 event_type_nvtx = 9003 @@ -189,15 +197,18 @@ def main(): if t_nvtx: nvtx_df = [] if MULTIREPORT: - for i, REPORT_FILE_I in enumerate(REPORTS_LIST): - nvtx_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], "nvtx_pushpop_trace"))) - nvtx_df[i]["domain"] = nvtx_df[i]["Name"].str.split(":").str[0] - nvtx_df[i].rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) - + for REPORT_FILE_I in REPORTS_LIST: + kpi = NVTXPushPopSimpleSemantic(REPORT_FILE_I) + kpi.Setup() + kpi.load_data() + nvtx_df.append(kpi.get_df()) + del kpi else: - nvtx_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, "nvtx_pushpop_trace")) - nvtx_df["domain"] = nvtx_df["Name"].str.split(":").str[0] - nvtx_df.rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) + kp = NVTXPushPopSimpleSemantic(REPORT_FILE) + kp.Setup() + kp.load_data() + nvtx_df = kp.get_df() + del kp else: nvtx_df = pd.DataFrame() @@ -393,8 +404,8 @@ def main(): if t_nvtx: for i, df in enumerate(nvtx_df): - df['Start (ns)'] += deltas[i] - df['End (ns)'] += deltas[i] + df['Start:ts_ns'] += deltas[i] + df['End:ts_ns'] += deltas[i] nvtx_df = pd.concat(nvtx_df, ignore_index=True) if t_nvtx_startend: @@ -597,7 +608,6 @@ def main(): row_file.write("\n") - # MARK: EVENT NAMES # Second step is collect all different event values for CUDA API calls, kernel names, and NVTX ranges. Each of these define a different event type, and will need unique identifiers to be used as a event values. Finally these needs to be dumped to the PCF file. @@ -627,19 +637,52 @@ def main(): # Remove brackets from names kernel_names["Name"] = kernel_names["Name"].apply(lambda x: x.replace("[", "").replace("]", "")) + # Extract NCCL kernels + nccl_df = kernels_df.loc[kernels_df["Name"].str.contains("nccl")] + mask = ~kernels_df.index.isin(nccl_df.index) + kernels_df = kernels_df.loc[mask] + + nccl_kernel_names = kernel_names.loc[kernel_names["Name"].str.contains("nccl")] + mask = ~kernel_names.index.isin(nccl_kernel_names.index) + kernel_names = kernel_names.loc[mask] + if t_nvtx: nvtx_df_subset = nvtx_df.reset_index() - lower_level = max(nvtx_df["Lvl"]) + # lower_level = max(nvtx_df["Lvl"]) - if nvtx_select_frames: - #subset of df - nvtx_df_subset = nvtx_df[(nvtx_df["Lvl"] >= nvtx_stack_top) & (nvtx_df["Lvl"] <= nvtx_stack_bottom)] + # if nvtx_select_frames: + # #subset of df + # nvtx_df_subset = nvtx_df[(nvtx_df["Lvl"] >= nvtx_stack_top) & (nvtx_df["Lvl"] <= nvtx_stack_bottom)] # split NCCL events - nvtx_nccl_df = nvtx_df_subset[nvtx_df_subset["domain"] == "NCCL"].copy() + nvtx_nccl_df = nvtx_df_subset[nvtx_df_subset["domain"] == "NCCL"].copy().reset_index(drop=True) nvtx_df_subset = nvtx_df_subset.drop(nvtx_df_subset[nvtx_df_subset["domain"] == "NCCL" ].index) nvtx_nccl_df["event_type"] = event_type_nvtx_nccl + if t_nccl: + nvtx_nccl_df["jsonText"] = nvtx_nccl_df["jsonText"].apply(json.loads) + json_expanded = pd.json_normalize(nvtx_nccl_df["jsonText"]).reset_index(drop=True).fillna(0) + nvtx_nccl_df = pd.concat([nvtx_nccl_df.drop(columns=['jsonText']), json_expanded], axis=1) + ## Create dictionary of events for generic payload + + # Step 1: Extract column names as event names and assign event type numbers + nccl_payload_event_dict = {col: 9501 + i for i, col in enumerate(json_expanded.columns)} + nccl_payload_event_dict = pd.DataFrame({ + "Name": list(nccl_payload_event_dict.keys()), + "type": list(nccl_payload_event_dict.values()) + }) + + # Step 2: Identify non-numerical columns and create DataFrames for their unique values + non_numeric_nccl_payloads = {} + + for col in json_expanded.select_dtypes(include='object').columns: + unique_values = json_expanded[col].dropna().unique() + non_numeric_nccl_payloads[col] = pd.DataFrame({ + f"event_name": unique_values, + f"event_value": range(1, len(unique_values) + 1) + }) + + # Now recurring domains, starting with nesmik nvtx_df_subset.loc[nvtx_df_subset["domain"] == "neSmiK", "event_type"] = event_type_nvtx_nesmik nvtx_df_subset["event_type"] = (nvtx_df_subset[nvtx_df_subset["domain"] != "neSmiK"].sort_values("domain").groupby(["domain"]).ngroup() * 100) + event_type_nvtx_base @@ -660,6 +703,7 @@ def main(): if not nvtx_nccl_df.empty: nvtx_nccl_df["event_value"] = nvtx_nccl_df.groupby(["Name"]).ngroup() + 1 nccl_names = nvtx_nccl_df[['event_value', 'Name']].drop_duplicates().sort_values("event_value") + nccl_names["Name"] = nccl_names["Name"].apply(lambda x: x.split(':', 1)[1]) if t_nvtx_startend: nvtx_startend_df["event_value"] = nvtx_startend_df.groupby(["tag"]).ngroup() + 1 @@ -816,6 +860,14 @@ GRADIENT_NAMES pcf_file.write("{} {}\n".format(row["event_value"], row["Name"])) pcf_file.write("\n") + pcf_file.write("EVENT_TYPE\n") + pcf_file.write("0 {} NCCL kernel\n".format(event_type_nccl_kernels)) + pcf_file.write("VALUES\n") + pcf_file.write("0 End\n") + for index, row in nccl_kernel_names.iterrows(): + pcf_file.write("{} {}\n".format(row["event_value"], row["Name"])) + pcf_file.write("\n") + pcf_file.write("EVENT_TYPE\n") for i, v in enumerate(event_types_block_grid_values_names): pcf_file.write("0 {} Kernel {}\n".format(event_types_block_grid_values[i], v)) @@ -846,7 +898,19 @@ GRADIENT_NAMES pcf_file.write("0 End\n") for index, row in nccl_names.iterrows(): pcf_file.write("{} {}\n".format(row["event_value"], row["Name"])) - + pcf_file.write("\n") + if t_nccl: + for i, v in nccl_payload_event_dict.iterrows(): + pcf_file.write("EVENT_TYPE\n") + pcf_file.write("0 {} NCCL {}\n".format(v["type"], v["Name"])) + if v["Name"] in non_numeric_nccl_payloads: + pcf_file.write("VALUES\n") + pcf_file.write("0 End\n") + for index, row in non_numeric_nccl_payloads[v["Name"]].iterrows(): + pcf_file.write("{} {}\n".format(row["event_value"], row["event_name"])) + pcf_file.write("\n") + pcf_file.write("\n") + pcf_file.write("\n") if t_nvtx_startend: pcf_file.write("EVENT_TYPE\n") @@ -929,7 +993,7 @@ GRADIENT_NAMES pcf_file.write("{} {}\n".format(row["func_value"], row["func"])) pcf_file.write("\n") - # MARK: MEMORY + # MARK: MEMORY AND COMMUNICATIONS # # Split of kernel execution between compute and memory memops_names = ["[CUDA memcpy Device-to-Device]", "[CUDA memcpy Device-to-Host]", "[CUDA memcpy Host-to-Device]", "[CUDA memset]", "[CUDA memcpy Peer-to-Peer]"] @@ -942,6 +1006,30 @@ GRADIENT_NAMES comm_kernel_df = cuda_api_df.merge(kernels_df, how="inner", left_on=["CorrID", "task"], right_on=["CorrID", "task"], suffixes=("_call", "_k")) comm_memory_df = cuda_api_df.merge(memops_df, how="inner", left_on=["CorrID", "task"], right_on=["CorrID", "task"], suffixes=("_call", "_mem")) + # # NCCL Communications + if t_nccl: + partial_searches = [] + for ti in tasks_set.index: + # Define function to find the CorrID for events in B contained within events in A + def find_corr_id(row, df_B_task): + # Filter DataFrame B for events that satisfy the conditions + filtered_B = df_B_task[ + (df_B_task["Start (ns)"] >= row["Start:ts_ns"]) & + (df_B_task["Start (ns)"] < (row["End:ts_ns"])) & + (df_B_task["thread"] == row["thread"]) + ] + # Return the first CorrID if matches are found, otherwise None + return filtered_B["CorrID"].iloc[0] if not filtered_B.empty else None + + # Apply the function to each row of DataFrame A + df_A_task = nvtx_nccl_df[nvtx_nccl_df["task"] == ti] + df_B_task = cuda_api_df[(cuda_api_df["Name"].str.contains("LaunchKernel")) & (cuda_api_df["task"] == ti)] + + df_A_task["CorrID"] = df_A_task.apply(find_corr_id, axis=1, df_B_task=df_B_task) + partial_searches.append(df_A_task) + + nvtx_nccl_df = pd.concat(partial_searches, ignore_index=True) + nccl_df = nccl_df.merge(nvtx_nccl_df[nccl_payload_event_dict["Name"].tolist() + ["CorrID"]], how="left", on="CorrID") # MARK: TIMELINE RECONS # # Timeline reconstruction @@ -991,7 +1079,7 @@ GRADIENT_NAMES compute_max_with = [] if t_apicalls: compute_max_with.append((cuda_api_df["Start (ns)"] + cuda_api_df["Duration (ns)"]).max()) - if t_nvtx: compute_max_with.append(nvtx_df["End (ns)"].max()) + if t_nvtx: compute_max_with.append(nvtx_df["End:ts_ns"].max()) if t_nvtx_startend: compute_max_with.append(nvtx_startend_df["end"].max()) if t_mpi: compute_max_with.append(mpi_df["End:ts_ns"].max()) @@ -1009,6 +1097,11 @@ GRADIENT_NAMES ewr(prv_file, kernels_df, "Kernels", lambda r: (create_combined_events_record(r.iloc[0], r.iloc[1], int(r["thread"]), int(r["task"]), types, [r["event_value"]] + [int(r['GrdX']), int(r['GrdY']), int(r['GrdZ']), int(r['BlkX']), int(r['BlkY']), int(r['BlkZ']), int(r['Reg/Trd']), r["CorrID"]]))) + if t_nccl: + types = [event_type_nccl_kernels] + event_types_block_grid_values + [event_type_registers_thread, event_type_correlation] + nccl_payload_event_dict["type"].tolist() + ewr(prv_file, nccl_df, "NCCL Kernels", lambda r: + (create_combined_events_record(r.iloc[0], r.iloc[1], int(r["thread"]), int(r["task"]), types, [r["event_value"]] + [int(r['GrdX']), int(r['GrdY']), int(r['GrdZ']), int(r['BlkX']), int(r['BlkY']), int(r['BlkZ']), int(r['Reg/Trd']), r["CorrID"]] + [r[rowname] for rowname in nccl_payload_event_dict["Name"].tolist()]))) + types_mem = [event_type_kernels, event_type_memcopy_size, event_type_correlation] ewr(prv_file, memops_df, "Memory operations", lambda r: (create_combined_events_record(r.iloc[0], r.iloc[1], int(r["thread"]), int(r["task"]), types_mem, [r["event_value"], r["bytes_b"], r["CorrID"]]))) diff --git a/nsys2prv/scripts/nvtx_pushpop_simple.sql b/nsys2prv/scripts/nvtx_pushpop_simple.sql new file mode 100644 index 0000000..241ad10 --- /dev/null +++ b/nsys2prv/scripts/nvtx_pushpop_simple.sql @@ -0,0 +1,52 @@ +WITH + domains AS ( + SELECT + min(start), + domainId AS id, + globalTid AS globalTid, + text AS name + FROM + NVTX_EVENTS + WHERE + eventType == 75 + GROUP BY 2, 3 + ), + maxts AS( + SELECT max(max(start), max(end)) AS m + FROM NVTX_EVENTS + ), + nvtx AS ( + SELECT + ne.start AS "Start:ts_ns", + ne.end AS "End:ts_ns", + coalesce(ne.end, (SELECT m FROM maxts)) - ne.start AS "Duration:dur_ns", + CASE + WHEN d.name NOT NULL AND sid.value IS NOT NULL + THEN d.name || ':' || sid.value + WHEN d.name NOT NULL AND sid.value IS NULL + THEN d.name || ':' || ne.text + WHEN d.name IS NULL AND sid.value NOT NULL + THEN ':' || sid.value + ELSE ne.text + END AS "Name", + ne.jsonText, + (ne.globalTid / 0x1000000 % 0x1000000) as PID, + (ne.globalTid % 0x1000000) as TID + FROM + NVTX_EVENTS AS ne + LEFT OUTER JOIN + domains AS d + ON ne.domainId == d.id + AND (ne.globalTid & 0x0000FFFFFF000000) == (d.globalTid & 0x0000FFFFFF000000) + LEFT OUTER JOIN + StringIds AS sid + ON ne.textId == sid.id + WHERE + ne.eventType == 59 + OR + ne.eventType == 70 + ) +SELECT + * + FROM + nvtx \ No newline at end of file diff --git a/nsys2prv/scripts/nvtx_pushpop_trace.sql b/nsys2prv/scripts/nvtx_pushpop_trace.sql new file mode 100644 index 0000000..476d119 --- /dev/null +++ b/nsys2prv/scripts/nvtx_pushpop_trace.sql @@ -0,0 +1,52 @@ +WITH RECURSIVE + tree AS ( + SELECT + p.rangeId AS rangeId, + ':' || CAST(p.rangeId AS TEXT) AS rangeIdHier, + p.parentId AS parentId, + 0 AS level, + '' AS tab + FROM + temp.NVTX_PARENT AS p + WHERE p.parentId IS NULL + + UNION ALL + SELECT + p.rangeId AS rangeId, + tree.rangeIdHier || ':' || CAST(p.rangeId AS TEXT) AS rangeIdHier, + p.parentId AS parentId, + tree.level + 1 AS level, + tree.tab || '--' AS tab + FROM + tree + JOIN + temp.NVTX_PARENT AS p + ON p.parentId == tree.rangeId + + ORDER BY level DESC + ) +SELECT + ne.start AS "Start:ts_ns", + ne.start + p.duration AS "End:ts_ns", + p.duration AS "Duration:dur_ns", + ifnull(p.childDuration, 0) AS "DurChild:dur_ns", + p.duration - ifnull(p.childDuration, 0) AS "DurNonChild:dur_ns", + p.fullname AS "Name", + (ne.globalTid >> 24) & 0x00FFFFFF AS "PID", + ne.globalTid & 0x00FFFFFF AS "TID", + t.level AS "Lvl", + ifnull(p.childNumb, 0) AS "NumChild", + ne.rowid AS "RangeId", + t.parentId AS "ParentId", + t.rangeIdHier AS "RangeStack", + t.tab || p.fullname AS "NameTree", + ne.jsonText +FROM + NVTX_EVENTS AS ne +JOIN + temp.NVTX_PARENT AS p + ON p.rangeId == ne.rowid +JOIN + tree AS t + ON t.rangeId == ne.rowid +ORDER BY 1, 3 \ No newline at end of file diff --git a/nsys2prv/scripts/nvtx_pushpop_trace_prepare.sql b/nsys2prv/scripts/nvtx_pushpop_trace_prepare.sql new file mode 100644 index 0000000..502087c --- /dev/null +++ b/nsys2prv/scripts/nvtx_pushpop_trace_prepare.sql @@ -0,0 +1,130 @@ +DROP TABLE IF EXISTS temp.NVTX_EVENTS_MINMAXTS; + +CREATE TEMP TABLE NVTX_EVENTS_MINMAXTS +AS SELECT + min(min(start), min(end)) AS min, + max(max(start), max(end)) AS max +FROM NVTX_EVENTS +WHERE + eventType == 59 + OR eventType == 70; + + +DROP TABLE IF EXISTS temp.NVTX_EVENTS_RIDX; + + +CREATE VIRTUAL TABLE temp.NVTX_EVENTS_RIDX +USING rtree +( + rangeId, + startTS, + endTS, + +startNS INTEGER, + +endNS INTEGER, + +tid INTEGER +); + +INSERT INTO temp.NVTX_EVENTS_RIDX + SELECT + e.rowid AS rangeId, + rtree_scale(e.start, + (SELECT min FROM temp.NVTX_EVENTS_MINMAXTS), + (SELECT max FROM temp.NVTX_EVENTS_MINMAXTS)) AS startTS, + rtree_scale(ifnull(e.end, (SELECT max FROM temp.NVTX_EVENTS_MINMAXTS)), + (SELECT min FROM temp.NVTX_EVENTS_MINMAXTS), + (SELECT max FROM temp.NVTX_EVENTS_MINMAXTS)) AS endTS, + e.start AS startNS, + ifnull(e.end, (SELECT max FROM temp.NVTX_EVENTS_MINMAXTS)) AS endNS, + e.globalTid AS tid + FROM + NVTX_EVENTS AS e + WHERE + e.eventType == 59 + OR e.eventType == 70; + +DROP TABLE IF EXISTS temp.NVTX_PARENT; + + +CREATE TEMP TABLE NVTX_PARENT ( + rangeId INTEGER PRIMARY KEY NOT NULL, + parentId INTEGER, + duration INTEGER, + childDuration INTEGER, + childNumb INTEGER, + fullname TEXT +); + +INSERT INTO temp.NVTX_PARENT + WITH + domains AS ( + SELECT + min(ne.start), + ne.domainId AS id, + ne.globalTid AS globalTid, + coalesce(sid.value, ne.text) AS name + FROM + NVTX_EVENTS AS ne + LEFT JOIN + StringIds AS sid + ON ne.textId == sid.id + WHERE + ne.eventType == 75 + GROUP BY 2, 3 + ) + SELECT + ne.rowid AS rangeId, + NULL AS parentId, + ifnull(ne.end, (SELECT max FROM temp.NVTX_EVENTS_MINMAXTS)) - ne.start AS duration, + 0 AS childDuration, + 0 AS childNumb, + coalesce(d.name, '') || ':' || coalesce(sid.value, ne.text, '') AS fullname + + FROM + NVTX_EVENTS AS ne + LEFT JOIN + domains AS d + ON ne.domainId == d.id + AND (ne.globalTid & 0x0000FFFFFF000000) == (d.globalTid & 0x0000FFFFFF000000) + LEFT JOIN + StringIds AS sid + ON ne.textId == sid.id + WHERE + ne.eventType == 59 + OR ne.eventType == 70; + +UPDATE temp.NVTX_PARENT SET parentId = child.pid +FROM ( + SELECT + cr.rangeId as cid, + pr.rangeId as pid, + min((cr.startNS - pr.startNS) + (pr.endNS - cr.EndNS)) as tightness + FROM + temp.NVTX_EVENTS_RIDX AS cr + JOIN + temp.NVTX_EVENTS_RIDX AS pr + ON + pr.rangeId != cr.rangeId + AND pr.startTS <= cr.startTS + AND pr.endTS >= cr.endTS + AND pr.startNS <= cr.startNS + AND pr.endNS >= cr.endNS + AND pr.tid == cr.tid + GROUP BY cid +) AS child +WHERE temp.NVTX_PARENT.rangeId = child.cid; + +UPDATE temp.NVTX_PARENT + SET (childDuration, childNumb) = (totals.cDur, totals.cNum) +FROM ( + SELECT + parentId AS pId, + total(duration) AS cDur, + count(*) AS cNum + FROM + temp.NVTX_PARENT + GROUP BY 1 +) AS totals +WHERE temp.NVTX_PARENT.rangeId == totals.pId; + +CREATE INDEX IF NOT EXISTS temp.NVTX_PARENT__PARENTID + ON NVTX_PARENT (parentId); \ No newline at end of file diff --git a/nsys2prv/scripts/nvtx_startend_trace.sql b/nsys2prv/scripts/nvtx_startend_trace.sql index c52b66a..8d43232 100644 --- a/nsys2prv/scripts/nvtx_startend_trace.sql +++ b/nsys2prv/scripts/nvtx_startend_trace.sql @@ -26,7 +26,7 @@ WITH WHEN d.name NOT NULL AND sid.value IS NULL THEN d.name || ':' || ne.text WHEN d.name IS NULL AND sid.value NOT NULL - THEN sid.value + THEN ':' || sid.value ELSE ne.text END AS tag, (ne.globalTid / 0x1000000 % 0x1000000) as Pid, diff --git a/nsys2prv/semantics/__init__.py b/nsys2prv/semantics/__init__.py index 1976280..5c7c578 100644 --- a/nsys2prv/semantics/__init__.py +++ b/nsys2prv/semantics/__init__.py @@ -2,4 +2,6 @@ from .kernels_semantic import KernelsSemantic from .mpi_semantic import * from .nvtx_startend_semantic import NVTXStartEndSemantic from .gpu_metrics_semantic import GPUMetricsSemantic -from .openacc_semantic import * \ No newline at end of file +from .openacc_semantic import * +from .nvtx_pushpop_semantic import NVTXPushPopSemantic +from .nvtx_pushpop_simple_semantic import NVTXPushPopSimpleSemantic \ No newline at end of file diff --git a/nsys2prv/semantics/nsys_event.py b/nsys2prv/semantics/nsys_event.py index 9813a14..c709eaf 100644 --- a/nsys2prv/semantics/nsys_event.py +++ b/nsys2prv/semantics/nsys_event.py @@ -23,6 +23,7 @@ class NsysEvent: self._dbfile = f"{os.path.splitext(report)[0]}.sqlite" self._df = pd.DataFrame() self._empty = False + self.prepare_statements = [] if not os.path.exists(self._dbfile): raise self.MissingDatabaseFile(self._dbfile) @@ -49,6 +50,10 @@ class NsysEvent: def load_data(self): if not self._empty: try: + if len(self.prepare_statements) > 0: + cursor = self._dbcon.raw_connection().cursor() + for statement in self.prepare_statements: + cursor.execute(statement) self._df = pd.read_sql_query(self.query, self._dbcon) except pd.errors.DatabaseError: raise self.InvalidSQL(self.query) diff --git a/nsys2prv/semantics/nvtx_pushpop_semantic.py b/nsys2prv/semantics/nvtx_pushpop_semantic.py new file mode 100644 index 0000000..5b0060c --- /dev/null +++ b/nsys2prv/semantics/nvtx_pushpop_semantic.py @@ -0,0 +1,20 @@ +from .nsys_event import NsysEvent +import os.path +from sqlalchemy import text + +class NVTXPushPopSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + with open(os.path.join(os.path.dirname(__file__), '../scripts/nvtx_pushpop_trace_prepare.sql'), 'r') as query: + for statement in query.read().split(';'): + if len(statement.strip()) > 0: + self.prepare_statements.append(statement) + with open(os.path.join(os.path.dirname(__file__), '../scripts/nvtx_pushpop_trace.sql'), 'r') as query: + self.query = text(query.read()) + + def _preprocess(self): + self._df["domain"] = self._df["Name"].str.split(":").str[0] + self._df.rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) + return super()._preprocess() \ No newline at end of file diff --git a/nsys2prv/semantics/nvtx_pushpop_simple_semantic.py b/nsys2prv/semantics/nvtx_pushpop_simple_semantic.py new file mode 100644 index 0000000..3e2e590 --- /dev/null +++ b/nsys2prv/semantics/nvtx_pushpop_simple_semantic.py @@ -0,0 +1,16 @@ +from .nsys_event import NsysEvent +import os.path +from sqlalchemy import text + +class NVTXPushPopSimpleSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + with open(os.path.join(os.path.dirname(__file__), '../scripts/nvtx_pushpop_simple.sql'), 'r') as query: + self.query = text(query.read()) + + def _preprocess(self): + self._df["domain"] = self._df["Name"].str.split(":").str[0] + self._df.rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) + return super()._preprocess() \ No newline at end of file diff --git a/nsys2prv/semantics/nvtx_startend_semantic.py b/nsys2prv/semantics/nvtx_startend_semantic.py index 88241b7..f339215 100644 --- a/nsys2prv/semantics/nvtx_startend_semantic.py +++ b/nsys2prv/semantics/nvtx_startend_semantic.py @@ -8,4 +8,8 @@ class NVTXStartEndSemantic(NsysEvent): def Setup(self): with open(os.path.join(os.path.dirname(__file__), '../scripts/nvtx_startend_trace.sql'), 'r') as query: - self.query = text(query.read()) \ No newline at end of file + self.query = text(query.read()) + + def _preprocess(self): + self._df["domain"] = self._df["Name"].str.split(":").str[0] + return super()._preprocess() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index b85d43f..e838f9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "nsys2prv" -version = "0.4.0-dev20241107b" +version = "0.4.0-dev20241118" description = "Translate a NVIDIA Nsight System trace to a Paraver trace" authors = ["Marc ClascĂ  "] readme = "README.md" -- GitLab From fedd11dedbef445889568ce171ea923059bfaa45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Mon, 18 Nov 2024 15:49:11 +0100 Subject: [PATCH 2/3] Solves bug when identifying NVTX domains and separating them in events --- cfgs/cuda_activities.cfg | 18 +++++----- cfgs/nccl-kernels-tasklevel.cfg | 45 +++++++++--------------- nsys2prv/parse_nsys_stats.py | 6 ++-- nsys2prv/scripts/nvtx_pushpop_simple.sql | 2 +- 4 files changed, 29 insertions(+), 42 deletions(-) diff --git a/cfgs/cuda_activities.cfg b/cfgs/cuda_activities.cfg index dd02a02..43852e3 100644 --- a/cfgs/cuda_activities.cfg +++ b/cfgs/cuda_activities.cfg @@ -11,11 +11,11 @@ ConfigFile.EndDescription window_name CUDA Activity window_type single window_id 1 -window_position_x 714 -window_position_y 425 +window_position_x 777 +window_position_y 463 window_width 945 window_height 244 -window_comm_lines_enabled true +window_comm_lines_enabled false window_flags_enabled false window_noncolor_mode true window_custom_color_enabled false @@ -26,14 +26,14 @@ window_comm_fromto true window_comm_tagsize true window_comm_typeval true window_units Nanoseconds -window_maximum_y 159.000000000000 +window_maximum_y 66.000000000000 window_minimum_y 1.000000000000 window_compute_y_max false window_level thread -window_scale_relative 0.0000000 -window_end_time_relative 1.0000000 +window_scale_relative 1.000000000000 +window_end_time_relative 1.000000000000 window_object appl { 1, { All } } -window_begin_time_relative 0.00000 +window_begin_time_relative 0.000000000000 window_open true window_drawmode draw_maximum window_drawmode_rows draw_last @@ -41,6 +41,6 @@ window_pixel_size 1 window_labels_to_draw 1 window_selected_functions { 14, { {cpu, Active Thd}, {appl, Adding}, {task, Adding}, {thread, Last Evt Val}, {node, Adding}, {system, Adding}, {workload, Adding}, {from_obj, All}, {to_obj, All}, {tag_msg, All}, {size_msg, All}, {bw_msg, All}, {evt_type, =}, {evt_value, All} } } window_compose_functions { 9, { {compose_cpu, As Is}, {compose_appl, As Is}, {compose_task, As Is}, {compose_thread, As Is}, {compose_node, As Is}, {compose_system, As Is}, {compose_workload, As Is}, {topcompose1, As Is}, {topcompose2, As Is} } } -window_filter_module evt_type 2 63000000 63000006 -window_filter_module evt_type_label 2 "CUDA library call" "CUDA kernel" +window_filter_module evt_type 3 63000000 63000006 63000007 +window_filter_module evt_type_label 3 "CUDA library call" "CUDA kernel" "NCCL kernel" diff --git a/cfgs/nccl-kernels-tasklevel.cfg b/cfgs/nccl-kernels-tasklevel.cfg index e344b04..faf80e6 100644 --- a/cfgs/nccl-kernels-tasklevel.cfg +++ b/cfgs/nccl-kernels-tasklevel.cfg @@ -2,8 +2,7 @@ ConfigFile.Version: 3.4 ConfigFile.NumWindows: 3 ConfigFile.BeginDescription -NCCL Kernels (AllReduce bf16, AllReduce fp16, AllGather) aggregated at task level. -Computed from Stream 20 with adding of two task level views of thread 8 and thread 7 + ConfigFile.EndDescription ################################################################################ @@ -34,23 +33,19 @@ window_level task window_scale_relative 0.320786501967 window_end_time_relative 0.320786501967 window_object appl { 1, { All } } -window_object thread { 0, 0, 10, { None, None, None, None, None, None, None, All, None, None } } -window_object thread { 0, 1, 9, { None, None, None, None, None, None, All, None, None } } -window_object thread { 0, 2, 9, { None, None, None, None, None, None, All, None, None } } -window_object thread { 0, 3, 9, { None, None, None, None, None, None, All, None, None } } window_begin_time_relative 0.319172112748 window_open false window_drawmode draw_maximum window_drawmode_rows draw_last window_pixel_size 1 window_labels_to_draw 1 -window_selected_functions { 14, { {cpu, Active Thd}, {appl, Adding}, {task, Thread i}, {thread, Last Evt Val}, {node, Adding}, {system, Adding}, {workload, Adding}, {from_obj, All}, {to_obj, All}, {tag_msg, All}, {size_msg, All}, {bw_msg, All}, {evt_type, =}, {evt_value, =} } } +window_selected_functions { 14, { {cpu, Active Thd}, {appl, Adding}, {task, Thread i}, {thread, Last Evt Val}, {node, Adding}, {system, Adding}, {workload, Adding}, {from_obj, All}, {to_obj, All}, {tag_msg, All}, {size_msg, All}, {bw_msg, All}, {evt_type, =}, {evt_value, All} } } window_compose_functions { 9, { {compose_cpu, As Is}, {compose_appl, As Is}, {compose_task, As Is}, {compose_thread, As Is}, {compose_node, As Is}, {compose_system, As Is}, {compose_workload, As Is}, {topcompose1, As Is}, {topcompose2, As Is} } } -window_semantic_module task Thread i { 1, { 1 7.000000000000 } } -window_filter_module evt_type 2 63000000 63000006 -window_filter_module evt_type_label 2 "CUDA library call" "CUDA kernel" +window_semantic_module task Thread i { 1, { 1 5.000000000000 } } +window_filter_module evt_type 1 63000007 +window_filter_module evt_type_label 1 "NCCL kernel" window_filter_module evt_value 4 0.000000000000 27.000000000000 28.000000000000 29.000000000000 -window_filter_module evt_value_label 4 "End" "ncclDevKernel_AllGather_RING_LL(ncclDevComm *, unsigned long, ncclWork *)" "ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevComm *, unsigned long, ncclWork *)" "ncclDevKernel_AllReduce_Sum_f32_RING_LL(ncclDevComm *, unsigned long, ncclWork *)" +window_filter_module evt_value_label 4 "End" "Unknown" "Unknown" "Unknown" window_synchronize 1 ################################################################################ @@ -81,23 +76,19 @@ window_level task window_scale_relative 0.320786501967 window_end_time_relative 0.320786501967 window_object appl { 1, { All } } -window_object thread { 0, 0, 10, { None, None, None, None, None, None, None, All, None, None } } -window_object thread { 0, 1, 9, { None, None, None, None, None, None, All, None, None } } -window_object thread { 0, 2, 9, { None, None, None, None, None, None, All, None, None } } -window_object thread { 0, 3, 9, { None, None, None, None, None, None, All, None, None } } window_begin_time_relative 0.319172112748 window_open false window_drawmode draw_maximum window_drawmode_rows draw_last window_pixel_size 1 window_labels_to_draw 1 -window_selected_functions { 14, { {cpu, Active Thd}, {appl, Adding}, {task, Thread i}, {thread, Last Evt Val}, {node, Adding}, {system, Adding}, {workload, Adding}, {from_obj, All}, {to_obj, All}, {tag_msg, All}, {size_msg, All}, {bw_msg, All}, {evt_type, =}, {evt_value, =} } } +window_selected_functions { 14, { {cpu, Active Thd}, {appl, Adding}, {task, Thread i}, {thread, Last Evt Val}, {node, Adding}, {system, Adding}, {workload, Adding}, {from_obj, All}, {to_obj, All}, {tag_msg, All}, {size_msg, All}, {bw_msg, All}, {evt_type, =}, {evt_value, All} } } window_compose_functions { 9, { {compose_cpu, As Is}, {compose_appl, As Is}, {compose_task, As Is}, {compose_thread, As Is}, {compose_node, As Is}, {compose_system, As Is}, {compose_workload, As Is}, {topcompose1, As Is}, {topcompose2, As Is} } } -window_semantic_module task Thread i { 1, { 1 8.000000000000 } } -window_filter_module evt_type 2 63000000 63000006 -window_filter_module evt_type_label 2 "CUDA library call" "CUDA kernel" +window_semantic_module task Thread i { 1, { 1 7.000000000000 } } +window_filter_module evt_type 1 63000007 +window_filter_module evt_type_label 1 "NCCL kernel" window_filter_module evt_value 4 0.000000000000 27.000000000000 28.000000000000 29.000000000000 -window_filter_module evt_value_label 4 "End" "ncclDevKernel_AllGather_RING_LL(ncclDevComm *, unsigned long, ncclWork *)" "ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevComm *, unsigned long, ncclWork *)" "ncclDevKernel_AllReduce_Sum_f32_RING_LL(ncclDevComm *, unsigned long, ncclWork *)" +window_filter_module evt_value_label 4 "End" "Unknown" "Unknown" "Unknown" window_synchronize 1 ################################################################################ @@ -110,8 +101,8 @@ window_factors 1.000000000000 1.000000000000 window_shifts 0 0 window_operation add window_identifiers 1 2 -window_position_x 464 -window_position_y 595 +window_position_x 537 +window_position_y 563 window_width 945 window_height 244 window_comm_lines_enabled false @@ -120,19 +111,15 @@ window_noncolor_mode true window_custom_color_enabled false window_semantic_scale_min_at_zero false window_units Nanoseconds -window_maximum_y 29.000000000000 -window_minimum_y 27.000000000000 +window_maximum_y 112.000000000000 +window_minimum_y 55.000000000000 window_compute_y_max false window_level task window_scale_relative 1.000000000000 window_end_time_relative 1.000000000000 window_object appl { 1, { All } } -window_object thread { 0, 0, 10, { None, None, None, None, None, None, None, All, None, None } } -window_object thread { 0, 1, 9, { None, None, None, None, None, None, All, None, None } } -window_object thread { 0, 2, 9, { None, None, None, None, None, None, All, None, None } } -window_object thread { 0, 3, 9, { None, None, None, None, None, None, All, None, None } } window_begin_time_relative 0.000000000000 -window_open false +window_open true window_drawmode draw_maximum window_drawmode_rows draw_last window_pixel_size 1 diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 6f34365..b17cfc0 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -1114,15 +1114,15 @@ GRADIENT_NAMES if t_nvtx: ewr(prv_file, nvtx_df_subset, "NVTX pushpop ranges", lambda r: - (create_event_record(r.iloc[0], r.iloc[2], int(r["thread"]), int(r["task"]), r["event_type"], r["event_value"]))) + (create_event_record(r["Start:ts_ns"], r["Duration:dur_ns"], int(r["thread"]), int(r["task"]), r["event_type"], r["event_value"]))) # NVTX NCCL regions, still missing nccl info if not nvtx_nccl_df.empty: ewr(prv_file, nvtx_nccl_df, "NVTX NCCL regions", lambda r: - (create_event_record(r.iloc[0], r.iloc[2], int(r["thread"]), int(r["task"]), r["event_type"], r["event_value"]))) + (create_event_record(r["Start:ts_ns"], r["Duration:dur_ns"], int(r["thread"]), int(r["task"]), r["event_type"], r["event_value"]))) if t_nvtx_startend: ewr(prv_file, nvtx_startend_df, "NVTX startend ranges", lambda r: - (create_event_record(r.iloc[0], r.iloc[2], int(r["thread"]), int(r["task"]), event_type_nvtx_startend, r["event_value"]))) + (create_event_record(r["start"], r["duration"], int(r["thread"]), int(r["task"]), event_type_nvtx_startend, r["event_value"]))) if t_mpi: diff --git a/nsys2prv/scripts/nvtx_pushpop_simple.sql b/nsys2prv/scripts/nvtx_pushpop_simple.sql index 241ad10..f85a4a9 100644 --- a/nsys2prv/scripts/nvtx_pushpop_simple.sql +++ b/nsys2prv/scripts/nvtx_pushpop_simple.sql @@ -27,7 +27,7 @@ WITH THEN d.name || ':' || ne.text WHEN d.name IS NULL AND sid.value NOT NULL THEN ':' || sid.value - ELSE ne.text + ELSE ':' || ne.text END AS "Name", ne.jsonText, (ne.globalTid / 0x1000000 % 0x1000000) as PID, -- GitLab From 379e48bed5b35c73055481e8493cee83182754cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Mon, 18 Nov 2024 16:50:21 +0100 Subject: [PATCH 3/3] Solves issue with reduction operation type --- cfgs/nccl_reduction_op.cfg | 46 ++++++++++++++++++++++++++++++++++++ nsys2prv/parse_nsys_stats.py | 12 ++++++---- 2 files changed, 54 insertions(+), 4 deletions(-) create mode 100644 cfgs/nccl_reduction_op.cfg diff --git a/cfgs/nccl_reduction_op.cfg b/cfgs/nccl_reduction_op.cfg new file mode 100644 index 0000000..ca56a52 --- /dev/null +++ b/cfgs/nccl_reduction_op.cfg @@ -0,0 +1,46 @@ +#ParaverCFG +ConfigFile.Version: 3.4 +ConfigFile.NumWindows: 1 +ConfigFile.BeginDescription + +ConfigFile.EndDescription + +################################################################################ +< NEW DISPLAYING WINDOW NCCL reduction op > +################################################################################ +window_name NCCL reduction op +window_type single +window_id 1 +window_position_x 762 +window_position_y 340 +window_width 600 +window_height 115 +window_comm_lines_enabled false +window_flags_enabled false +window_noncolor_mode true +window_custom_color_enabled false +window_semantic_scale_min_at_zero false +window_logical_filtered true +window_physical_filtered false +window_comm_fromto true +window_comm_tagsize true +window_comm_typeval true +window_units Nanoseconds +window_maximum_y 0.000000000000 +window_minimum_y 0.000000000000 +window_compute_y_max false +window_level thread +window_scale_relative 1.000000000000 +window_end_time_relative 1.000000000000 +window_object appl { 1, { All } } +window_begin_time_relative 0.000000000000 +window_open true +window_drawmode draw_maximum +window_drawmode_rows draw_last +window_pixel_size 1 +window_labels_to_draw 1 +window_selected_functions { 14, { {cpu, Active Thd}, {appl, Adding}, {task, Adding}, {thread, Last Evt Val}, {node, Adding}, {system, Adding}, {workload, Adding}, {from_obj, All}, {to_obj, All}, {tag_msg, All}, {size_msg, All}, {bw_msg, All}, {evt_type, =}, {evt_value, All} } } +window_compose_functions { 9, { {compose_cpu, As Is}, {compose_appl, As Is}, {compose_task, As Is}, {compose_thread, As Is}, {compose_node, As Is}, {compose_system, As Is}, {compose_workload, As Is}, {topcompose1, As Is}, {topcompose2, As Is} } } +window_filter_module evt_type 1 9502 +window_filter_module evt_type_label 1 "NCCL Reduction operation" + diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index b17cfc0..636284d 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -661,7 +661,7 @@ def main(): if t_nccl: nvtx_nccl_df["jsonText"] = nvtx_nccl_df["jsonText"].apply(json.loads) - json_expanded = pd.json_normalize(nvtx_nccl_df["jsonText"]).reset_index(drop=True).fillna(0) + json_expanded = pd.json_normalize(nvtx_nccl_df["jsonText"]).reset_index(drop=True) nvtx_nccl_df = pd.concat([nvtx_nccl_df.drop(columns=['jsonText']), json_expanded], axis=1) ## Create dictionary of events for generic payload @@ -676,11 +676,15 @@ def main(): non_numeric_nccl_payloads = {} for col in json_expanded.select_dtypes(include='object').columns: - unique_values = json_expanded[col].dropna().unique() + nvtx_nccl_df[f"{col}_value"] = nvtx_nccl_df.groupby(col, dropna=True).ngroup() + 1 + nvtx_nccl_df[f"{col}_value"] = nvtx_nccl_df[f"{col}_value"].fillna(0).apply(int) + unique_values = nvtx_nccl_df[[col, f"{col}_value"]].dropna().drop_duplicates() + #unique_values = json_expanded[col].dropna().unique() non_numeric_nccl_payloads[col] = pd.DataFrame({ - f"event_name": unique_values, - f"event_value": range(1, len(unique_values) + 1) + f"event_name": unique_values[col].tolist(), + f"event_value": unique_values[f"{col}_value"].tolist() }) + nvtx_nccl_df[col] = nvtx_nccl_df[f"{col}_value"] # Now recurring domains, starting with nesmik -- GitLab