diff --git a/cfgs/nvtx_startend.cfg b/cfgs/nvtx_startend.cfg new file mode 100644 index 0000000000000000000000000000000000000000..b88ebffa006fc42f4313714dec81fe401a36d29b --- /dev/null +++ b/cfgs/nvtx_startend.cfg @@ -0,0 +1,46 @@ +#ParaverCFG +ConfigFile.Version: 3.4 +ConfigFile.NumWindows: 1 +ConfigFile.BeginDescription + +ConfigFile.EndDescription + +################################################################################ +< NEW DISPLAYING WINDOW NVTX StartEnd Ranges > +################################################################################ +window_name NVTX StartEnd Ranges +window_type single +window_id 1 +window_position_x 671 +window_position_y 275 +window_width 922 +window_height 165 +window_comm_lines_enabled false +window_flags_enabled false +window_noncolor_mode true +window_custom_color_enabled false +window_semantic_scale_min_at_zero false +window_logical_filtered true +window_physical_filtered false +window_comm_fromto true +window_comm_tagsize true +window_comm_typeval true +window_units Nanoseconds +window_maximum_y 52.000000000000 +window_minimum_y 1.000000000000 +window_compute_y_max false +window_level thread +window_scale_relative 1.000000000000 +window_end_time_relative 1.000000000000 +window_object appl { 1, { All } } +window_begin_time_relative 0.000000000000 +window_open false +window_drawmode draw_maximum +window_drawmode_rows draw_last +window_pixel_size 1 +window_labels_to_draw 1 +window_selected_functions { 14, { {cpu, Active Thd}, {appl, Adding}, {task, Adding}, {thread, Last Evt Val}, {node, Adding}, {system, Adding}, {workload, Adding}, {from_obj, All}, {to_obj, All}, {tag_msg, All}, {size_msg, All}, {bw_msg, All}, {evt_type, =}, {evt_value, All} } } +window_compose_functions { 9, { {compose_cpu, As Is}, {compose_appl, As Is}, {compose_task, As Is}, {compose_thread, Stacked Val}, {compose_node, As Is}, {compose_system, As Is}, {compose_workload, As Is}, {topcompose1, As Is}, {topcompose2, As Is} } } 
+window_filter_module evt_type 1 9004 +window_filter_module evt_type_label 1 "Unknown" + diff --git a/nvtx_startend_test.ipynb b/nvtx_startend_test.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..89e3b33175871adc3dfb3b9c22d5fe37aba65f86 --- /dev/null +++ b/nvtx_startend_test.ipynb @@ -0,0 +1,393 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import time\n", + "import subprocess\n", + "import os\n", + "import locale\n", + "import sqlite3\n", + "from sqlalchemy import create_engine\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "NSIGHT_HOME=\"/home/mclasca/Apps/nsight-system/2024.1\"\n", + "#NSIGHT_HOME = os.getenv('NSIGHT_HOME')\n", + "PARAVER_HOME = os.getenv('PARAVER_HOME')\n", + "NVTX_RANGE=\"step53\"\n", + "#REPORT_FILE = os.path.abspath(\"/home/mclasca/Documents/BePPP/heka/profiles/mistral-mn5/heka-axolotl-Mistral7B0.1-4s_withmetrics-2432719.nsys-rep\")\n", + "#REPORT_FILE = os.path.abspath(\"/home/mclasca/Documents/BePPP/traces/xshells/nsys/xshells.par.medium-1N_withmetrics.nsys-rep\")\n", + "REPORT_FILE = os.path.abspath(\"/home/mclasca/Documents/BePPP/traces/jesus/drive-download-20240628T123341Z-001/long-short.nsys-rep\")\n", + "REPORT_DIR = os.path.dirname(REPORT_FILE)\n", + "#REPORT_NAME=\"heka-step53+accum1-profile-2023.4-5721957\"\n", + "#REPORT_NAME=\"heka-axolotl-Mistral7B0.1-profile-2110598\"\n", + "\n", + "locale.setlocale(locale.LC_ALL, '')\n", + "\n", + "trace_name = \"test-xshells-metrics\"\n", + "event_type_kernels = 63000006\n", + "event_type_memcopy_size = 63000002\n", + "event_type_api = 63000000\n", + "event_type_nvtx = 9003\n", + "event_type_nvtx_startend = 9004\n", + "event_type_blkgrd_name = 9100\n", + "event_types_block_grid_values = [9101, 9102, 9103, 9104, 9105, 9106]\n", + "event_types_block_grid_values_names = 
['GrdX', 'GrdY', 'GrdZ', 'BlkX', 'BlkY', 'BlkZ']\n", + "event_type_registers_thread = 9107\n", + "event_type_correlation = 9200\n", + "event_type_mpi = 9300\n", + "event_type_metrics_base = 9400\n", + "\n", + "comm_tag_launch = 55001\n", + "comm_tag_memory = 55002\n", + "comm_tag_dependency = 55003\n", + "\n", + "event_type_openacc = 66000000\n", + "event_type_openacc_data = 66000001\n", + "event_type_openacc_launch = 66000002\n", + "\n", + "event_type_name_openacc = 66100000\n", + "event_type_name_openacc_data = 66100001\n", + "event_type_name_openacc_launch = 66100002\n", + "\n", + "event_type_func_openacc = 66200000\n", + "event_type_func_openacc_data = 66200001\n", + "event_type_func_openacc_launch = 66200002\n", + "\n", + "event_type_openacc_data_size = 66300001\n", + "\n", + "nvtx_select_frames = True\n", + "nvtx_stack_top = 1\n", + "nvtx_stack_bottom = 4\n", + "\n", + "t_openacc = True\n", + "\n", + "reports = [\"nvtx_pushpop_trace\", \"cuda_api_trace\", \"cuda_gpu_trace\"]\n", + "\n", + "def build_nsys_stats_name(report_name):\n", + " base_name = os.path.splitext(os.path.basename(REPORT_FILE))[0]\n", + " return os.path.join(REPORT_DIR, base_name+\"_{}.csv\".format(report_name))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "engine = create_engine(f\"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite\")\n", + "with engine.connect() as conn, conn.begin():\n", + " with open(os.path.join(os.path.dirname(__file__), 'scripts/nvtx_startend_trace.sql'), 'r') as query:\n", + " # connection == the connection to your database, in your case prob_db\n", + " nvtx_startend_ranges = pd.read_sql_query(query.read(), conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
startenddurationtagPidTid
015875509953181539186052278408652Prefill Transformer1190611906
1182492894041831705359667764192Decode Transformer1190611906
2183186131451837035754151744396Decode Transformer1190611906
3183718006311841919881747398186Decode Transformer1190611906
4184204759181846922136448745446Decode Transformer1190611906
5184705162061851905451348538307Decode Transformer1190611906
6185203486241856699457646645952Decode Transformer1190611906
7185682576171861613367047876053Decode Transformer1190611906
8186174198611866520933447789473Decode Transformer1190611906
9186665317251871745652650924801Decode Transformer1190611906
10187187330771876640573147672654Decode Transformer1190611906
11187676861921881727626349590071Decode Transformer1190611906
12188185857241886742647048840746Decode Transformer1190611906
13188687096011891789604449186443Decode Transformer1190611906
14189192109951897034523551134240Decode Transformer1190611906
15189716788851902143738549758500Decode Transformer1190611906
\n", + "
" + ], + "text/plain": [ + " start end duration tag Pid Tid\n", + "0 15875509953 18153918605 2278408652 Prefill Transformer 11906 11906\n", + "1 18249289404 18317053596 67764192 Decode Transformer 11906 11906\n", + "2 18318613145 18370357541 51744396 Decode Transformer 11906 11906\n", + "3 18371800631 18419198817 47398186 Decode Transformer 11906 11906\n", + "4 18420475918 18469221364 48745446 Decode Transformer 11906 11906\n", + "5 18470516206 18519054513 48538307 Decode Transformer 11906 11906\n", + "6 18520348624 18566994576 46645952 Decode Transformer 11906 11906\n", + "7 18568257617 18616133670 47876053 Decode Transformer 11906 11906\n", + "8 18617419861 18665209334 47789473 Decode Transformer 11906 11906\n", + "9 18666531725 18717456526 50924801 Decode Transformer 11906 11906\n", + "10 18718733077 18766405731 47672654 Decode Transformer 11906 11906\n", + "11 18767686192 18817276263 49590071 Decode Transformer 11906 11906\n", + "12 18818585724 18867426470 48840746 Decode Transformer 11906 11906\n", + "13 18868709601 18917896044 49186443 Decode Transformer 11906 11906\n", + "14 18919210995 18970345235 51134240 Decode Transformer 11906 11906\n", + "15 18971678885 19021437385 49758500 Decode Transformer 11906 11906" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nvtx_startend_ranges" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tagevent_value
1Decode Transformer1
0Prefill Transformer2
\n", + "
" + ], + "text/plain": [ + " tag event_value\n", + "1 Decode Transformer 1\n", + "0 Prefill Transformer 2" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nvtx_startend_ranges[\"event_value\"] = nvtx_startend_ranges.groupby([\"tag\"]).ngroup() + 1\n", + "nvtx_startend_names = nvtx_startend_ranges[['tag', 'event_value']].drop_duplicates()\n", + "nvtx_startend_names.sort_values(\"event_value\", inplace=True)\n", + "nvtx_startend_names" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/parse-nsys-stats.py b/parse-nsys-stats.py index 651a4476a669022f38be734330b6e3428c108e61..ba94b6cb46a10db46598759b6feb153c149c53a7 100755 --- a/parse-nsys-stats.py +++ b/parse-nsys-stats.py @@ -11,6 +11,7 @@ import subprocess import os import locale import sqlite3 +from sqlalchemy import create_engine locale.setlocale(locale.LC_ALL, '') @@ -19,7 +20,7 @@ parser = argparse.ArgumentParser(description="Convert a NVIDIA Nsight System tra epilog="The environment variables NSIGHT_HOME and PARAVER_HOME are needed") parser.add_argument("-f", "--filter-nvtx", help="Filter by this NVTX range") -parser.add_argument("-t", "--trace", required=True, help="Comma separated names of events to translate: [mpi_event_trace, nvtx_pushpop_trace, cuda_api_trace, cuda_gpu_trace, gpu_metrics]") +parser.add_argument("-t", "--trace", required=True, help="Comma separated names of events to translate: [mpi_event_trace, nvtx_pushpop_trace, nvtx_startend_trace, cuda_api_trace, cuda_gpu_trace, gpu_metrics]") parser.add_argument("--force-sqlite", action="store_true", help="Force 
Nsight System to export SQLite database") @@ -56,6 +57,7 @@ NVTX_RANGE = args.filter_nvtx reports = args.trace.split(",") t_nvtx = False +t_nvtx_startend = False t_kernels = False t_apicalls = False t_mpi = False @@ -68,12 +70,16 @@ if "mpi_event_trace" in reports: t_mpi = True if "gpu_metrics" in reports: t_metrics = True reports.remove("gpu_metrics") +if "nvtx_startend_trace" in reports: + t_nvtx_startend = True + reports.remove("nvtx_startend_trace") #trace_name = "llava_cesga" event_type_kernels = 63000006 event_type_memcopy_size = 63000002 event_type_api = 63000000 event_type_nvtx = 9003 +event_type_nvtx_startend = 9004 event_type_blkgrd_name = 9100 event_types_block_grid_values = [9101, 9102, 9103, 9104, 9105, 9106] event_types_block_grid_values_names = ['GrdX', 'GrdY', 'GrdZ', 'BlkX', 'BlkY', 'BlkZ'] @@ -86,7 +92,7 @@ comm_tag_launch = 55001 comm_tag_memory = 55002 comm_tag_dependency = 55003 -nvtx_select_frames = True +nvtx_select_frames = False nvtx_stack_top = 1 nvtx_stack_bottom = 4 @@ -141,6 +147,15 @@ if t_nvtx: else: nvtx_df = pd.DataFrame() +if t_nvtx_startend: + engine = create_engine(f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") + with engine.connect() as conn, conn.begin(): + with open(os.path.join(os.path.dirname(__file__), 'scripts/nvtx_startend_trace.sql'), 'r') as query: + # conn is the SQLAlchemy connection to the .sqlite file that shares REPORT_FILE's base name + nvtx_startend_df = pd.read_sql_query(query.read(), conn) +else: + nvtx_startend_df = pd.DataFrame() + if t_mpi: mpi_df = pd.read_csv(build_nsys_stats_name("mpi_event_trace")) else: @@ -171,6 +186,7 @@ if t_metrics: if t_apicalls: print("CUDA calls unique processes: {}, and unique threads: {}".format(cuda_api_df["Pid"].unique(), cuda_api_df["Tid"].unique())) if t_nvtx: print("NVTX ranges unique processes: {}, and unique threads: {}".format(nvtx_df["PID"].unique(), nvtx_df["TID"].unique())) +if t_nvtx_startend: print("NVTX startend unique processes: {}, and unique threads: 
{}".format(nvtx_startend_df["Pid"].unique(), nvtx_startend_df["Tid"].unique())) if t_mpi: print("MPI ranges unique processes: {}, and unique threads: {}".format(mpi_df["Pid"].unique(), mpi_df["Tid"].unique())) if t_nvtx: nvtx_df.rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) @@ -178,8 +194,10 @@ if t_nvtx: nvtx_df.rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) compute_threads_with = [] if t_apicalls: compute_threads_with.append(cuda_api_df[['Pid', 'Tid']]) if t_nvtx: compute_threads_with.append(nvtx_df[["Pid", "Tid"]]) +if t_nvtx_startend: compute_threads_with.append(nvtx_startend_df[["Pid", "Tid"]]) if t_mpi: compute_threads_with.append(mpi_df[["Pid", "Tid"]]) + threads = pd.concat(compute_threads_with).drop_duplicates() threads.sort_values(["Pid"], inplace=True) threads["thread"] = threads.groupby(["Pid"]).cumcount() + 1 @@ -197,6 +215,8 @@ cuda_api_df["thread"] = 0 cuda_api_df["task"] = 0 nvtx_df["thread"] = 0 nvtx_df["task"] = 0 +nvtx_startend_df["thread"] = 0 +nvtx_startend_df["task"] = 0 mpi_df["thread"] = 0 mpi_df["task"] = 0 @@ -218,6 +238,10 @@ if t_nvtx: nvtx_df["thread"] = nvtx_df["Tid"].map(threads.set_index('Tid')["thread"]) nvtx_df["task"] = nvtx_df["Tid"].map(threads.set_index('Tid')["task"]) +if t_nvtx_startend: + nvtx_startend_df["thread"] = nvtx_startend_df["Tid"].map(threads.set_index('Tid')["thread"]) + nvtx_startend_df["task"] = nvtx_startend_df["Tid"].map(threads.set_index('Tid')["task"]) + if t_mpi: mpi_df["thread"] = mpi_df["Tid"].map(threads.set_index('Tid')["thread"]) mpi_df["task"] = mpi_df["Tid"].map(threads.set_index('Tid')["task"]) @@ -335,6 +359,12 @@ if t_nvtx: ranges_names = nvtx_df_subset[['event_value', 'Name']].drop_duplicates() ranges_names.sort_values("event_value", inplace=True) +if t_nvtx_startend: + nvtx_startend_df["event_value"] = nvtx_startend_df.groupby(["tag"]).ngroup() + 1 + nvtx_startend_names = nvtx_startend_df[['tag', 'event_value']].drop_duplicates() + 
nvtx_startend_names.sort_values("event_value", inplace=True) + # nvtx_startend_names holds one (tag, event_value) row per distinct tag; it feeds the pcf VALUES section + print("-\tWriting pcf file...") @@ -437,13 +467,22 @@ if t_metrics: if t_nvtx: pcf_file.write("EVENT_TYPE\n") - pcf_file.write("0 {} NVTX ranges\n".format(event_type_nvtx)) + pcf_file.write("0 {} NVTX pushpop ranges\n".format(event_type_nvtx)) pcf_file.write("VALUES\n") pcf_file.write("0 End\n") for index, row in ranges_names.iterrows(): pcf_file.write("{} {}\n".format(row["event_value"], row["Name"])) pcf_file.write("\n") +if t_nvtx_startend: + pcf_file.write("EVENT_TYPE\n") + pcf_file.write("0 {} NVTX startend ranges\n".format(event_type_nvtx_startend)) + pcf_file.write("VALUES\n") + pcf_file.write("0 End\n") + for index, row in nvtx_startend_names.iterrows(): + pcf_file.write("{} {}\n".format(row["event_value"], row["tag"])) + pcf_file.write("\n") + pcf_file.close() # # Split of kernel execution between compute and memory @@ -508,6 +547,7 @@ applist = applist + ")" compute_max_with = [] if t_apicalls: compute_max_with.append((cuda_api_df["Start (ns)"] + cuda_api_df["Duration (ns)"]).max()) if t_nvtx: compute_max_with.append(nvtx_df["End (ns)"].max()) +if t_nvtx_startend: compute_max_with.append((nvtx_startend_df["start"] + nvtx_startend_df["duration"]).max()) if t_mpi: compute_max_with.append(mpi_df["End (ns)"].max()) ftime = max(compute_max_with) @@ -547,12 +587,19 @@ if t_apicalls: chunk = "" if t_nvtx: - print("-\tWriting NVTX ranges...") + print("-\tWriting NVTX pushpop ranges...") for index, row in nvtx_df_subset.iterrows(): chunk += create_event_record(row.iloc[0], row.iloc[2], int(row["thread"]), int(row["task"]), event_type_nvtx, row["event_value"]) prv_file.write(chunk) chunk = "" +if t_nvtx_startend: + print("-\tWriting NVTX startend ranges...") + for index, row in nvtx_startend_df.iterrows(): + chunk += create_event_record(row["start"], row["duration"], int(row["thread"]), int(row["task"]), event_type_nvtx_startend, row["event_value"]) + prv_file.write(chunk) + chunk = "" + if t_mpi: print("-\tWriting MPI 
events...") for index, row in mpi_df.iterrows(): diff --git a/scripts/nvtx_startend_trace.sql b/scripts/nvtx_startend_trace.sql new file mode 100644 index 0000000000000000000000000000000000000000..c52b66ab41845276a2c187401b4da0869b03f5ba --- /dev/null +++ b/scripts/nvtx_startend_trace.sql @@ -0,0 +1,51 @@ +WITH + domains AS ( + SELECT + min(start), + domainId AS id, + globalTid AS globalTid, + text AS name + FROM + NVTX_EVENTS + WHERE + eventType == 75 + GROUP BY 2, 3 + ), + maxts AS( + SELECT max(max(start), max(end)) AS m + FROM NVTX_EVENTS + ), + nvtx AS ( + SELECT + ne.start as start, + ne.end as end, + coalesce(ne.end, (SELECT m FROM maxts)) - ne.start AS duration, + CASE + WHEN d.name NOT NULL AND sid.value IS NOT NULL + THEN d.name || ':' || sid.value + WHEN d.name NOT NULL AND sid.value IS NULL + THEN d.name || ':' || ne.text + WHEN d.name IS NULL AND sid.value NOT NULL + THEN sid.value + ELSE ne.text + END AS tag, + (ne.globalTid / 0x1000000 % 0x1000000) as Pid, + (ne.globalTid % 0x1000000) as Tid + FROM + NVTX_EVENTS AS ne + LEFT OUTER JOIN + domains AS d + ON ne.domainId == d.id + AND (ne.globalTid & 0x0000FFFFFF000000) == (d.globalTid & 0x0000FFFFFF000000) + LEFT OUTER JOIN + StringIds AS sid + ON ne.textId == sid.id + WHERE + ne.eventType == 60 + OR + ne.eventType == 71 + ) +SELECT + * + FROM + nvtx \ No newline at end of file