diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000000000000000000000000000000000000..95c3117c7c10c8bf7719b6404e5b50edf5fe09fb
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,32 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Current File with Arguments",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "nsys2prv/parse_nsys_stats.py",
+            "console": "integratedTerminal",
+            "args": [
+                "-t",
+                "cuda_api_trace,mpi_event_trace,gpu_metrics",
+                "-m",
+                "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_0.nsys-rep",
+                "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_1.nsys-rep",
+                "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_2.nsys-rep",
+                "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_3.nsys-rep",
+                "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_4.nsys-rep",
+                "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_5.nsys-rep",
+                "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_6.nsys-rep",
+                "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_7.nsys-rep",
+                "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_ricardo_metrics_4nodes_more"
+            ],
+            "env": {
+                "NSYS_HOME": "/home/mclasca/Apps/nsight-system/2024.5.1"
+            }
+        }
+    ]
+}
\ No newline at end of file
diff --git a/nsys2prv/NSYSInterface.py b/nsys2prv/NSYSInterface.py
new file mode 100644
index 0000000000000000000000000000000000000000..9034cf7debdc4f0566a65a8f004add68eb9c6425
--- /dev/null
+++ b/nsys2prv/NSYSInterface.py
@@ -0,0 +1,67 @@
+import subprocess
+import os
+
+class NSYSInterface():
+
+    def __init__(self, types, filter_nvtx, range_nvtx, force_sqlite):
+        self.use_path = True
+        self.nsys_binary = ("nsys",)
+
+        if 'NSYS_HOME' in os.environ:
+            self.NSYS_HOME = os.path.abspath(os.getenv('NSYS_HOME'))
+            self.use_path = False
+
+        if self.use_path:
+            self.nsys_binary = ("nsys",)
+        else:
+            self.nsys_binary = (os.path.join(self.NSYS_HOME, "bin/nsys"),)
+
+        self.types = types
+        self.filter = filter_nvtx
+        self.range_nvtx = range_nvtx
+        self.force = force_sqlite
+
+    def check_export_report(self, rf):
+        if not os.path.exists(f"{os.path.splitext(os.path.basename(rf))[0]}.sqlite") or self.force:
+            #Try exporting first
+            export_call = self.nsys_binary + ("export", "-t", "sqlite", rf)
+            try:
+                with subprocess.Popen(export_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
+                    for line in p.stdout:
+                        print(line.decode(), end='')
+
+                if p.returncode != 0:
+                    raise ChildProcessError(p.returncode, p.args)
+            except FileNotFoundError:
+                print("You don't have an Nsight Systems installation in your PATH. Please install Nsight Systems, or locate your installation by adding it to PATH or by setting the NSYS_HOME environment variable.")
+                exit(1)
+            except ChildProcessError:
+                print("Could not export SQLite database. Exiting.")
+                exit(1)
+
+    def call_stats(self, report):
+        nsys_call = self.nsys_binary + ("stats", "-r", ",".join(self.types),
+                    "--timeunit", "nsec", "-f", "csv",
+                    "--force-overwrite", "true", "-o", ".")
+        if self.filter:
+            nsys_call += ("--filter-nvtx="+self.range_nvtx,)
+
+        nsys_call += (report,)
+
+        try:
+            with subprocess.Popen(nsys_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
+                for line in p.stdout:
+                    print(line.decode(), end='')
+
+            if p.returncode != 0:
+                raise ChildProcessError(p.returncode, p.args)
+        except FileNotFoundError:
+            print("You don't have an Nsight Systems installation in your PATH. Please install Nsight Systems, or locate your installation by adding it to PATH or by setting the NSYS_HOME environment variable.")
+            exit(1)
+
+    def build_nsys_stats_name(self, rf, rd, report_name):
+        base_name = os.path.splitext(os.path.basename(rf))[0]
+        if self.filter:
+            return os.path.join(rd, base_name+"_{}_nvtx={}.csv".format(report_name, self.range_nvtx))
+        else:
+            return os.path.join(rd, base_name+"_{}.csv".format(report_name))
\ No newline at end of file
diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py
index 8a2566008c06d800d145b6de8ee76b67188eabc0..48858feb37289aee14406ae9e14917c704774e04 100755
--- a/nsys2prv/parse_nsys_stats.py
+++ b/nsys2prv/parse_nsys_stats.py
@@ -8,11 +8,13 @@ import time
 import subprocess
 import os
 import locale
+from functools import reduce
 from sqlalchemy import create_engine, text, dialects
 from sqlalchemy.exc import OperationalError
 from .EventWriter import event_writer as ewr
+from .NSYSInterface import NSYSInterface
 from .semantics.mpi_event_encoding import *
-
+from .semantics import *
 
 def main():
     locale.setlocale(locale.LC_ALL, '')
@@ -31,6 +33,7 @@ def main():
     parser.add_argument("-v", "--version", nargs=0, help="Show version and exit.", action=ShowVersion)
     parser.add_argument("-f", "--filter-nvtx", help="Filter by this NVTX range")
     parser.add_argument("-t", "--trace", help="Comma separated names of events to translate: [mpi_event_trace, nvtx_pushpop_trace, nvtx_startend_trace, cuda_api_trace, gpu_metrics, openacc]")
+    parser.add_argument("-m", "--multi-report", action="store_true", help="Translate multiple reports of the same execution into one trace.")
     parser.add_argument("--force-sqlite", action="store_true", help="Force Nsight System to export SQLite database")
 
@@ -39,23 +42,24 @@
     #parser.add_argument("-n", "--nvtx-stack-range", nargs=2, type=int)
 
-    parser.add_argument("source_rep", help="Nsight source report file")
+    parser.add_argument("source_rep", nargs="+", help="Nsight source report file")
     parser.add_argument("output", help="Paraver output trace name")
 
     args = parser.parse_args()
 
     # # Trace configuration and setup
-
-    use_path = True
-
-    if 'NSYS_HOME' in os.environ:
-        NSYS_HOME = os.path.abspath(os.getenv('NSYS_HOME'))
-        use_path = False
 
     PARAVER_HOME = os.getenv('PARAVER_HOME')
-    REPORT_FILE = os.path.abspath(args.source_rep)
-    REPORT_DIR = os.path.dirname(REPORT_FILE)
+    MULTIREPORT = args.multi_report
+    if MULTIREPORT:
+        REPORTS_LIST = [os.path.abspath(x) for x in args.source_rep]
+        REPORT_DIRS_LIST = [os.path.dirname(x) for x in REPORTS_LIST]
+        REPORT_FILE = REPORTS_LIST[0] # For fast checks, it's best to have a reference report
+    else:
+        REPORT_FILE = os.path.abspath(args.source_rep[0])
+        REPORT_DIR = os.path.dirname(REPORT_FILE)
+
     trace_name = args.output
 
     NVTX_FILTER = args.filter_nvtx != None
@@ -126,37 +130,18 @@ def main():
     nvtx_stack_top = 1
     nvtx_stack_bottom = 4
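# --- Editor's note (illustrative sketch, not part of the diff) ---------------
# The new NSYSInterface wrapper only composes and spawns plain `nsys` CLI
# invocations. For a single report, the two command tuples it builds look like
# the sketch below; "report.nsys-rep" is a placeholder name, not a file from
# this changeset.
#
#   nsys_binary = ("nsys",)  # or (os.path.join(NSYS_HOME, "bin/nsys"),)
#   export_call = nsys_binary + ("export", "-t", "sqlite", "report.nsys-rep")
#   stats_call  = nsys_binary + ("stats", "-r", "cuda_api_trace,cuda_gpu_trace",
#                                "--timeunit", "nsec", "-f", "csv",
#                                "--force-overwrite", "true", "-o", ".",
#                                "report.nsys-rep")
#
# check_export_report() runs the export only when the .sqlite file is missing
# (or --force-sqlite was given); call_stats() runs the stats command once per
# report.
# ------------------------------------------------------------------------------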
+    nsi = NSYSInterface(reports, NVTX_FILTER, NVTX_RANGE, args.force_sqlite)
 
-    def build_nsys_stats_name(report_name):
-        base_name = os.path.splitext(os.path.basename(REPORT_FILE))[0]
-        if NVTX_FILTER:
-            return os.path.join(REPORT_DIR, base_name+"_{}_nvtx={}.csv".format(report_name, NVTX_RANGE))
-        else:
-            return os.path.join(REPORT_DIR, base_name+"_{}.csv".format(report_name))
-
-
+    if MULTIREPORT:
+        print(f"Multiple reports provided: {REPORTS_LIST}")
 
     print("Extracting reports for: {}".format(reports_og))
 
-    if use_path:
-        nsys_binary = ("nsys",)
-    else:
-        nsys_binary = (os.path.join(NSYS_HOME, "bin/nsys"),)
 
-    if not os.path.exists(f"{os.path.splitext(os.path.basename(REPORT_FILE))[0]}.sqlite"):
-        #Try exporting first
-        export_call = nsys_binary + ("export", "-t", "sqlite", REPORT_FILE)
-        try:
-            with subprocess.Popen(export_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
-                for line in p.stdout:
-                    print(line.decode(), end='')
-
-            if p.returncode != 0:
-                raise ChildProcessError(p.returncode, p.args)
-        except FileNotFoundError:
-            print("You don't have an Nsight Systems installation in your PATH. Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.")
-            exit(1)
-        except ChildProcessError:
-            print("Could not export SQLite database. Exiting.")
-            exit(1)
+    if MULTIREPORT:
+        for REPORT_FILE_I in REPORTS_LIST:
+            print(f"Exporting SQLite database for {os.path.basename(REPORT_FILE_I)}")
+            nsi.check_export_report(REPORT_FILE_I)
+    else:
+        nsi.check_export_report(REPORT_FILE)
 
     engine = create_engine(f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
     metadata = pd.read_sql_table("META_DATA_EXPORT", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
@@ -164,149 +149,297 @@
     if int(minor_version["value"].iloc[0]) > 11:
         print(f"\033[93m Warning! The SQLite schema version {int(minor_version["value"].iloc[0])} is greater than the one supported (11). If unexpected behaviour occurs, please report it. \033[00m")
 
-    nsys_call = nsys_binary + ("stats", "-r", ",".join(reports),
-                "--timeunit", "nsec", "-f", "csv",
-                "--force-overwrite", "true", "-o", ".")
-
-    if NVTX_FILTER:
-        nsys_call += ("--filter-nvtx="+NVTX_RANGE,)
-
-    if args.force_sqlite:
-        nsys_call += ("--force-export", "true")
-
-    nsys_call += (REPORT_FILE,)
-
-    try:
-        with subprocess.Popen(nsys_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
-            for line in p.stdout:
-                print(line.decode(), end='')
-
-        if p.returncode != 0:
-            raise ChildProcessError(p.returncode, p.args)
-    except FileNotFoundError:
-        print("You don't have an Nsight Systems installation in your PATH. 
Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.") - exit(1) + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + print(f"Processing stats for {os.path.basename(REPORT_FILE_I)}") + nsi.call_stats(REPORT_FILE_I) + else: + nsi.call_stats(REPORT_FILE) + # MARK: IMPORT DATASETS print("Importing datasets...") - # kernels_df = pd.read_csv(build_nsys_stats_name("cuda_gpu_trace")) - # kernels_df.rename(columns={"CorrId": "CorrID"}, inplace=True) - with engine.connect() as conn, conn.begin(): - with open(os.path.join(os.path.dirname(__file__), 'scripts/kernels.sql'), 'r') as query: - kernels_df = pd.read_sql_query(text(query.read()), conn) + kernels_df = [] + if MULTIREPORT: + sum = 0 + for REPORT_FILE_I in REPORTS_LIST: + ksi = KernelsSemantic(REPORT_FILE_I) + ksi.Setup() + ksi.load_data() + kernels_df.append(ksi.get_df()) + sum += ksi.get_df().shape[0] + del ksi + else: + ks = KernelsSemantic(REPORT_FILE) + ks.Setup() + ks.load_data() + kernels_df = ks.get_df() if t_apicalls: - cuda_api_df = pd.read_csv(build_nsys_stats_name("cuda_api_trace")) + cuda_api_df = [] + if MULTIREPORT: + for i, REPORT_FILE_I in enumerate(REPORTS_LIST): + cuda_api_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], "cuda_api_trace"))) + else: + cuda_api_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, "cuda_api_trace")) else: cuda_api_df = pd.DataFrame() if t_nvtx: - nvtx_df = pd.read_csv(build_nsys_stats_name("nvtx_pushpop_trace")) - nvtx_df["domain"] = nvtx_df["Name"].str.split(":").str[0] + nvtx_df = [] + if MULTIREPORT: + for i, REPORT_FILE_I in enumerate(REPORTS_LIST): + nvtx_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], "nvtx_pushpop_trace"))) + nvtx_df[i]["domain"] = nvtx_df[i]["Name"].str.split(":").str[0] + nvtx_df[i].rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) + + else: + nvtx_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, "nvtx_pushpop_trace")) + nvtx_df["domain"] = nvtx_df["Name"].str.split(":").str[0] + nvtx_df.rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) + else: nvtx_df = pd.DataFrame() if t_nvtx_startend: - with engine.connect() as conn, conn.begin(): - with open(os.path.join(os.path.dirname(__file__), 'scripts/nvtx_startend_trace.sql'), 'r') as query: - nvtx_startend_df = pd.read_sql_query(text(query.read()), conn) + nvtx_startend_df = [] + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + ksi = NVTXStartEndSemantic(REPORT_FILE_I) + ksi.Setup() + ksi.load_data() + nvtx_startend_df.append(ksi.get_df()) + del ksi + else: + ks = NVTXStartEndSemantic(REPORT_FILE) + ks.Setup() + ks.load_data() + nvtx_startend_df = ks.get_df() + del ks else: nvtx_startend_df = pd.DataFrame() if t_mpi: - with engine.connect() as conn, conn.begin(): - try: - with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_p2p.sql'), 'r') as query: - if conn.dialect.has_table(connection=conn, table_name='MPI_P2P_EVENTS') and conn.dialect.has_table(connection=conn, table_name='MPI_START_WAIT_EVENTS'): - mpi_p2p_df = pd.read_sql_query(text(query.read()), conn) - mpi_p2p_df["event_type"] = MPITYPE_PTOP - else: mpi_p2p_df = pd.DataFrame() - with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_coll.sql'), 'r') as query: - if conn.dialect.has_table(connection=conn, table_name='MPI_COLLECTIVES_EVENTS'): - mpi_coll_df = pd.read_sql_query(text(query.read()), conn) - mpi_coll_df = 
mpi_coll_df.drop(mpi_coll_df[mpi_coll_df["Event"].str.contains("File") ].index)
-                    mpi_coll_df["event_type"] = MPITYPE_COLLECTIVE
-                else: mpi_coll_df = pd.DataFrame()
-            with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_other.sql'), 'r') as query:
-                if conn.dialect.has_table(connection=conn, table_name='MPI_OTHER_EVENTS'):
-                    mpi_other_df = pd.read_sql_query(text(query.read()), conn)
-                    mpi_other_df = mpi_other_df.drop(mpi_other_df[mpi_other_df["Event"].str.contains("File") ].index)
-                    mpi_other_df = mpi_other_df.drop(mpi_other_df[mpi_other_df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate") ].index)
-                    mpi_other_df["event_type"] = MPITYPE_OTHER
-                else: mpi_other_df = pd.DataFrame()
-            with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_other.sql'), 'r') as query:
-                if conn.dialect.has_table(connection=conn, table_name='MPI_OTHER_EVENTS'):
-                    mpi_rma_df = pd.read_sql_query(text(query.read()), conn)
-                    mpi_rma_df = mpi_rma_df[mpi_rma_df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate")]
-                    mpi_rma_df["event_type"] = MPITYPE_RMA
-                else: mpi_rma_df = pd.DataFrame()
-            with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_io.sql'), 'r') as query:
-                if conn.dialect.has_table(connection=conn, table_name='MPI_OTHER_EVENTS') and conn.dialect.has_table(connection=conn, table_name='MPI_COLLECTIVES_EVENTS'):
-                    mpi_io_df = pd.read_sql_query(text(query.read()), conn)
-                    mpi_io_df = mpi_io_df[mpi_io_df["Event"].str.contains("File")]
-                    mpi_io_df["event_type"] = MPITYPE_IO
-                else: mpi_io_df = pd.DataFrame()
-            mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df])
-        except OperationalError as oe:
-            print("There has been a problem fetching MPI information. MPI data will be skipped.")
-            print(f"[ERROR]: {oe.detail}")
-            t_mpi = False
-        #mpi_df = pd.read_csv(build_nsys_stats_name("mpi_event_trace"))
+        mpi_df = []
+        try:
+            if MULTIREPORT:
+                for REPORT_FILE_I in REPORTS_LIST:
+                    kp2pi = MPIP2PSemantic(REPORT_FILE_I)
+                    kp2pi.Setup()
+                    kp2pi.load_data()
+
+                    kcolli = MPICollSemantic(REPORT_FILE_I)
+                    kcolli.Setup()
+                    kcolli.load_data()
+
+                    kotheri = MPIOtherSemantic(REPORT_FILE_I)
+                    kotheri.Setup()
+                    kotheri.load_data()
+
+                    krmai = MPIRMASemantic(REPORT_FILE_I)
+                    krmai.Setup()
+                    krmai.load_data()
+
+                    kioi = MPIIOPSemantic(REPORT_FILE_I)
+                    kioi.Setup()
+                    kioi.load_data()
+
+                    mpi_df.append(pd.concat([kp2pi.get_df(), kcolli.get_df(), kotheri.get_df(), krmai.get_df(), kioi.get_df()], ignore_index=True))
+                    del kp2pi, kcolli, kotheri, krmai, kioi
+            else:
+                kmpi = MPIP2PSemantic(REPORT_FILE)
+                kmpi.Setup()
+                kmpi.load_data()
+                mpi_p2p_df = kmpi.get_df()
+
+                kmpi = MPICollSemantic(REPORT_FILE)
+                kmpi.Setup()
+                kmpi.load_data()
+                mpi_coll_df = kmpi.get_df()
+
+                kmpi = MPIOtherSemantic(REPORT_FILE)
+                kmpi.Setup()
+                kmpi.load_data()
+                mpi_other_df = kmpi.get_df()
+
+                kmpi = MPIRMASemantic(REPORT_FILE)
+                kmpi.Setup()
+                kmpi.load_data()
+                mpi_rma_df = kmpi.get_df()
+
+                kmpi = MPIIOPSemantic(REPORT_FILE)
+                kmpi.Setup()
+                kmpi.load_data()
+                mpi_io_df = kmpi.get_df()
+                mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df], ignore_index=True)
+                del kmpi, mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df
+        except OperationalError as oe:
+            print("There has been a problem fetching MPI information. MPI data will be skipped.")
+            print(f"[ERROR]: {oe.args[0]}")
+            t_mpi = False
     else:
-        #mpi_df = pd.DataFrame()
-        mpi_p2p_df = pd.DataFrame()
-        mpi_coll_df = pd.DataFrame()
-        mpi_other_df = pd.DataFrame()
-        mpi_rma_df = pd.DataFrame()
-        mpi_io_df = pd.DataFrame()
-
-    # Obtain context Info
-    context_info = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
+        mpi_df = pd.DataFrame()
+
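# --- Editor's note (illustrative sketch, not part of the diff) ---------------
# Every importer in this section follows the same Setup()/load_data()/get_df()
# protocol inherited from NsysEvent, so the repeated per-report loops could be
# collapsed with a small helper. `load_semantic` is a hypothetical name, not
# something this PR ships:
#
#   def load_semantic(semantic_cls, report_files):
#       """Instantiate one semantic object per report, return their frames."""
#       dfs = []
#       for rf in report_files:
#           sem = semantic_cls(rf)
#           sem.Setup()        # pick the SQL query, or mark missing tables
#           sem.load_data()    # run the query plus the class's _preprocess()
#           dfs.append(sem.get_df())
#       return dfs
#
#   # e.g. kernels_df = load_semantic(KernelsSemantic, REPORTS_LIST)
# ------------------------------------------------------------------------------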
+    gpu_metrics_agg = []
+    metrics_event_names = []
+    if t_metrics:
+        if MULTIREPORT:
+            for REPORT_FILE_I in REPORTS_LIST:
+                ksi = GPUMetricsSemantic(REPORT_FILE_I)
+                ksi.Setup()
+                ksi.load_data()
+                gpu_metrics_agg.append(ksi.get_df())
+                metrics_event_names.append(ksi.get_names())
+                del ksi
+        else:
+            ks = GPUMetricsSemantic(REPORT_FILE)
+            ks.Setup()
+            ks.load_data()
+            gpu_metrics_agg = ks.get_df()
+            metrics_event_names = ks.get_names()
+            del ks
+
+    if t_openacc:
+        if MULTIREPORT:
+            openacc_other_df = []
+            openacc_launch_df = []
+            openacc_data_df = []
+            for REPORT_FILE_I in REPORTS_LIST:
+                ksio = OpenACCOtherSemantic(REPORT_FILE_I)
+                ksio.Setup()
+                ksio.load_data()
+                openacc_other_df.append(ksio.get_df())
+                ksil = OpenACCLaunchSemantic(REPORT_FILE_I)
+                ksil.Setup()
+                ksil.load_data()
+                openacc_launch_df.append(ksil.get_df())
+                ksid = OpenACCDataSemantic(REPORT_FILE_I)
+                ksid.Setup()
+                ksid.load_data()
+                openacc_data_df.append(ksid.get_df())
+                del ksio, ksil, ksid
+        else:
+            kso = OpenACCOtherSemantic(REPORT_FILE)
+            kso.Setup()
+            kso.load_data()
+            openacc_other_df = kso.get_df()
+            ksl = OpenACCLaunchSemantic(REPORT_FILE)
+            ksl.Setup()
+            ksl.load_data()
+            openacc_launch_df = ksl.get_df()
+            ksd = OpenACCDataSemantic(REPORT_FILE)
+            ksd.Setup()
+            ksd.load_data()
+            openacc_data_df = ksd.get_df()
+            del kso, ksl, ksd
+        openacc_event_kind = pd.read_sql_table("ENUM_OPENACC_EVENT_KIND", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
+
+
+    # MARK: CONTEXT INFO
+    list_contexts = []
+    list_hostnames = []
+    if MULTIREPORT:
+        for REPORT_FILE_I in REPORTS_LIST:
+            context_info_i = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite")
+            target_system_env_i = pd.read_sql_table("TARGET_INFO_SYSTEM_ENV", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite")
+            hostname = target_system_env_i.loc[target_system_env_i["name"] == "Hostname"]["value"].iloc[0]
+            context_info_i["hostname"] = hostname
+            list_hostnames.append(hostname)
+            list_contexts.append(context_info_i)
+        context_info = pd.concat(list_contexts)
+    else:
+        context_info = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
+    context_info.sort_values(["processId"], inplace=True)
+
+    # CONTEXT INFO CHECK FOR MULTIREPORT
+    #if context_info.groupby(["hostname"]).agg({'deviceId': 'count'})
+    if context_info["deviceId"].unique().size == 1:
+        print(f"\033[93m Warning! Only one unique device ID was detected during resource identification. If this is not intended, some features will not be available. Please make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. \033[00m")
+
 
     if t_mpi:
-        mpi_query = "SELECT globalTid / 0x1000000 % 0x1000000 AS Pid, globalTid % 0x1000000 AS Tid, rank FROM MPI_RANKS;"
-        with engine.connect() as conn, conn.begin():
-            rank_info = pd.read_sql_query(mpi_query, conn)
+        if MULTIREPORT:
+            list_ranks = []
+            for REPORT_FILE_I in REPORTS_LIST:
+                mpi_query = "SELECT globalTid / 0x1000000 % 0x1000000 AS Pid, globalTid % 0x1000000 AS Tid, rank FROM MPI_RANKS;"
+                engine = create_engine(f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite")
+                with engine.connect() as conn, conn.begin():
+                    list_ranks.append(pd.read_sql_query(mpi_query, conn))
+            rank_info = pd.concat(list_ranks)
 
-    context_info.sort_values(["processId"], inplace=True)
-    
-    if t_metrics:
-        gpu_metrics = pd.read_sql_table("GPU_METRICS", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
-        metrics_description = pd.read_sql_table("TARGET_INFO_GPU_METRICS", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
-        gpu_metrics.drop(gpu_metrics[gpu_metrics["timestamp"] < 0].index, inplace=True) # drop negative time
-        metrics_event_names = metrics_description.groupby(["metricId"]).agg({'metricName': 'first'}).reset_index()
-        metrics_event_names["metricId"] = metrics_event_names["metricId"] + event_type_metrics_base
-        #gpu_metrics["task"] = gpu_metrics.groupby(["typeId"]).ngroup() + 1
-        gpu_metrics["deviceId"] = gpu_metrics["typeId"].apply(lambda x: x & 0xFF)
-        gpu_metrics_agg = gpu_metrics.groupby(["timestamp", "typeId"]).agg({'metricId': lambda x: list(x+event_type_metrics_base),
-                                             'value': lambda x: list(x),
-                                             'deviceId': 'first'})
-        gpu_metrics_agg.reset_index(inplace=True)
+    # MARK: MERGING AND ALIGNING
+    if MULTIREPORT:
+        # Find delta between earliest trace start and the others
+        session_time = []
+        for REPORT_FILE_I in REPORTS_LIST:
+            session_time.append(pd.read_sql_table("TARGET_INFO_SESSION_START_TIME", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite"))
+
+        session_time = [x.iloc[0,0] for x in session_time] # Get the utcEpochNs
+        earliest_time = reduce(lambda x, y: min(x, y), session_time, float('inf'))
+        deltas = [start - earliest_time for start in session_time]
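# --- Editor's note (worked example, not part of the diff) --------------------
# The alignment below is plain arithmetic on the per-report session starts.
# With illustrative utcEpochNs values for three reports:
#
#   session_time  = [1_000_500, 1_000_200, 1_000_350]
#   earliest_time = 1_000_200               # the minimum of the three
#   deltas        = [300, 0, 150]           # session_time[i] - earliest_time
#
# Adding deltas[i] to every timestamp of report i expresses all events on the
# clock of the earliest-started report, so the concatenated trace is coherent.
# ------------------------------------------------------------------------------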
+        for i, df in enumerate(kernels_df):
+            df['Start (ns)'] += deltas[i]
+        kernels_df = pd.concat(kernels_df, ignore_index=True)
+
+        if t_apicalls:
+            for i, df in enumerate(cuda_api_df):
+                df['Start (ns)'] += deltas[i]
+            cuda_api_df = pd.concat(cuda_api_df, ignore_index=True)
 
-    if t_openacc:
-        with engine.connect() as conn, conn.begin():
-            with open(os.path.join(os.path.dirname(__file__), 'scripts/openacc_other.sql'), 'r') as query:
-                openacc_other_df = pd.read_sql_query(text(query.read()), conn)
-            with open(os.path.join(os.path.dirname(__file__), 'scripts/openacc_launch.sql'), 'r') as query:
-                openacc_launch_df = pd.read_sql_query(text(query.read()), conn)
-            with open(os.path.join(os.path.dirname(__file__), 'scripts/openacc_data.sql'), 'r') as query:
-                openacc_data_df = pd.read_sql_query(text(query.read()), conn)
-            openacc_event_kind = pd.read_sql_table("ENUM_OPENACC_EVENT_KIND", conn)
+        if t_nvtx:
+            for i, df in enumerate(nvtx_df):
+                df['Start (ns)'] += deltas[i]
+                df['End (ns)'] += deltas[i]
+            nvtx_df = pd.concat(nvtx_df, ignore_index=True)
+
+        if t_nvtx_startend:
+            for i, df in enumerate(nvtx_startend_df):
+                df['Start (ns)'] += deltas[i]
+                df['End (ns)'] += deltas[i]
+            nvtx_startend_df = pd.concat(nvtx_startend_df, ignore_index=True)
 
+        if t_mpi:
+            for i, df in enumerate(mpi_df):
+                df['Start:ts_ns'] += deltas[i]
+                df['End:ts_ns'] += deltas[i]
+            mpi_df = pd.concat(mpi_df, ignore_index=True)
+
+        if 
t_openacc: + for i, df in enumerate(openacc_other_df): + df['start'] += deltas[i] + df['end'] += deltas[i] + for i, df in enumerate(openacc_launch_df): + df['start'] += deltas[i] + df['end'] += deltas[i] + for i, df in enumerate(openacc_data_df): + df['start'] += deltas[i] + df['end'] += deltas[i] + openacc_other_df = pd.concat(openacc_other_df, ignore_index=True) + openacc_launch_df = pd.concat(openacc_launch_df, ignore_index=True) + openacc_data_df = pd.concat(openacc_data_df, ignore_index=True) + + if t_metrics: + for i, df in enumerate(gpu_metrics_agg): + if not df.empty: + df['timestamp'] += deltas[i] + # Complement with processId and node info + df['Pid'] = df['deviceId'].map(context_info[context_info["hostname"] == list_hostnames[i]].set_index("deviceId")["processId"]) + gpu_metrics_agg = pd.concat(gpu_metrics_agg, ignore_index=True) + metrics_event_names = pd.concat(metrics_event_names, ignore_index=True).drop_duplicates() - # # Building object model + # MARK: PROCESS MODEL # ## Tasks and threads # Now, find unique appearences of ThreadID and ProcessID if t_apicalls: print("CUDA calls unique processes: {}, and unique threads: {}".format(cuda_api_df["Pid"].unique(), cuda_api_df["Tid"].unique())) - if t_nvtx: print("NVTX ranges unique processes: {}, and unique threads: {}".format(nvtx_df["PID"].unique(), nvtx_df["TID"].unique())) + if t_nvtx: print("NVTX ranges unique processes: {}, and unique threads: {}".format(nvtx_df["Pid"].unique(), nvtx_df["Tid"].unique())) if t_nvtx_startend: print("NVTX startend unique processes: {}, and unique threads: {}".format(nvtx_startend_df["Pid"].unique(), nvtx_startend_df["Tid"].unique())) if t_mpi: print("MPI calls unique processes: {}, and unique threads: {}".format(mpi_df["Pid"].unique(), mpi_df["Tid"].unique())) if t_openacc: print("OpenACC calls unique processes: {}, and unique threads: {}".format(openacc_other_df["Pid"].unique(), openacc_other_df["Tid"].unique())) - if t_nvtx: nvtx_df.rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) compute_threads_with = [] if t_apicalls: compute_threads_with.append(cuda_api_df[['Pid', 'Tid']]) @@ -332,6 +465,7 @@ def main(): 'Tid': lambda x: set(x), 'thread': 'count', 'device': 'first' }) + print(tasks_set) cuda_api_df["thread"] = 0 cuda_api_df["task"] = 0 @@ -389,50 +523,51 @@ def main(): streams = kernels_df[['Device', 'Strm', 'deviceid', 'Pid']].drop_duplicates() - streams["thread"] = streams.groupby(["Device"]).cumcount() + 1 + streams["thread"] = streams.groupby(["Pid", "Device"]).cumcount() + 1 #streams["deviceid"] = streams.sort_values("Device").groupby(["Device"]).ngroup() #streams["Pid"] = streams["deviceid"].map(tasks_set.set_index("device")["Pid"]) - streams["task"] = streams["deviceid"].map(tasks_set.reset_index().set_index("device")["task"]) + streams["task"] = streams["Pid"].map(tasks_set.reset_index().set_index("Pid")["task"]) streams['row_name'] = 'CUDA-D'+streams['deviceid'].astype(str) + '.S' + streams['Strm'].astype(str) num_streams = streams.count().iloc[0] streams.sort_values(["Pid", "thread"], inplace=True) streams.reset_index(inplace=True) - devices_set = streams.groupby(["deviceid"]).agg({'Device': 'first', + devices_set = streams.groupby(["Pid", "deviceid"]).agg({'Device': 'first', 'Strm': lambda x: set(x), 'thread': 'count', - 'task': 'first', - 'Pid': 'last'}) + 'task': 'first'}) + print(devices_set) # Here we finally update the threadId we are going to put in the event record of kernel executions to respect the normal threads before CUDA streams num_normal_threads = 
tasks_set['thread'] num_normal_threads_repeated = num_normal_threads.repeat(devices_set["thread"]).reset_index()[["thread"]] - streams['thread'] = streams['thread'] + num_normal_threads_repeated["thread"] + # for index,row in kernels_df.iterrows(): # kernels_df.at[index, "thread"] = streams.at[(streams["Strm"] == row["Strm"]).idxmax(), "thread"] # kernels_df.at[index, "deviceid"] = streams.at[(streams["Device"] == row["Device"]).idxmax(), "deviceid"] # More efficient way by chatgpt # First, let's filter streams DataFrame based on conditions - filtered_streams = streams.groupby(["Device", "Strm"]).agg({'thread':'first', 'task':'first'}).reset_index() + filtered_streams = streams.groupby(["Pid", "Strm"]).agg({'thread':'first', 'task':'first'}).reset_index() # Now, merge the filtered streams DataFrame with kernels_df - result_df = kernels_df.merge(filtered_streams, how='left', on=['Device', 'Strm']) + result_df = kernels_df.merge(filtered_streams, how='left', on=["Pid", 'Strm']) + # Copy the results back to kernels_df - kernels_df['thread'] = result_df['thread'] - kernels_df['task'] = result_df['task'] + kernels_df['thread'] = result_df['thread'].to_numpy() + kernels_df['task'] = result_df['task'].to_numpy() # Add auxiliary stream to streams dataframe if t_metrics: - aux_streams = devices_set.reset_index()[["deviceid", "Device", "thread", "task"]] + aux_streams = devices_set.reset_index()[["deviceid", "Device", "thread", "task", "Pid"]] aux_streams["Strm"] = 99 aux_streams["row_name"] = "Metrics GPU"+aux_streams["deviceid"].astype(str) - aux_streams["Pid"] = aux_streams["deviceid"].map(tasks_set.set_index('device')["Pid"]) - aux_streams["thread"] = aux_streams["thread"] + aux_streams["deviceid"].map(tasks_set.set_index('device')['thread']) + 1 - gpu_metrics_agg["task"] = gpu_metrics_agg["deviceId"].map(devices_set["task"]) + #aux_streams["Pid"] = aux_streams["deviceid"].map(tasks_set.set_index('device')["Pid"]) + aux_streams["thread"] = aux_streams["thread"] + aux_streams["Pid"].map(tasks_set.set_index('Pid')['thread']) + 1 + gpu_metrics_agg["task"] = gpu_metrics_agg["Pid"].map(devices_set.reset_index().set_index("Pid")["task"]) gpu_metrics_agg["thread"] = gpu_metrics_agg["task"].map(aux_streams.set_index('task')["thread"]) streams = pd.concat([streams, aux_streams]).sort_values(['task', 'thread']) @@ -441,9 +576,6 @@ def main(): # ## Writing ROW file # Now we can write the _row_ file with this information - print(tasks_set) - print(devices_set) - print(" -Writing resource model to row file...") row_df = pd.concat([threads[["thread", "task", "row_name"]], streams[["thread", "task", "row_name"]]]) @@ -460,7 +592,7 @@ def main(): row_file.write("\n") - # # Collecting event values + # MARK: EVENT NAMES # Second step is collect all different event values for CUDA API calls, kernel names, and NVTX ranges. Each of these define a different event type, and will need unique identifiers to be used as a event values. Finally these needs to be dumped to the PCF file. 
print("Collecting event names and information...") @@ -490,7 +622,7 @@ def main(): kernel_names["Name"] = kernel_names["Name"].apply(lambda x: x.replace("[", "").replace("]", "")) if t_nvtx: - nvtx_df_subset = nvtx_df + nvtx_df_subset = nvtx_df.reset_index() lower_level = max(nvtx_df["Lvl"]) if nvtx_select_frames: @@ -791,9 +923,9 @@ GRADIENT_NAMES pcf_file.write("{} {}\n".format(row["func_value"], row["func"])) pcf_file.write("\n") + # MARK: MEMORY # # Split of kernel execution between compute and memory - memops_names = ["[CUDA memcpy Device-to-Device]", "[CUDA memcpy Device-to-Host]", "[CUDA memcpy Host-to-Device]", "[CUDA memset]", "[CUDA memcpy Peer-to-Peer]"] memops_df = kernels_df.loc[kernels_df["Name"].isin(memops_names)] mask = ~kernels_df.index.isin(memops_df.index) @@ -805,6 +937,7 @@ GRADIENT_NAMES comm_memory_df = cuda_api_df.merge(memops_df, how="inner", left_on=["CorrID", "task"], right_on=["CorrID", "task"], suffixes=("_call", "_mem"), validate="one_to_one") + # MARK: TIMELINE RECONS # # Timeline reconstruction print("Reconstructing timeline...") @@ -924,6 +1057,8 @@ GRADIENT_NAMES print(f"Congratulations! Trace {trace_name}.prv correctly translated.") + + # MARK: POSTPROCESSING # ## Postprocessing # - Reorder trace # - GZip trace diff --git a/nsys2prv/semantics/__init__.py b/nsys2prv/semantics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..19762807643a1502861a372168ee81c2c5a4bb3d --- /dev/null +++ b/nsys2prv/semantics/__init__.py @@ -0,0 +1,5 @@ +from .kernels_semantic import KernelsSemantic +from .mpi_semantic import * +from .nvtx_startend_semantic import NVTXStartEndSemantic +from .gpu_metrics_semantic import GPUMetricsSemantic +from .openacc_semantic import * \ No newline at end of file diff --git a/nsys2prv/semantics/gpu_metrics_semantic.py b/nsys2prv/semantics/gpu_metrics_semantic.py new file mode 100644 index 0000000000000000000000000000000000000000..6776bb446e1d3947ad0538c7c65e5e43c96c40a0 --- /dev/null +++ b/nsys2prv/semantics/gpu_metrics_semantic.py @@ -0,0 +1,33 @@ +from .nsys_event import NsysEvent +from pandas import read_sql_table, DataFrame +from sqlalchemy import text + +event_type_metrics_base = 9400 + + +class GPUMetricsSemantic(NsysEvent): + def __init__(self, report) -> None: + self.metrics_event_names = DataFrame() + super().__init__(report) + + def Setup(self): + if self.check_table("GPU_METRICS"): + self.query = text("SELECT * FROM GPU_METRICS") + else: + + self._empty = True + + def _preprocess(self): + metrics_description = read_sql_table("TARGET_INFO_GPU_METRICS", self._dbcon) + self._df.drop(self._df[self._df["timestamp"] < 0].index, inplace=True) # drop negative time + self.metrics_event_names = metrics_description.groupby(["metricId"]).agg({'metricName': 'first'}).reset_index() + self.metrics_event_names["metricId"] = self.metrics_event_names["metricId"] + event_type_metrics_base + self._df["deviceId"] = self._df["typeId"].apply(lambda x: x & 0xFF) + self._df = self._df.groupby(["timestamp", "typeId"]).agg({'metricId': lambda x: list(x+event_type_metrics_base), + 'value': lambda x: list(x), + 'deviceId': 'first'}) + self._df.reset_index(inplace=True) + return super()._preprocess() + + def get_names(self): + return self.metrics_event_names \ No newline at end of file diff --git a/nsys2prv/semantics/kernels_semantic.py b/nsys2prv/semantics/kernels_semantic.py new file mode 100644 index 0000000000000000000000000000000000000000..804128e614b22fb92dc08a23edab57aa0005d505 --- /dev/null +++ 
b/nsys2prv/semantics/kernels_semantic.py @@ -0,0 +1,12 @@ +from .nsys_event import NsysEvent +import os.path +from sqlalchemy import text + +class KernelsSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + with open(os.path.join(os.path.dirname(__file__), '../scripts/kernels.sql'), 'r') as query: + self.query = text(query.read()) + \ No newline at end of file diff --git a/nsys2prv/semantics/mpi_semantic.py b/nsys2prv/semantics/mpi_semantic.py new file mode 100644 index 0000000000000000000000000000000000000000..a4763766e1772da3443b5d4ce32441d0bcc38a33 --- /dev/null +++ b/nsys2prv/semantics/mpi_semantic.py @@ -0,0 +1,78 @@ +from .nsys_event import NsysEvent +import os.path +from .mpi_event_encoding import * +from sqlalchemy import text + +class MPIP2PSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table('MPI_P2P_EVENTS') and self.check_table('MPI_START_WAIT_EVENTS'): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_p2p.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + def _preprocess(self): + self._df["event_type"] = MPITYPE_PTOP + return super()._preprocess() + +class MPICollSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("MPI_COLLECTIVES_EVENTS"): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_coll.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + + def _preprocess(self): + self._df = self._df.drop(self._df[self._df["Event"].str.contains("File") ].index) + self._df["event_type"] = MPITYPE_COLLECTIVE + +class MPIOtherSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("MPI_OTHER_EVENTS"): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_other.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + + def _preprocess(self): + self._df = self._df.drop(self._df[self._df["Event"].str.contains("File") ].index) + self._df = self._df.drop(self._df[self._df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate") ].index) + self._df["event_type"] = MPITYPE_OTHER + +class MPIRMASemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("MPI_OTHER_EVENTS"): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_other.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + def _preprocess(self): + self._df = self._df[self._df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate")] + self._df["event_type"] = MPITYPE_RMA + +class MPIIOPSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("MPI_OTHER_EVENTS") and self.check_table("MPI_COLLECTIVES_EVENTS"): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_io.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + + def _preprocess(self): + self._df = self._df[self._df["Event"].str.contains("File")] + self._df["event_type"] = MPITYPE_IO \ No newline at end of file diff --git a/nsys2prv/semantics/nsys_event.py b/nsys2prv/semantics/nsys_event.py new file mode 100644 index 0000000000000000000000000000000000000000..9813a14ef3f7785712ad84013128e02481cf88b9 --- /dev/null +++ 
b/nsys2prv/semantics/nsys_event.py
@@ -0,0 +1,68 @@
+from sqlalchemy import create_engine, exc, inspect
+import pandas as pd
+import os.path
+
+class NsysEvent:
+
+    class MissingDatabaseFile(Exception):
+        def __init__(self, filename):
+            super().__init__(f'Database file {filename} does not exist.')
+
+    class InvalidDatabaseFile(Exception):
+        def __init__(self, filename):
+            super().__init__(f'Database file {filename} could not be opened and appears to be invalid.')
+
+    class InvalidSQL(Exception):
+        def __init__(self, sql):
+            super().__init__(f'Bad SQL statement: {sql}')
+
+    query = "SELECT 1 AS 'ONE'"
+
+    def __init__(self, report) -> None:
+        self._dbcon = None
+        self._dbfile = f"{os.path.splitext(report)[0]}.sqlite"
+        self._df = pd.DataFrame()
+        self._empty = False
+
+        if not os.path.exists(self._dbfile):
+            raise self.MissingDatabaseFile(self._dbfile)
+
+        try:
+            self._dbcon = create_engine(f"sqlite:///{self._dbfile}")
+        except exc.SQLAlchemyError:
+            self._dbcon = None
+            raise self.InvalidDatabaseFile(self._dbfile)
+
+    def check_table(self, table_name):
+        insp = inspect(self._dbcon)
+        return insp.has_table(table_name)
+
+    def Setup(self):
+        pass
+
+    def _preprocess(self):
+        pass
+
+    def postprocess(self):
+        pass
+
+    def load_data(self):
+        if not self._empty:
+            try:
+                self._df = pd.read_sql_query(self.query, self._dbcon)
+            except pd.errors.DatabaseError:
+                raise self.InvalidSQL(self.query)
+            self._preprocess()
+
+    def apply_process_model(self, threads: pd.DataFrame, streams: pd.DataFrame):
+        self._df["thread"] = self._df["Tid"].map(threads.set_index('Tid')["thread"])
+        self._df["task"] = self._df["Tid"].map(threads.set_index('Tid')["task"])
+        if 'Rank' in threads.columns:
+            self._df["Rank"] = self._df["Tid"].map(threads.set_index('Tid')["Rank"])
+
+    def get_threads(self):
+        return self._df[['Pid', 'Tid']].drop_duplicates()
+
+    def get_df(self):
+        return self._df.copy()
\ No newline at end of file
diff --git a/nsys2prv/semantics/nvtx_startend_semantic.py b/nsys2prv/semantics/nvtx_startend_semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..88241b72f2e762462e9117123a3af103ce33bab2
--- /dev/null
+++ b/nsys2prv/semantics/nvtx_startend_semantic.py
@@ -0,0 +1,11 @@
+from .nsys_event import NsysEvent
+import os.path
+from sqlalchemy import text
+
+class NVTXStartEndSemantic(NsysEvent):
+    def __init__(self, report) -> None:
+        super().__init__(report)
+
+    def Setup(self):
+        with open(os.path.join(os.path.dirname(__file__), '../scripts/nvtx_startend_trace.sql'), 'r') as query:
+            self.query = text(query.read())
\ No newline at end of file
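# --- Editor's note (illustrative sketch, not part of the diff) ---------------
# NsysEvent is the extension point of the new semantics package: a subclass
# only has to assign `self.query` in Setup() (or set `self._empty = True` when
# its tables are absent) and may override _preprocess(). A minimal hypothetical
# subclass, not shipped by this PR; the table and column names are assumptions:
#
#   from sqlalchemy import text
#   from .nsys_event import NsysEvent
#
#   class OSRTSemantic(NsysEvent):
#       """Hypothetical example reading OS runtime events."""
#       def Setup(self):
#           if self.check_table("OSRT_API"):
#               self.query = text("SELECT start, end, globalTid FROM OSRT_API")
#           else:
#               self._empty = True   # load_data() then leaves an empty frame
# ------------------------------------------------------------------------------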
diff --git a/nsys2prv/semantics/openacc_semantic.py b/nsys2prv/semantics/openacc_semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..105971928ecc9b0f15511d48909b433692ad8989
--- /dev/null
+++ b/nsys2prv/semantics/openacc_semantic.py
@@ -0,0 +1,27 @@
+from .nsys_event import NsysEvent
+import os.path
+from sqlalchemy import text
+
+class OpenACCOtherSemantic(NsysEvent):
+    def __init__(self, report) -> None:
+        super().__init__(report)
+
+    def Setup(self):
+        with open(os.path.join(os.path.dirname(__file__), '../scripts/openacc_other.sql'), 'r') as query:
+            self.query = text(query.read())
+
+class OpenACCLaunchSemantic(NsysEvent):
+    def __init__(self, report) -> None:
+        super().__init__(report)
+
+    def Setup(self):
+        with open(os.path.join(os.path.dirname(__file__), '../scripts/openacc_launch.sql'), 'r') as query:
+            self.query = text(query.read())
+
+class OpenACCDataSemantic(NsysEvent):
+    def __init__(self, report) -> None:
+        super().__init__(report)
+
+    def Setup(self):
+        with open(os.path.join(os.path.dirname(__file__), '../scripts/openacc_data.sql'), 'r') as query:
+            self.query = text(query.read())
\ No newline at end of file
diff --git a/parser-playground.ipynb b/parser-playground.ipynb
index 10a24122f26de9d7757f71c6283f6d662f5ba30c..c4b2ad5e8356d7be3d919434d5587b3f233925d1 100644
--- a/parser-playground.ipynb
+++ b/parser-playground.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -14,6 +14,10 @@
    "import locale\n",
    "import sqlite3\n",
    "from sqlalchemy import create_engine, text\n",
+   "from nsys2prv.EventWriter import event_writer as ewr\n",
+   "from nsys2prv.NSYSInterface import NSYSInterface\n",
+   "from nsys2prv.semantics.mpi_event_encoding import *\n",
+   "from nsys2prv.semantics import *\n",
    "\n",
    "NSYS_HOME = os.path.abspath(\"/home/mclasca/Apps/nsight-system/2024.5.1/\")\n",
    "#NSIGHT_HOME = os.getenv('NSIGHT_HOME')\n",
@@ -27,6 +31,12 @@
    "#REPORT_NAME=\"heka-step53+accum1-profile-2023.4-5721957\"\n",
    "#REPORT_NAME=\"heka-axolotl-Mistral7B0.1-profile-2110598\"\n",
    "\n",
+   "MULTIREPORT = True\n",
+   "if MULTIREPORT:\n",
+   "    REPORTS_LIST = [os.path.abspath(x) for x in [\"/home/mclasca/Documents/BePPP/heka/proves/multi_2nodes/sod2d_0.nsys-rep\", \"/home/mclasca/Documents/BePPP/heka/proves/multi_2nodes/sod2d_1.nsys-rep\", \"/home/mclasca/Documents/BePPP/heka/proves/multi_2nodes/sod2d_2.nsys-rep\"]]\n",
+   "    REPORT_DIRS_LIST = [os.path.dirname(x) for x in REPORTS_LIST]\n",
+   "    REPORT_FILE = REPORTS_LIST[0] # For fast checks, it's best to have a reference report\n",
+   "\n",
    "locale.setlocale(locale.LC_ALL, '')\n",
    "\n",
    "trace_name = \"test-heka\"\n",
@@ -69,6 +79,7 @@
    "nvtx_stack_bottom = 4\n",
    "\n",
    "reports = [\"cuda_api_trace\", \"cuda_gpu_trace\"]\n",
+   "nsi = NSYSInterface(reports, False, NVTX_RANGE, False)\n",
    "\n",
    "def build_nsys_stats_name(report_name):\n",
    "    base_name = os.path.splitext(os.path.basename(REPORT_FILE))[0]\n",
@@ -686,7 +697,125 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_contexts = []\n",
+    "if MULTIREPORT:\n",
+    "    for REPORT_FILE_I in REPORTS_LIST:\n",
+    "        context_info_i = pd.read_sql_table(\"TARGET_INFO_CUDA_CONTEXT_INFO\", f\"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite\")\n",
+    "        list_contexts.append(context_info_i)\n",
+    "    context_info = pd.concat(list_contexts)\n",
+    "else:\n",
+    "    context_info = pd.read_sql_table(\"TARGET_INFO_CUDA_CONTEXT_INFO\", f\"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite\")\n",
+    "context_info.sort_values([\"processId\"], inplace=True)\n",
+    "\n",
+    "# CONTEXT INFO CHECK FOR MULTIREPORT\n",
+    "if context_info[\"deviceId\"].unique().size == 1:\n",
+    "    print(f\"\\033[93m Warning! Only one unique device ID was detected during resource identification. If this is not intended, some features will not be available. Please make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. 
\\033[00m\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "t_apicalls = True\n", + "cuda_api_df = []\n", + "if MULTIREPORT:\n", + " for i, REPORT_FILE_I in enumerate(REPORTS_LIST):\n", + " cuda_api_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], \"cuda_api_trace\")))\n", + "else:\n", + " cuda_api_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, \"cuda_api_trace\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "kernels_df = []\n", + "if MULTIREPORT:\n", + " sum = 0\n", + " for REPORT_FILE_I in REPORTS_LIST:\n", + " ksi = KernelsSemantic(REPORT_FILE_I)\n", + " ksi.Setup()\n", + " ksi.load_data()\n", + " kernels_df.append(ksi.get_df())\n", + " sum += ksi.get_df().shape[0]\n", + " del ksi\n", + "else:\n", + " ks = KernelsSemantic(REPORT_FILE)\n", + " ks.Setup()\n", + " ks.load_data()\n", + " kernels_df = ks.get_df()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "After concat: (21299, 23)\n" + ] + } + ], + "source": [ + "from functools import reduce\n", + "# MARK: MERGING AND ALIGNING\n", + "if MULTIREPORT:\n", + " # Find delta between earliest trace start and the others\n", + " session_time = []\n", + " for REPORT_FILE_I in REPORTS_LIST:\n", + " session_time.append(pd.read_sql_table(\"TARGET_INFO_SESSION_START_TIME\", f\"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite\"))\n", + " \n", + " session_time = [x.iloc[0,0] for x in session_time] # Get the utcEpochNs\n", + " earliest_time = reduce(lambda x, y: min(x, y), session_time, float('inf'))\n", + " deltas = [start - earliest_time for start in session_time]\n", + " for i, df in enumerate(kernels_df):\n", + " df['Start (ns)'] += deltas[i]\n", + " kernels_df = pd.concat(kernels_df)\n", + " print(f\"After concat: {kernels_df.shape}\")\n", + "\n", + " if t_apicalls:\n", + " for i, df in enumerate(cuda_api_df):\n", + " df['Start (ns)'] += deltas[i]\n", + "\n", + " cuda_api_df = pd.concat(cuda_api_df)\n", + "\n", + " # if t_nvtx:\n", + " # for i, df in enumerate(nvtx_df):\n", + " # df['Start (ns)'] += deltas[i]\n", + " # df['End (ns)'] += deltas[i]\n", + " # nvtx_df = pd.concat(nvtx_df)\n", + "\n", + " # if t_nvtx_startend:\n", + " # for i, df in enumerate(nvtx_startend_df):\n", + " # df['Start (ns)'] += deltas[i]\n", + " # df['End (ns)'] += deltas[i]\n", + " # nvtx_startend_df = pd.concat(nvtx_startend_df)\n", + "\n", + " # if t_mpi:\n", + " # for i, df in enumerate(mpi_df):\n", + " # df['Start:ts_ns'] += deltas[i]\n", + " # df['End:ts_ns'] += deltas[i]\n", + " # mpi_df = pd.concat(mpi_df)\n", + " \n", + " #if t_metrics:\n", + "\n", + " #if t_openacc:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -710,369 +839,432 @@ " \n", " \n", " \n", - " Start (ns)\n", - " Duration (ns)\n", - " CorrID\n", - " GrdX\n", - " GrdY\n", - " GrdZ\n", - " BlkX\n", - " BlkY\n", - " BlkZ\n", - " Reg/Trd\n", - " ...\n", - " DymSMem (MB)\n", - " Bytes (MB)\n", - " Throughput (MB/s)\n", - " SrcMemKd\n", - " DstMemKd\n", - " Device\n", - " Ctx\n", - " GreenCtx\n", - " Strm\n", - " Name\n", + " Pid\n", + " Tid\n", + " thread\n", + " device\n", " \n", - " \n", - " \n", " \n", - " 0\n", - " 1307261431\n", - " 992\n", - " 688\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", 
- " NaN\n", - " 0,000\n", - " 32,258\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (1)\n", - " 1\n", - " NaN\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", + " task\n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", " 1\n", - " 1662416719\n", - " 960\n", - " 1285\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " NaN\n", - " 0,000\n", - " 133,333\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (1)\n", + " 2308061\n", + " {2308061}\n", " 1\n", - " NaN\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", + " 0\n", " \n", " \n", " 2\n", - " 1662448494\n", - " 736\n", - " 1291\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " NaN\n", - " 0,000\n", - " 173,913\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (1)\n", + " 2308062\n", + " {2308062}\n", " 1\n", - " NaN\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", + " 2\n", " \n", " \n", " 3\n", - " 1662469326\n", - " 768\n", - " 1297\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " NaN\n", - " 0,000\n", - " 166,667\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (1)\n", + " 2308065\n", + " {2308065}\n", " 1\n", - " NaN\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", - " \n", - " \n", - " 4\n", - " 1662702764\n", - " 704\n", - " 1327\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " NaN\n", - " 0,000\n", - " 34,091\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (1)\n", " 1\n", - " NaN\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " Pid Tid thread device\n", + "task \n", + "1 2308061 {2308061} 1 0\n", + "2 2308062 {2308062} 1 2\n", + "3 2308065 {2308065} 1 1" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "compute_threads_with = []\n", + "compute_threads_with.append(cuda_api_df[['Pid', 'Tid']])\n", + "# if t_nvtx: compute_threads_with.append(nvtx_df[[\"Pid\", \"Tid\"]])\n", + "# if t_nvtx_startend: compute_threads_with.append(nvtx_startend_df[[\"Pid\", \"Tid\"]])\n", + "# if t_mpi: compute_threads_with.append(mpi_df[[\"Pid\", \"Tid\"]])\n", + "# if t_openacc: compute_threads_with.append(openacc_other_df[[\"Pid\", \"Tid\"]])\n", + "\n", + "t_mpi = False\n", + "threads = pd.concat(compute_threads_with).drop_duplicates()\n", + "if t_mpi:\n", + " threads[\"Rank\"] = threads[\"Pid\"].map(rank_info.set_index(\"Pid\")[\"rank\"])\n", + " threads.sort_values([\"Rank\"], inplace=True)\n", + "else:\n", + " threads.sort_values([\"Pid\"], inplace=True)\n", + "threads[\"thread\"] = threads.groupby([\"Pid\"]).cumcount() + 1\n", + "threads[\"task\"] = threads.groupby([\"Pid\"]).ngroup() + 1\n", + "threads[\"device\"] = threads[\"Pid\"].map(context_info[context_info[\"contextId\"] == 1].set_index(\"processId\")[\"deviceId\"])\n", + "#threads.sort_values([\"task\", \"thread\"], inplace=True)\n", + "threads.reset_index()\n", + "\n", + "tasks_set = threads.groupby([\"task\"]).agg({'Pid': 'first',\n", + " 'Tid': lambda x: set(x),\n", + " 'thread': 'count',\n", + " 'device': 'first' })\n", + "tasks_set" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DeviceStrmthreadtaskPid
..................................................................Piddeviceid
66451182467332713785166861365535.01.01.032.01.01.036.0...0,001NaNNaNNaNNaNNVIDIA H100 (1)23080610NVIDIA H100 (0){16, 17, 18, 19}41NaN16mod_time_ops_adapt_dt_cfl_32_gpu2308061
66461182605270799903686143.01.01.0256.01.01.018.0...0,001NaNNaNNaNNaN23080622NVIDIA H100 (2){16, 17, 18, 19}422308062
23080651NVIDIA H100 (1){16, 17, 18}332308065
\n", + "
" + ], + "text/plain": [ + " Device Strm thread task Pid\n", + "Pid deviceid \n", + "2308061 0 NVIDIA H100 (0) {16, 17, 18, 19} 4 1 2308061\n", + "2308062 2 NVIDIA H100 (2) {16, 17, 18, 19} 4 2 2308062\n", + "2308065 1 NVIDIA H100 (1) {16, 17, 18} 3 3 2308065" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "streams = kernels_df[['Device', 'Strm', 'deviceid', 'Pid']].drop_duplicates()\n", + "streams[\"thread\"] = streams.groupby([\"Pid\", \"Device\"]).cumcount() + 1\n", + "#streams[\"deviceid\"] = streams.sort_values(\"Device\").groupby([\"Device\"]).ngroup()\n", + "#streams[\"Pid\"] = streams[\"deviceid\"].map(tasks_set.set_index(\"device\")[\"Pid\"])\n", + "streams[\"task\"] = streams[\"Pid\"].map(tasks_set.reset_index().set_index(\"Pid\")[\"task\"])\n", + "\n", + "streams['row_name'] = 'CUDA-D'+streams['deviceid'].astype(str) + '.S' + streams['Strm'].astype(str)\n", + "num_streams = streams.count().iloc[0]\n", + "streams.sort_values([\"Pid\", \"thread\"], inplace=True)\n", + "streams.reset_index(inplace=True)\n", + "\n", + "devices_set = streams.groupby([\"Pid\", \"deviceid\"]).agg({'Device': 'first',\n", + " 'Strm': lambda x: set(x),\n", + " 'thread': 'count',\n", + " 'task': 'first',\n", + " 'Pid': 'last'})\n", + "devices_set" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
indexDeviceStrmdeviceidPidthreadtaskrow_name
00NVIDIA H100 (0)160230806121NaNCUDA-D0.S16
1190NVIDIA H100 (0)170230806131CUDA-D0.S17
2191NVIDIA H100 (0)180230806141CUDA-D0.S18
3204NVIDIA H100 (0)190230806151CUDA-D0.S19
40NVIDIA H100 (2)16mod_time_ops_adapt_dt_cfl_32_gpu__red2230806222CUDA-D2.S16
664711826167106217668616NaNNaNNaNNaNNaNNaNNaN...NaN0,0001,838DevicePageable5190NVIDIA H100 (2)172230806232CUDA-D2.S17
6191NVIDIA H100 (2)182230806242CUDA-D2.S18
7203NVIDIA H100 (2)192230806252CUDA-D2.S19
80NVIDIA H100 (1)1NaN16[CUDA memcpy Device-to-Host]1230806523CUDA-D1.S16
664811826178754217668617NaNNaNNaNNaNNaNNaNNaN...NaN0,0001,838DevicePageable9190NVIDIA H100 (1)171NaN16[CUDA memcpy Device-to-Host]230806533CUDA-D1.S17
664911826190114217668618NaNNaNNaNNaNNaNNaNNaN...NaN0,0001,838DevicePageable10191NVIDIA H100 (1)181NaN16[CUDA memcpy Device-to-Host]230806543CUDA-D1.S18
\n", - "

6650 rows × 21 columns

\n", "
" ], "text/plain": [ - " Start (ns) Duration (ns) CorrID GrdX GrdY GrdZ BlkX BlkY \\\n", - "0 1307261431 992 688 NaN NaN NaN NaN NaN \n", - "1 1662416719 960 1285 NaN NaN NaN NaN NaN \n", - "2 1662448494 736 1291 NaN NaN NaN NaN NaN \n", - "3 1662469326 768 1297 NaN NaN NaN NaN NaN \n", - "4 1662702764 704 1327 NaN NaN NaN NaN NaN \n", - "... ... ... ... ... ... ... ... ... \n", - "6645 11824673327 1378516 68613 65535.0 1.0 1.0 32.0 1.0 \n", - "6646 11826052707 99903 68614 3.0 1.0 1.0 256.0 1.0 \n", - "6647 11826167106 2176 68616 NaN NaN NaN NaN NaN \n", - "6648 11826178754 2176 68617 NaN NaN NaN NaN NaN \n", - "6649 11826190114 2176 68618 NaN NaN NaN NaN NaN \n", - "\n", - " BlkZ Reg/Trd ... DymSMem (MB) Bytes (MB) Throughput (MB/s) SrcMemKd \\\n", - "0 NaN NaN ... NaN 0,000 32,258 Pageable \n", - "1 NaN NaN ... NaN 0,000 133,333 Pageable \n", - "2 NaN NaN ... NaN 0,000 173,913 Pageable \n", - "3 NaN NaN ... NaN 0,000 166,667 Pageable \n", - "4 NaN NaN ... NaN 0,000 34,091 Pageable \n", - "... ... ... ... ... ... ... ... \n", - "6645 1.0 36.0 ... 0,001 NaN NaN NaN \n", - "6646 1.0 18.0 ... 0,001 NaN NaN NaN \n", - "6647 NaN NaN ... NaN 0,000 1,838 Device \n", - "6648 NaN NaN ... NaN 0,000 1,838 Device \n", - "6649 NaN NaN ... NaN 0,000 1,838 Device \n", - "\n", - " DstMemKd Device Ctx GreenCtx Strm \\\n", - "0 Device NVIDIA H100 (1) 1 NaN 16 \n", - "1 Device NVIDIA H100 (1) 1 NaN 16 \n", - "2 Device NVIDIA H100 (1) 1 NaN 16 \n", - "3 Device NVIDIA H100 (1) 1 NaN 16 \n", - "4 Device NVIDIA H100 (1) 1 NaN 16 \n", - "... ... ... .. ... ... \n", - "6645 NaN NVIDIA H100 (1) 1 NaN 16 \n", - "6646 NaN NVIDIA H100 (1) 1 NaN 16 \n", - "6647 Pageable NVIDIA H100 (1) 1 NaN 16 \n", - "6648 Pageable NVIDIA H100 (1) 1 NaN 16 \n", - "6649 Pageable NVIDIA H100 (1) 1 NaN 16 \n", - "\n", - " Name \n", - "0 [CUDA memcpy Host-to-Device] \n", - "1 [CUDA memcpy Host-to-Device] \n", - "2 [CUDA memcpy Host-to-Device] \n", - "3 [CUDA memcpy Host-to-Device] \n", - "4 [CUDA memcpy Host-to-Device] \n", - "... ... 
\n", - "6645 mod_time_ops_adapt_dt_cfl_32_gpu \n", - "6646 mod_time_ops_adapt_dt_cfl_32_gpu__red \n", - "6647 [CUDA memcpy Device-to-Host] \n", - "6648 [CUDA memcpy Device-to-Host] \n", - "6649 [CUDA memcpy Device-to-Host] \n", - "\n", - "[6650 rows x 21 columns]" + " index Device Strm deviceid Pid thread task row_name\n", + "0 0 NVIDIA H100 (0) 16 0 2308061 2 1 CUDA-D0.S16\n", + "1 190 NVIDIA H100 (0) 17 0 2308061 3 1 CUDA-D0.S17\n", + "2 191 NVIDIA H100 (0) 18 0 2308061 4 1 CUDA-D0.S18\n", + "3 204 NVIDIA H100 (0) 19 0 2308061 5 1 CUDA-D0.S19\n", + "4 0 NVIDIA H100 (2) 16 2 2308062 2 2 CUDA-D2.S16\n", + "5 190 NVIDIA H100 (2) 17 2 2308062 3 2 CUDA-D2.S17\n", + "6 191 NVIDIA H100 (2) 18 2 2308062 4 2 CUDA-D2.S18\n", + "7 203 NVIDIA H100 (2) 19 2 2308062 5 2 CUDA-D2.S19\n", + "8 0 NVIDIA H100 (1) 16 1 2308065 2 3 CUDA-D1.S16\n", + "9 190 NVIDIA H100 (1) 17 1 2308065 3 3 CUDA-D1.S17\n", + "10 191 NVIDIA H100 (1) 18 1 2308065 4 3 CUDA-D1.S18" ] }, - "execution_count": 8, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "kernels_df = pd.read_csv(build_nsys_stats_name(\"cuda_gpu_trace\"))\n", - "kernels_df.rename(columns={\"CorrId\": \"CorrID\"}, inplace=True)\n", - "kernels_df" + "num_normal_threads = tasks_set['thread']\n", + "num_normal_threads_repeated = num_normal_threads.repeat(devices_set[\"thread\"]).reset_index()[[\"thread\"]]\n", + "streams['thread'] = streams['thread'] + num_normal_threads_repeated[\"thread\"]\n", + "streams" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of original kernels_df: (21299, 25). Shape of result_df: (21299, 27).\n" + ] + }, + { + "ename": "KeyError", + "evalue": "'thread'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/Documents/BePPP/heka/tooling/env/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'thread'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[19], line 7\u001b[0m\n\u001b[1;32m 
4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShape of original kernels_df: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkernels_df\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Shape of result_df: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult_df\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Copy the results back to kernels_df\u001b[39;00m\n\u001b[0;32m----> 7\u001b[0m kernels_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mthread\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mresult_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mthread\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mto_numpy()\n\u001b[1;32m 8\u001b[0m kernels_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtask\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m result_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtask\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mto_numpy()\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShape of new kernels_df: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkernels_df\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Documents/BePPP/heka/tooling/env/lib/python3.12/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m~/Documents/BePPP/heka/tooling/env/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'thread'" + ] + } + ], + "source": [ + "filtered_streams = streams.groupby([\"Pid\", \"Strm\"]).agg({'thread':'first', 'task':'first'}).reset_index()\n", + "# Now, merge the filtered streams DataFrame with kernels_df\n", + "result_df = kernels_df.merge(filtered_streams, how='left', on=[\"Pid\", 'Strm'])\n", + "print(f\"Shape of original kernels_df: {kernels_df.shape}. Shape of result_df: {result_df.shape}.\")\n", + "\n", + "# Copy the results back to kernels_df\n", + "kernels_df['thread'] = result_df['thread'].to_numpy()\n", + "kernels_df['task'] = result_df['task'].to_numpy()\n", + "print(f\"Shape of new kernels_df: {kernels_df.shape}.\")" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1096,48 +1288,614 @@ " \n", " \n", " \n", - " nullStreamId\n", - " hwId\n", - " vmId\n", - " processId\n", - " deviceId\n", - " contextId\n", - " parentContextId\n", - " isGreenContext\n", + " Start (ns)\n", + " Duration:dur_ns\n", + " CorrID\n", + " GrdX\n", + " GrdY\n", + " GrdZ\n", + " BlkX\n", + " BlkY\n", + " BlkZ\n", + " Reg/Trd\n", + " ...\n", + " DstMemKd\n", + " Device\n", + " deviceid\n", + " Pid\n", + " Ctx\n", + " GreenCtx\n", + " Strm\n", + " Name\n", + " thread\n", + " task\n", " \n", " \n", " \n", " \n", - " 0\n", - " 7\n", - " 0\n", - " 1\n", - " 2308065\n", - " 1\n", + " 2023\n", + " 10256971039\n", + " 2272\n", + " 23576\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " Pageable\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " 0\n", - " 0\n", + " None\n", + " 16\n", + " [CUDA memcpy Device-to-Host]\n", + " 2\n", + " 2\n", " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " nullStreamId hwId vmId processId deviceId contextId parentContextId \\\n", - "0 7 0 1 2308065 1 1 0 \n", + " \n", + " 2024\n", + " 10257043551\n", + " 1105690\n", + " 23590\n", + " 81902.0\n", + " 1.0\n", + " 1.0\n", + " 128.0\n", + " 1.0\n", + " 1.0\n", + " 32.0\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " mod_solver_conjgrad_imex_245_gpu\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2025\n", + " 10258164089\n", + " 51168\n", + " 23595\n", + " 81902.0\n", + " 1.0\n", + " 1.0\n", + " 128.0\n", + " 1.0\n", + " 1.0\n", + " 26.0\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " mod_bc_routines_bc_fix_dirichlet_residual_293_gpu\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2026\n", + " 10258227161\n", + " 768\n", + " 23598\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " [CUDA memset]\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2027\n", + " 10258236281\n", + " 897563\n", + " 23601\n", + " 65535.0\n", + 
" 1.0\n", + " 1.0\n", + " 128.0\n", + " 1.0\n", + " 1.0\n", + " 56.0\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " mod_solver_conjgrad_imex_271_gpu\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2028\n", + " 10259135028\n", + " 97632\n", + " 23602\n", + " 1.0\n", + " 1.0\n", + " 1.0\n", + " 256.0\n", + " 1.0\n", + " 1.0\n", + " 16.0\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " mod_solver_conjgrad_imex_271_gpu__red\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2029\n", + " 10259241107\n", + " 2273\n", + " 23604\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " Pageable\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " [CUDA memcpy Device-to-Host]\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2030\n", + " 10259285939\n", + " 768\n", + " 23616\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " [CUDA memset]\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2031\n", + " 10259294836\n", + " 412701\n", + " 23619\n", + " 65535.0\n", + " 1.0\n", + " 1.0\n", + " 128.0\n", + " 1.0\n", + " 1.0\n", + " 40.0\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " mod_solver_conjgrad_imex_297_gpu\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2032\n", + " 10259708465\n", + " 96064\n", + " 23620\n", + " 1.0\n", + " 1.0\n", + " 1.0\n", + " 256.0\n", + " 1.0\n", + " 1.0\n", + " 16.0\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " mod_solver_conjgrad_imex_297_gpu__red\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2033\n", + " 10259813009\n", + " 2272\n", + " 23622\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " Pageable\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " [CUDA memcpy Device-to-Host]\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2034\n", + " 10259840881\n", + " 751484\n", + " 23634\n", + " 81902.0\n", + " 1.0\n", + " 1.0\n", + " 128.0\n", + " 1.0\n", + " 1.0\n", + " 45.0\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " mod_solver_conjgrad_imex_307_gpu\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2035\n", + " 10260609197\n", + " 149279\n", + " 23640\n", + " 245704.0\n", + " 1.0\n", + " 1.0\n", + " 128.0\n", + " 1.0\n", + " 1.0\n", + " 36.0\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " elem_diffu_full_diffusion_ijk_51_gpu\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2036\n", + " 10260759628\n", + " 55584\n", + " 23642\n", + " 81902.0\n", + " 1.0\n", + " 1.0\n", + " 128.0\n", + " 1.0\n", + " 1.0\n", + " 16.0\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " elem_diffu_full_diffusion_ijk_52_gpu\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2037\n", + " 10260829643\n", + " 7861560\n", + " 23646\n", + " 1246149.0\n", + " 1.0\n", + " 1.0\n", + " 96.0\n", + " 1.0\n", + " 1.0\n", + " 93.0\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", 
+ " elem_diffu_full_diffusion_ijk_60_gpu\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2038\n", + " 10268707939\n", + " 8192\n", + " 23651\n", + " 3329.0\n", + " 1.0\n", + " 1.0\n", + " 128.0\n", + " 1.0\n", + " 1.0\n", + " 16.0\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " mod_comms_fill_sendbuffer_real_239_gpu\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2039\n", + " 10268727331\n", + " 3392\n", + " 23655\n", + " 3329.0\n", + " 1.0\n", + " 1.0\n", + " 128.0\n", + " 1.0\n", + " 1.0\n", + " 16.0\n", + " ...\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 16\n", + " mod_comms_fill_sendbuffer_real_246_gpu\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 2040\n", + " 10268754723\n", + " 4672\n", + " 23663\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " Device\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 17\n", + " [CUDA memcpy Peer-to-Peer]\n", + " 3\n", + " 2\n", + " \n", + " \n", + " 2041\n", + " 10268759875\n", + " 3328\n", + " 23669\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " Device\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 19\n", + " [CUDA memcpy Peer-to-Peer]\n", + " 5\n", + " 2\n", + " \n", + " \n", + " 2042\n", + " 10268769987\n", + " 5568\n", + " 23688\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " Device\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", + " 1\n", + " None\n", + " 18\n", + " [CUDA memcpy Peer-to-Peer]\n", + " 4\n", + " 2\n", + " \n", + " \n", + "\n", + "

20 rows × 25 columns

\n", + "" + ], + "text/plain": [ + " Start (ns) Duration:dur_ns CorrID GrdX GrdY GrdZ BlkX \\\n", + "2023 10256971039 2272 23576 NaN NaN NaN NaN \n", + "2024 10257043551 1105690 23590 81902.0 1.0 1.0 128.0 \n", + "2025 10258164089 51168 23595 81902.0 1.0 1.0 128.0 \n", + "2026 10258227161 768 23598 NaN NaN NaN NaN \n", + "2027 10258236281 897563 23601 65535.0 1.0 1.0 128.0 \n", + "2028 10259135028 97632 23602 1.0 1.0 1.0 256.0 \n", + "2029 10259241107 2273 23604 NaN NaN NaN NaN \n", + "2030 10259285939 768 23616 NaN NaN NaN NaN \n", + "2031 10259294836 412701 23619 65535.0 1.0 1.0 128.0 \n", + "2032 10259708465 96064 23620 1.0 1.0 1.0 256.0 \n", + "2033 10259813009 2272 23622 NaN NaN NaN NaN \n", + "2034 10259840881 751484 23634 81902.0 1.0 1.0 128.0 \n", + "2035 10260609197 149279 23640 245704.0 1.0 1.0 128.0 \n", + "2036 10260759628 55584 23642 81902.0 1.0 1.0 128.0 \n", + "2037 10260829643 7861560 23646 1246149.0 1.0 1.0 96.0 \n", + "2038 10268707939 8192 23651 3329.0 1.0 1.0 128.0 \n", + "2039 10268727331 3392 23655 3329.0 1.0 1.0 128.0 \n", + "2040 10268754723 4672 23663 NaN NaN NaN NaN \n", + "2041 10268759875 3328 23669 NaN NaN NaN NaN \n", + "2042 10268769987 5568 23688 NaN NaN NaN NaN \n", + "\n", + " BlkY BlkZ Reg/Trd ... DstMemKd Device deviceid Pid \\\n", + "2023 NaN NaN NaN ... Pageable NVIDIA H100 (2) 2 2308062 \n", + "2024 1.0 1.0 32.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2025 1.0 1.0 26.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2026 NaN NaN NaN ... None NVIDIA H100 (2) 2 2308062 \n", + "2027 1.0 1.0 56.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2028 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2029 NaN NaN NaN ... Pageable NVIDIA H100 (2) 2 2308062 \n", + "2030 NaN NaN NaN ... None NVIDIA H100 (2) 2 2308062 \n", + "2031 1.0 1.0 40.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2032 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2033 NaN NaN NaN ... Pageable NVIDIA H100 (2) 2 2308062 \n", + "2034 1.0 1.0 45.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2035 1.0 1.0 36.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2036 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2037 1.0 1.0 93.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2038 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2039 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2040 NaN NaN NaN ... Device NVIDIA H100 (2) 2 2308062 \n", + "2041 NaN NaN NaN ... Device NVIDIA H100 (2) 2 2308062 \n", + "2042 NaN NaN NaN ... 
Device NVIDIA H100 (2) 2 2308062 \n", + "\n", + " Ctx GreenCtx Strm Name \\\n", + "2023 1 None 16 [CUDA memcpy Device-to-Host] \n", + "2024 1 None 16 mod_solver_conjgrad_imex_245_gpu \n", + "2025 1 None 16 mod_bc_routines_bc_fix_dirichlet_residual_293_gpu \n", + "2026 1 None 16 [CUDA memset] \n", + "2027 1 None 16 mod_solver_conjgrad_imex_271_gpu \n", + "2028 1 None 16 mod_solver_conjgrad_imex_271_gpu__red \n", + "2029 1 None 16 [CUDA memcpy Device-to-Host] \n", + "2030 1 None 16 [CUDA memset] \n", + "2031 1 None 16 mod_solver_conjgrad_imex_297_gpu \n", + "2032 1 None 16 mod_solver_conjgrad_imex_297_gpu__red \n", + "2033 1 None 16 [CUDA memcpy Device-to-Host] \n", + "2034 1 None 16 mod_solver_conjgrad_imex_307_gpu \n", + "2035 1 None 16 elem_diffu_full_diffusion_ijk_51_gpu \n", + "2036 1 None 16 elem_diffu_full_diffusion_ijk_52_gpu \n", + "2037 1 None 16 elem_diffu_full_diffusion_ijk_60_gpu \n", + "2038 1 None 16 mod_comms_fill_sendbuffer_real_239_gpu \n", + "2039 1 None 16 mod_comms_fill_sendbuffer_real_246_gpu \n", + "2040 1 None 17 [CUDA memcpy Peer-to-Peer] \n", + "2041 1 None 19 [CUDA memcpy Peer-to-Peer] \n", + "2042 1 None 18 [CUDA memcpy Peer-to-Peer] \n", + "\n", + " thread task \n", + "2023 2 2 \n", + "2024 2 2 \n", + "2025 2 2 \n", + "2026 2 2 \n", + "2027 2 2 \n", + "2028 2 2 \n", + "2029 2 2 \n", + "2030 2 2 \n", + "2031 2 2 \n", + "2032 2 2 \n", + "2033 2 2 \n", + "2034 2 2 \n", + "2035 2 2 \n", + "2036 2 2 \n", + "2037 2 2 \n", + "2038 2 2 \n", + "2039 2 2 \n", + "2040 3 2 \n", + "2041 5 2 \n", + "2042 4 2 \n", "\n", - " isGreenContext \n", - "0 0 " + "[20 rows x 25 columns]" ] }, - "execution_count": 7, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "context_info = pd.read_sql_table(\"TARGET_INFO_CUDA_CONTEXT_INFO\", f\"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite\")\n", - "context_info" + "kernels_df.iloc[16000:16020]" ] }, { diff --git a/pyproject.toml b/pyproject.toml index 5b3f16037a220e03d9a328eb0eb4e795df96cc38..d0827c318d5ac718dced6b568a287fbdc46152da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,13 @@ [tool.poetry] name = "nsys2prv" -version = "0.3.1" +version = "0.4.0-dev20241007" description = "Translate a NVIDIA Nsight System trace to a Paraver trace" authors = ["Marc Clascà "] readme = "README.md" license = "GPL-3.0-only" include = [ - "cfgs" + "cfgs", + "docs" ] repository = "https://pm.bsc.es/gitlab/beppp/nsys2prv" homepage = "https://pm.bsc.es/gitlab/beppp/nsys2prv"
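
Note on the KeyError recorded in the notebook cell above: the print shows result_df with 27 columns while kernels_df already has 25, and the later successful run shows kernels_df carrying 'thread' and 'task' columns. This is consistent with the cell having been re-run after a previous pass already added those columns: pandas then keeps both sides of the overlapping non-key columns as 'thread_x'/'thread_y' and 'task_x'/'task_y' (25 - 2 + 4 = 27), so the plain result_df['thread'] lookup fails. Below is a minimal re-run-safe sketch of the same stream-to-thread mapping, assuming kernels_df and streams as defined in the notebook; the drop and the validate argument are additions for illustration, not part of the original cell:

    # Aggregate one (thread, task) pair per (Pid, Strm); as_index=False keeps them as columns.
    filtered_streams = streams.groupby(["Pid", "Strm"], as_index=False).agg(
        thread=("thread", "first"), task=("task", "first"))

    # Drop any thread/task columns left over from a previous run so the merge
    # never produces suffixed duplicates (errors="ignore" makes this idempotent).
    kernels_df = kernels_df.drop(columns=["thread", "task"], errors="ignore")

    # Left-merge and check that every (Pid, Strm) maps to exactly one stream row.
    result_df = kernels_df.merge(filtered_streams, how="left", on=["Pid", "Strm"],
                                 validate="many_to_one")

    # Copy the results back by position, as in the original cell.
    kernels_df[["thread", "task"]] = result_df[["thread", "task"]].to_numpy()

With the drop in place the cell can be executed repeatedly without changing the shape of kernels_df, and validate="many_to_one" turns a malformed stream table into an immediate MergeError instead of silently duplicating kernel rows.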