From f9057840bfade6f7292dfb67d8eea7e286eaf3b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Fri, 20 Sep 2024 17:39:20 +0200 Subject: [PATCH 1/9] Adds CLI support for multinode. Adds logic to export data for multiple reports. To do this, logic to interact with the nsys binary has been refactored to a new class. --- nsys2prv/NSYSInterface.py | 60 ++++++++++++++++++++++++++ nsys2prv/parse_nsys_stats.py | 81 +++++++++++++----------------------- 2 files changed, 89 insertions(+), 52 deletions(-) create mode 100644 nsys2prv/NSYSInterface.py diff --git a/nsys2prv/NSYSInterface.py b/nsys2prv/NSYSInterface.py new file mode 100644 index 0000000..29a895c --- /dev/null +++ b/nsys2prv/NSYSInterface.py @@ -0,0 +1,60 @@ +import subprocess +import os + +class NSYSInterface(): + + def __init__(self, types, filter_nvtx, range_nvtx, force_sqlite): + self.use_path = True + self.nsys_binary = ("nsys",) + + if 'NSYS_HOME' in os.environ: + self.NSYS_HOME = os.path.abspath(os.getenv('NSYS_HOME')) + self.use_path = False + + if self.use_path: + self.nsys_binary = ("nsys",) + else: + self.nsys_binary = (os.path.join(self.NSYS_HOME, "bin/nsys"),) + + self.types = types + self.filter = filter_nvtx + self.range_nvtx = range_nvtx + self.force = force_sqlite + + def check_export_report(self, rf): + if not os.path.exists(f"{os.path.splitext(os.path.basename(rf))[0]}.sqlite") or self.force: + #Try exporting first + export_call = self.nsys_binary + ("export", "-t", "sqlite", rf) + try: + with subprocess.Popen(export_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p: + for line in p.stdout: + print(line.decode(), end='') + + if p.returncode != 0: + raise ChildProcessError(p.returncode, p.args) + except FileNotFoundError: + print("You don't have an Nsight Systems installation in your PATH. 
Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.") + exit(1) + except ChildProcessError: + print("Could not export SQLite database. Exiting.") + exit(1) + + def call_stats(self, report): + nsys_call = self.nsys_binary + ("stats", "-r", ",".join(self.types), + "--timeunit", "nsec", "-f", "csv", + "--force-overwrite", "true", "-o", ".") + if self.filter: + nsys_call += ("--filter-nvtx="+self.range_nvtx,) + + nsys_call += (report,) + + try: + with subprocess.Popen(nsys_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p: + for line in p.stdout: + print(line.decode(), end='') + + if p.returncode != 0: + raise ChildProcessError(p.returncode, p.args) + except FileNotFoundError: + print("You don't have an Nsight Systems installation in your PATH. Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.") + exit(1) \ No newline at end of file diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 8a25660..01882df 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -11,9 +11,9 @@ import locale from sqlalchemy import create_engine, text, dialects from sqlalchemy.exc import OperationalError from .EventWriter import event_writer as ewr +from .NSYSInterface import NSYSInterface from .semantics.mpi_event_encoding import * - def main(): locale.setlocale(locale.LC_ALL, '') @@ -31,6 +31,7 @@ def main(): parser.add_argument("-v", "--version", nargs=0, help="Show version and exit.", action=ShowVersion) parser.add_argument("-f", "--filter-nvtx", help="Filter by this NVTX range") parser.add_argument("-t", "--trace", help="Comma separated names of events to translate: [mpi_event_trace, nvtx_pushpop_trace, nvtx_startend_trace, cuda_api_trace, gpu_metrics, openacc]") + parser.add_argument("-m", "--multi-report", action="store_true", help="Translate multiple reports of the same execution into one trace.") 
parser.add_argument("--force-sqlite", action="store_true", help="Force Nsight System to export SQLite database") @@ -39,23 +40,24 @@ def main(): #parser.add_argument("-n", "--nvtx-stack-range", nargs=2, type=int) - parser.add_argument("source_rep", help="Nsight source report file") + parser.add_argument("source_rep", nargs="+", help="Nsight source report file") parser.add_argument("output", help="Paraver output trace name") args = parser.parse_args() # # Trace configuration and setup - - use_path = True - - if 'NSYS_HOME' in os.environ: - NSYS_HOME = os.path.abspath(os.getenv('NSYS_HOME')) - use_path = False PARAVER_HOME = os.getenv('PARAVER_HOME') - REPORT_FILE = os.path.abspath(args.source_rep) - REPORT_DIR = os.path.dirname(REPORT_FILE) + MULTIREPORT = args.multi_report + if MULTIREPORT: + REPORTS_LIST = [os.path.abspath(x) for x in args.source_rep] + REPORT_DIRS_LIST = [os.path.dirname(x) for x in REPORTS_LIST] + REPORT_FILE = REPORTS_LIST[0] # For fast checks, it's best to have a reference report + else: + REPORT_FILE = os.path.abspath(args.source_rep.first) + REPORT_DIR = os.path.dirname(REPORT_FILE) + trace_name = args.output NVTX_FILTER = args.filter_nvtx != None @@ -135,28 +137,18 @@ def main(): return os.path.join(REPORT_DIR, base_name+"_{}.csv".format(report_name)) + nsi = NSYSInterface(reports, NVTX_FILTER, NVTX_RANGE, args.force_sqlite) + + if MULTIREPORT: + print(f"Multiple reports provided: {REPORTS_LIST}") print("Extracting reports for: {}".format(reports_og)) - if use_path: - nsys_binary = ("nsys",) - else: - nsys_binary = (os.path.join(NSYS_HOME, "bin/nsys"),) - if not os.path.exists(f"{os.path.splitext(os.path.basename(REPORT_FILE))[0]}.sqlite"): - #Try exporting first - export_call = nsys_binary + ("export", "-t", "sqlite", REPORT_FILE) - try: - with subprocess.Popen(export_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p: - for line in p.stdout: - print(line.decode(), end='') - - if p.returncode != 0: - raise 
ChildProcessError(p.returncode, p.args) - except FileNotFoundError: - print("You don't have an Nsight Systems installation in your PATH. Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.") - exit(1) - except ChildProcessError: - print("Could not export SQLite database. Exiting.") - exit(1) + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + print(f"Exporting SQLite databse for {os.path.basename(REPORT_FILE_I)}") + nsi.check_export_report(REPORT_FILE_I) + else: + nsi.check_export_report(REPORT_FILE) engine = create_engine(f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") metadata = pd.read_sql_table("META_DATA_EXPORT", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") @@ -164,29 +156,14 @@ def main(): if int(minor_version["value"].iloc[0]) > 11: print(f"\033[93m Warning! The SQLite schema version {int(minor_version["value"].iloc[0])} is greater than the one supported (11). If unexpected behaviour occurs, please report it. \033[00m") - nsys_call = nsys_binary + ("stats", "-r", ",".join(reports), - "--timeunit", "nsec", "-f", "csv", - "--force-overwrite", "true", "-o", ".") - - if NVTX_FILTER: - nsys_call += ("--filter-nvtx="+NVTX_RANGE,) - - if args.force_sqlite: - nsys_call += ("--force-export", "true") - - nsys_call += (REPORT_FILE,) - - try: - with subprocess.Popen(nsys_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p: - for line in p.stdout: - print(line.decode(), end='') - - if p.returncode != 0: - raise ChildProcessError(p.returncode, p.args) - except FileNotFoundError: - print("You don't have an Nsight Systems installation in your PATH. 
Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.") - exit(1) + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + print(f"Processing stats for {os.path.basename(REPORT_FILE_I)}") + nsi.call_stats(REPORT_FILE_I) + else: + nsi.call_stats(REPORT_FILE) + assert(False) print("Importing datasets...") # kernels_df = pd.read_csv(build_nsys_stats_name("cuda_gpu_trace")) -- GitLab From 7e8dc181eb6776dbbd8d085aeaf9d4ecb8093873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Mon, 23 Sep 2024 18:59:30 +0200 Subject: [PATCH 2/9] Collection of events dataframes now is offloaded to custom classes that contain a setup function for SQL data loading and data preprocessing. Added collection of events for multiple reports (WIP). --- nsys2prv/NSYSInterface.py | 9 +- nsys2prv/parse_nsys_stats.py | 220 ++++++++++++------- nsys2prv/semantics/__init__.py | 4 + nsys2prv/semantics/gpu_metrics_semantic.py | 28 +++ nsys2prv/semantics/kernels_semantic.py | 12 + nsys2prv/semantics/mpi_semantic.py | 78 +++++++ nsys2prv/semantics/nsys_event.py | 68 ++++++ nsys2prv/semantics/nvtx_startend_semantic.py | 11 + 8 files changed, 345 insertions(+), 85 deletions(-) create mode 100644 nsys2prv/semantics/__init__.py create mode 100644 nsys2prv/semantics/gpu_metrics_semantic.py create mode 100644 nsys2prv/semantics/kernels_semantic.py create mode 100644 nsys2prv/semantics/mpi_semantic.py create mode 100644 nsys2prv/semantics/nsys_event.py create mode 100644 nsys2prv/semantics/nvtx_startend_semantic.py diff --git a/nsys2prv/NSYSInterface.py b/nsys2prv/NSYSInterface.py index 29a895c..9034cf7 100644 --- a/nsys2prv/NSYSInterface.py +++ b/nsys2prv/NSYSInterface.py @@ -57,4 +57,11 @@ class NSYSInterface(): raise ChildProcessError(p.returncode, p.args) except FileNotFoundError: print("You don't have an Nsight Systems installation in your PATH. 
Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.") - exit(1) \ No newline at end of file + exit(1) + + def build_nsys_stats_name(self, rf, rd, report_name): + base_name = os.path.splitext(os.path.basename(rf))[0] + if self.filter: + return os.path.join(rd, base_name+"_{}_nvtx={}.csv".format(report_name, self.range_nvtx)) + else: + return os.path.join(rd, base_name+"_{}.csv".format(report_name)) \ No newline at end of file diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 01882df..5d108e0 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -13,6 +13,7 @@ from sqlalchemy.exc import OperationalError from .EventWriter import event_writer as ewr from .NSYSInterface import NSYSInterface from .semantics.mpi_event_encoding import * +from .semantics import * def main(): locale.setlocale(locale.LC_ALL, '') @@ -128,15 +129,6 @@ def main(): nvtx_stack_top = 1 nvtx_stack_bottom = 4 - - def build_nsys_stats_name(report_name): - base_name = os.path.splitext(os.path.basename(REPORT_FILE))[0] - if NVTX_FILTER: - return os.path.join(REPORT_DIR, base_name+"_{}_nvtx={}.csv".format(report_name, NVTX_RANGE)) - else: - return os.path.join(REPORT_DIR, base_name+"_{}.csv".format(report_name)) - - nsi = NSYSInterface(reports, NVTX_FILTER, NVTX_RANGE, args.force_sqlite) if MULTIREPORT: @@ -163,103 +155,139 @@ def main(): else: nsi.call_stats(REPORT_FILE) - assert(False) + #assert(False) print("Importing datasets...") # kernels_df = pd.read_csv(build_nsys_stats_name("cuda_gpu_trace")) # kernels_df.rename(columns={"CorrId": "CorrID"}, inplace=True) - with engine.connect() as conn, conn.begin(): - with open(os.path.join(os.path.dirname(__file__), 'scripts/kernels.sql'), 'r') as query: - kernels_df = pd.read_sql_query(text(query.read()), conn) + # with engine.connect() as conn, conn.begin(): + # with open(os.path.join(os.path.dirname(__file__), 'scripts/kernels.sql'), 'r') as 
query: + # kernels_df = pd.read_sql_query(text(query.read()), conn) + + kernels_df = [] + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + ksi = KernelsSemantic(REPORT_FILE_I) + ksi.Setup() + ksi.load_data() + kernels_df.append(ksi.get_df()) + del ksi + else: + ks = KernelsSemantic(REPORT_FILE) + ks.Setup() + ks.load_data() + kernels_df = ks.get_df() if t_apicalls: - cuda_api_df = pd.read_csv(build_nsys_stats_name("cuda_api_trace")) + cuda_api_df = [] + if MULTIREPORT: + for i, REPORT_FILE_I in enumerate(REPORTS_LIST): + cuda_api_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], "cuda_api_trace"))) + else: + cuda_api_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, "cuda_api_trace")) else: cuda_api_df = pd.DataFrame() if t_nvtx: - nvtx_df = pd.read_csv(build_nsys_stats_name("nvtx_pushpop_trace")) - nvtx_df["domain"] = nvtx_df["Name"].str.split(":").str[0] + if MULTIREPORT: + for i, REPORT_FILE_I in enumerate(REPORTS_LIST): + nvtx_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], "nvtx_pushpop_trace"))) + nvtx_df[i]["domain"] = nvtx_df["Name"].str.split(":").str[0] + else: + nvtx_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, "nvtx_pushpop_trace")) + nvtx_df["domain"] = nvtx_df["Name"].str.split(":").str[0] else: nvtx_df = pd.DataFrame() if t_nvtx_startend: - with engine.connect() as conn, conn.begin(): - with open(os.path.join(os.path.dirname(__file__), 'scripts/nvtx_startend_trace.sql'), 'r') as query: - nvtx_startend_df = pd.read_sql_query(text(query.read()), conn) + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + ksi = NVTXStartEndSemantic(REPORT_FILE_I) + ksi.Setup() + ksi.load_data() + nvtx_startend_df.append(ksi.get_df()) + del ksi + else: + ks = NVTXStartEndSemantic(REPORT_FILE) + ks.Setup() + ks.load_data() + nvtx_startend_df = ks.get_df() + del ks else: nvtx_startend_df = pd.DataFrame() if t_mpi: - with engine.connect() as conn, 
conn.begin(): - try: - with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_p2p.sql'), 'r') as query: - if conn.dialect.has_table(connection=conn, table_name='MPI_P2P_EVENTS') and conn.dialect.has_table(connection=conn, table_name='MPI_START_WAIT_EVENTS'): - mpi_p2p_df = pd.read_sql_query(text(query.read()), conn) - mpi_p2p_df["event_type"] = MPITYPE_PTOP - else: mpi_p2p_df = pd.DataFrame() - with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_coll.sql'), 'r') as query: - if conn.dialect.has_table(connection=conn, table_name='MPI_COLLECTIVES_EVENTS'): - mpi_coll_df = pd.read_sql_query(text(query.read()), conn) - mpi_coll_df = mpi_coll_df.drop(mpi_coll_df[mpi_coll_df["Event"].str.contains("File") ].index) - mpi_coll_df["event_type"] = MPITYPE_COLLECTIVE - else: mpi_coll_df = pd.DataFrame() - with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_other.sql'), 'r') as query: - if conn.dialect.has_table(connection=conn, table_name='MPI_OTHER_EVENTS'): - mpi_other_df = pd.read_sql_query(text(query.read()), conn) - mpi_other_df = mpi_other_df.drop(mpi_other_df[mpi_other_df["Event"].str.contains("File") ].index) - mpi_other_df = mpi_other_df.drop(mpi_other_df[mpi_other_df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate") ].index) - mpi_other_df["event_type"] = MPITYPE_OTHER - else: mpi_other_df = pd.DataFrame() - with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_other.sql'), 'r') as query: - if conn.dialect.has_table(connection=conn, table_name='MPI_OTHER_EVENTS'): - mpi_rma_df = pd.read_sql_query(text(query.read()), conn) - mpi_rma_df = mpi_rma_df[mpi_rma_df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate")] - mpi_rma_df["event_type"] = MPITYPE_RMA - else: mpi_rma_df = pd.DataFrame() - with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_io.sql'), 'r') as query: - if conn.dialect.has_table(connection=conn, table_name='MPI_OTHER_EVENTS') and conn.dialect.has_table(connection=conn, 
table_name='MPI_COLLECTIVES_EVENTS'): - mpi_io_df = pd.read_sql_query(text(query.read()), conn) - mpi_io_df = mpi_io_df[mpi_io_df["Event"].str.contains("File")] - mpi_io_df["event_type"] = MPITYPE_IO - else: mpi_io_df = pd.DataFrame() - mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df]) - except OperationalError as oe: - print("There has been a problem fetching MPI information. MPI data will be skipped.") - print(f"[ERROR]: {oe.detail}") - t_mpi = False - #mpi_df = pd.read_csv(build_nsys_stats_name("mpi_event_trace")) + mpi_df = [] + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + kp2pi = MPIP2PSemantic(REPORT_FILE_I) + kp2pi.Setup() + kp2pi.load_data() + + kcolli = MPICollSemantic(REPORT_FILE_I) + kcolli.Setup() + kcolli.load_data() + + kotheri = MPIOtherSemantic(REPORT_FILE_I) + kotheri.Setup() + kotheri.load_data() + + krmai = MPIRMASemantic(REPORT_FILE_I) + krmai.Setup() + krmai.load_data() + + kioi = MPIIOPSemantic(REPORT_FILE_I) + kioi.Setup() + kioi.load_data() + + mpi_df.append(pd.concat([kp2pi.get_df(), kcolli.get_df(), kotheri.get_df(), krmai.get_df(), kioi.get_df()])) + del kp2pi, kcolli, kotheri, krmai, kioi + else: + kmpi = MPIP2PSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_p2p_df = kmpi.get_df() + + kmpi = MPICollSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_coll_df = kmpi.get_df() + + kmpi = MPIOtherSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_other_df = kmpi.get_df() + + kmpi = MPIRMASemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_rma_df = kmpi.get_df() + + kmpi = MPIIOPSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_io_df = kmpi.get_df() + mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df]) + del kmpi, mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df else: - #mpi_df = pd.DataFrame() - mpi_p2p_df = pd.DataFrame() - mpi_coll_df = pd.DataFrame() - mpi_other_df = pd.DataFrame() 
- mpi_rma_df = pd.DataFrame() - mpi_io_df = pd.DataFrame() - - # Obtain context Info - context_info = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") - if t_mpi: - mpi_query = "SELECT globalTid / 0x1000000 % 0x1000000 AS Pid, globalTid % 0x1000000 AS Tid, rank FROM MPI_RANKS;" - with engine.connect() as conn, conn.begin(): - rank_info = pd.read_sql_query(mpi_query, conn) - - context_info.sort_values(["processId"], inplace=True) + mpi_df = pd.DataFrame() + gpu_metrics_agg = [] if t_metrics: - gpu_metrics = pd.read_sql_table("GPU_METRICS", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") - metrics_description = pd.read_sql_table("TARGET_INFO_GPU_METRICS", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") - gpu_metrics.drop(gpu_metrics[gpu_metrics["timestamp"] < 0].index, inplace=True) # drop negative time - metrics_event_names = metrics_description.groupby(["metricId"]).agg({'metricName': 'first'}).reset_index() - metrics_event_names["metricId"] = metrics_event_names["metricId"] + event_type_metrics_base - #gpu_metrics["task"] = gpu_metrics.groupby(["typeId"]).ngroup() + 1 - gpu_metrics["deviceId"] = gpu_metrics["typeId"].apply(lambda x: x & 0xFF) - gpu_metrics_agg = gpu_metrics.groupby(["timestamp", "typeId"]).agg({'metricId': lambda x: list(x+event_type_metrics_base), - 'value': lambda x: list(x), - 'deviceId': 'first'}) - gpu_metrics_agg.reset_index(inplace=True) - + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + ksi = GPUMetricsSemantic(REPORT_FILE_I) + ksi.Setup() + ksi.load_data() + gpu_metrics_agg.append(ksi.get_df()) + del ksi + else: + ks = GPUMetricsSemantic(REPORT_FILE) + ks.Setup() + ks.load_data() + gpu_metrics_agg = ks.get_df() + del ks if t_openacc: with engine.connect() as conn, conn.begin(): @@ -272,6 +300,30 @@ def main(): openacc_event_kind = pd.read_sql_table("ENUM_OPENACC_EVENT_KIND", conn) + # Obtain context Info + list_contexts = [] + if MULTIREPORT: + for 
REPORT_FILE_I in REPORTS_LIST: + context_info_i = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite") + list_contexts.append(context_info_i) + context_info = pd.concat(list_contexts) + print(context_info) + else: + context_info = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") + context_info.sort_values(["processId"], inplace=True) + + if t_mpi: + if MULTIREPORT: + list_ranks = [] + for REPORT_FILE_I in REPORTS_LIST: + mpi_query = "SELECT globalTid / 0x1000000 % 0x1000000 AS Pid, globalTid % 0x1000000 AS Tid, rank FROM MPI_RANKS;" + engine = create_engine(f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite") + with engine.connect() as conn, conn.begin(): + list_ranks.append(pd.read_sql_query(mpi_query, conn)) + rank_info = pd.concat(list_ranks) + print(rank_info) + + assert(False) # # Building object model # ## Tasks and threads diff --git a/nsys2prv/semantics/__init__.py b/nsys2prv/semantics/__init__.py new file mode 100644 index 0000000..460fd05 --- /dev/null +++ b/nsys2prv/semantics/__init__.py @@ -0,0 +1,4 @@ +from .kernels_semantic import KernelsSemantic +from .mpi_semantic import * +from .nvtx_startend_semantic import NVTXStartEndSemantic +from .gpu_metrics_semantic import GPUMetricsSemantic \ No newline at end of file diff --git a/nsys2prv/semantics/gpu_metrics_semantic.py b/nsys2prv/semantics/gpu_metrics_semantic.py new file mode 100644 index 0000000..097682a --- /dev/null +++ b/nsys2prv/semantics/gpu_metrics_semantic.py @@ -0,0 +1,28 @@ +from .nsys_event import NsysEvent +from pandas import read_sql_table +from sqlalchemy import text + +event_type_metrics_base = 9400 + + +class GPUMetricsSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("GPU_METRICS"): + self.query = text("SELECT * FROM GPU_METRICS") + else: + self._empty = True + + def _preprocess(self): + 
metrics_description = read_sql_table("TARGET_INFO_GPU_METRICS", self._dbcon) + self._df.drop(self._df[self._df["timestamp"] < 0].index, inplace=True) # drop negative time + metrics_event_names = metrics_description.groupby(["metricId"]).agg({'metricName': 'first'}).reset_index() + metrics_event_names["metricId"] = metrics_event_names["metricId"] + event_type_metrics_base + self._df["deviceId"] = self._df["typeId"].apply(lambda x: x & 0xFF) + self._df = self._df.groupby(["timestamp", "typeId"]).agg({'metricId': lambda x: list(x+event_type_metrics_base), + 'value': lambda x: list(x), + 'deviceId': 'first'}) + self._df.reset_index(inplace=True) + return super()._preprocess() \ No newline at end of file diff --git a/nsys2prv/semantics/kernels_semantic.py b/nsys2prv/semantics/kernels_semantic.py new file mode 100644 index 0000000..804128e --- /dev/null +++ b/nsys2prv/semantics/kernels_semantic.py @@ -0,0 +1,12 @@ +from .nsys_event import NsysEvent +import os.path +from sqlalchemy import text + +class KernelsSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + with open(os.path.join(os.path.dirname(__file__), '../scripts/kernels.sql'), 'r') as query: + self.query = text(query.read()) + \ No newline at end of file diff --git a/nsys2prv/semantics/mpi_semantic.py b/nsys2prv/semantics/mpi_semantic.py new file mode 100644 index 0000000..a476376 --- /dev/null +++ b/nsys2prv/semantics/mpi_semantic.py @@ -0,0 +1,78 @@ +from .nsys_event import NsysEvent +import os.path +from .mpi_event_encoding import * +from sqlalchemy import text + +class MPIP2PSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table('MPI_P2P_EVENTS') and self.check_table('MPI_START_WAIT_EVENTS'): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_p2p.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + def _preprocess(self): + 
self._df["event_type"] = MPITYPE_PTOP + return super()._preprocess() + +class MPICollSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("MPI_COLLECTIVES_EVENTS"): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_coll.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + + def _preprocess(self): + self._df = self._df.drop(self._df[self._df["Event"].str.contains("File") ].index) + self._df["event_type"] = MPITYPE_COLLECTIVE + +class MPIOtherSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("MPI_OTHER_EVENTS"): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_other.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + + def _preprocess(self): + self._df = self._df.drop(self._df[self._df["Event"].str.contains("File") ].index) + self._df = self._df.drop(self._df[self._df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate") ].index) + self._df["event_type"] = MPITYPE_OTHER + +class MPIRMASemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("MPI_OTHER_EVENTS"): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_other.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + def _preprocess(self): + self._df = self._df[self._df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate")] + self._df["event_type"] = MPITYPE_RMA + +class MPIIOPSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("MPI_OTHER_EVENTS") and self.check_table("MPI_COLLECTIVES_EVENTS"): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_io.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + + def 
_preprocess(self): + self._df = self._df[self._df["Event"].str.contains("File")] + self._df["event_type"] = MPITYPE_IO \ No newline at end of file diff --git a/nsys2prv/semantics/nsys_event.py b/nsys2prv/semantics/nsys_event.py new file mode 100644 index 0000000..9813a14 --- /dev/null +++ b/nsys2prv/semantics/nsys_event.py @@ -0,0 +1,68 @@ +from sqlalchemy import create_engine, exc, inspect +import pandas as pd +import os.path + +class NsysEvent: + + class MissingDatabaseFile(Exception): + def __init__(self, filename): + super().__init__(f'Database file (unknown) does not exist.') + + class InvalidDatabaseFile(Exception): + def __init__(self, filename): + super().__init__(f'Database file (unknown) could not be opened and appears to be invalid.') + + class InvalidSQL(Exception): + def __init__(self, sql): + super().__init__(f'Bad SQL statement: {sql}') + + query = "SELECT 1 AS 'ONE'" + + def __init__(self, report) -> None: + self._dbcon = None + self._dbfile = f"{os.path.splitext(report)[0]}.sqlite" + self._df = pd.DataFrame() + self._empty = False + + if not os.path.exists(self._dbfile): + raise self.MissingDatabaseFile(self._dbfile) + + try: + self._dbcon = create_engine(f"sqlite:///{self._dbfile}") + except exc.SQLAlchemyError: + self._dbcon = None + raise self.InvalidDatabaseFile(self._dbfile) + + def check_table(self, table_name): + insp = inspect(self._dbcon) + return insp.has_table(table_name) + + def Setup(self): + pass + + def _preprocess(self): + pass + + def postprocess(self): + pass + + def load_data(self): + if not self._empty: + try: + self._df = pd.read_sql_query(self.query, self._dbcon) + except pd.errors.DatabaseError: + raise self.InvalidSQL(self.query) + self._preprocess() + + def apply_process_model(self, threads=pd.DataFrame, streams=pd.DataFrame): + self._df["thread"] = self._df["Tid"].map(threads.set_index('Tid')["thread"]) + self._df["task"] = self._df["Tid"].map(threads.set_index('Tid')["task"]) + if 'Rank' in threads.columns: + 
self._df["Rank"] = self._df["Tid"].map(threads.set_index('Tid')["Rank"]) + pass + + def get_threads(self): + return self._df[['Pid', 'Tid']].drop_duplicates() + + def get_df(self): + return self._df.copy() \ No newline at end of file diff --git a/nsys2prv/semantics/nvtx_startend_semantic.py b/nsys2prv/semantics/nvtx_startend_semantic.py new file mode 100644 index 0000000..88241b7 --- /dev/null +++ b/nsys2prv/semantics/nvtx_startend_semantic.py @@ -0,0 +1,11 @@ +from .nsys_event import NsysEvent +import os.path +from sqlalchemy import text + +class NVTXStartEndSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + with open(os.path.join(os.path.dirname(__file__), '../scripts/nvtx_startend_trace.sql'), 'r') as query: + self.query = text(query.read()) \ No newline at end of file -- GitLab From e512afd58ee86209f6153b912d2c8810d244c6a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Wed, 25 Sep 2024 16:56:26 +0200 Subject: [PATCH 3/9] Merges dataframes of multiple reports for different events into single one. Adapts process model construction to support multiple devices on different machines with same device ids. PROVISIONAL TRANSLATION OF MULTIREPORT, MULTINODE TRACE WORKING. 
--- nsys2prv/parse_nsys_stats.py | 56 ++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 5d108e0..4368380 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -194,9 +194,13 @@ def main(): for i, REPORT_FILE_I in enumerate(REPORTS_LIST): nvtx_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], "nvtx_pushpop_trace"))) nvtx_df[i]["domain"] = nvtx_df["Name"].str.split(":").str[0] + nvtx_df[i].rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) + else: nvtx_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, "nvtx_pushpop_trace")) nvtx_df["domain"] = nvtx_df["Name"].str.split(":").str[0] + nvtx_df.rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) + else: nvtx_df = pd.DataFrame() @@ -307,11 +311,14 @@ def main(): context_info_i = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite") list_contexts.append(context_info_i) context_info = pd.concat(list_contexts) - print(context_info) else: context_info = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") context_info.sort_values(["processId"], inplace=True) + ## CONTEXT INFO CHECK FOR MULTIREPORT + if context_info["deviceId"].unique().size == 0: + print(f"\033[93m Warning! Only one unique device ID can be detected in resource identification. If this is not intended, some features will not be available. Please, make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. 
\033[00m") + if t_mpi: if MULTIREPORT: list_ranks = [] @@ -321,9 +328,28 @@ def main(): with engine.connect() as conn, conn.begin(): list_ranks.append(pd.read_sql_query(mpi_query, conn)) rank_info = pd.concat(list_ranks) - print(rank_info) - assert(False) + + # MERGING AND SYNCHRONIZATION + if MULTIREPORT: + kernels_df = pd.concat(kernels_df) + if t_apicalls: + cuda_api_df = pd.concat(cuda_api_df) + + if t_nvtx: + nvtx_df = pd.concat(nvtx_df) + + if t_nvtx_startend: + nvtx_startend_df = pd.concat(nvtx_startend_df) + + if t_mpi: + mpi_df = pd.concat(mpi_df) + + #if t_metrics: + + #if t_openacc: + + # # Building object model # ## Tasks and threads @@ -335,7 +361,6 @@ def main(): if t_mpi: print("MPI calls unique processes: {}, and unique threads: {}".format(mpi_df["Pid"].unique(), mpi_df["Tid"].unique())) if t_openacc: print("OpenACC calls unique processes: {}, and unique threads: {}".format(openacc_other_df["Pid"].unique(), openacc_other_df["Tid"].unique())) - if t_nvtx: nvtx_df.rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) compute_threads_with = [] if t_apicalls: compute_threads_with.append(cuda_api_df[['Pid', 'Tid']]) @@ -361,6 +386,7 @@ def main(): 'Tid': lambda x: set(x), 'thread': 'count', 'device': 'first' }) + print(tasks_set) cuda_api_df["thread"] = 0 cuda_api_df["task"] = 0 @@ -418,41 +444,44 @@ def main(): streams = kernels_df[['Device', 'Strm', 'deviceid', 'Pid']].drop_duplicates() - streams["thread"] = streams.groupby(["Device"]).cumcount() + 1 + streams["thread"] = streams.groupby(["Pid", "Device"]).cumcount() + 1 #streams["deviceid"] = streams.sort_values("Device").groupby(["Device"]).ngroup() #streams["Pid"] = streams["deviceid"].map(tasks_set.set_index("device")["Pid"]) - streams["task"] = streams["deviceid"].map(tasks_set.reset_index().set_index("device")["task"]) + streams["task"] = streams["Pid"].map(tasks_set.reset_index().set_index("Pid")["task"]) streams['row_name'] = 'CUDA-D'+streams['deviceid'].astype(str) + '.S' + 
streams['Strm'].astype(str) num_streams = streams.count().iloc[0] streams.sort_values(["Pid", "thread"], inplace=True) streams.reset_index(inplace=True) - devices_set = streams.groupby(["deviceid"]).agg({'Device': 'first', + devices_set = streams.groupby(["Pid", "deviceid"]).agg({'Device': 'first', 'Strm': lambda x: set(x), 'thread': 'count', 'task': 'first', 'Pid': 'last'}) + print(devices_set) # Here we finally update the threadId we are going to put in the event record of kernel executions to respect the normal threads before CUDA streams num_normal_threads = tasks_set['thread'] num_normal_threads_repeated = num_normal_threads.repeat(devices_set["thread"]).reset_index()[["thread"]] - streams['thread'] = streams['thread'] + num_normal_threads_repeated["thread"] + # for index,row in kernels_df.iterrows(): # kernels_df.at[index, "thread"] = streams.at[(streams["Strm"] == row["Strm"]).idxmax(), "thread"] # kernels_df.at[index, "deviceid"] = streams.at[(streams["Device"] == row["Device"]).idxmax(), "deviceid"] # More efficient way by chatgpt # First, let's filter streams DataFrame based on conditions - filtered_streams = streams.groupby(["Device", "Strm"]).agg({'thread':'first', 'task':'first'}).reset_index() + filtered_streams = streams.groupby(["Pid", "Strm"]).agg({'thread':'first', 'task':'first'}).reset_index() # Now, merge the filtered streams DataFrame with kernels_df - result_df = kernels_df.merge(filtered_streams, how='left', on=['Device', 'Strm']) + result_df = kernels_df.merge(filtered_streams, how='left', on=["Pid", 'Strm']) + # Copy the results back to kernels_df - kernels_df['thread'] = result_df['thread'] - kernels_df['task'] = result_df['task'] + kernels_df['thread'] = result_df['thread'].to_numpy() + kernels_df['task'] = result_df['task'].to_numpy() + # Add auxiliary stream to streams dataframe if t_metrics: @@ -470,9 +499,6 @@ def main(): # ## Writing ROW file # Now we can write the _row_ file with this information - print(tasks_set) - 
print(devices_set) - print(" -Writing resource model to row file...") row_df = pd.concat([threads[["thread", "task", "row_name"]], streams[["thread", "task", "row_name"]]]) -- GitLab From 1dc011a98bc236cf4900c13920d3304e7a07a88b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Fri, 27 Sep 2024 13:39:35 +0200 Subject: [PATCH 4/9] Solves problem that removed some kernels when merging different reports. Now concatenating dataframes without indexes. --- nsys2prv/parse_nsys_stats.py | 71 +++++--- parser-playground.ipynb | 310 ++++++++++++++++++++--------------- 2 files changed, 229 insertions(+), 152 deletions(-) diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 4368380..0490046 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -8,6 +8,7 @@ import time import subprocess import os import locale +from functools import reduce from sqlalchemy import create_engine, text, dialects from sqlalchemy.exc import OperationalError from .EventWriter import event_writer as ewr @@ -155,7 +156,7 @@ def main(): else: nsi.call_stats(REPORT_FILE) - #assert(False) + # MARK: IMPORT DATASETS print("Importing datasets...") # kernels_df = pd.read_csv(build_nsys_stats_name("cuda_gpu_trace")) @@ -166,11 +167,13 @@ def main(): kernels_df = [] if MULTIREPORT: + sum = 0 for REPORT_FILE_I in REPORTS_LIST: ksi = KernelsSemantic(REPORT_FILE_I) ksi.Setup() ksi.load_data() kernels_df.append(ksi.get_df()) + sum += ksi.get_df().shape[0] del ksi else: ks = KernelsSemantic(REPORT_FILE) @@ -190,10 +193,11 @@ def main(): cuda_api_df = pd.DataFrame() if t_nvtx: + nvtx_df = [] if MULTIREPORT: for i, REPORT_FILE_I in enumerate(REPORTS_LIST): nvtx_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], "nvtx_pushpop_trace"))) - nvtx_df[i]["domain"] = nvtx_df["Name"].str.split(":").str[0] + nvtx_df[i]["domain"] = nvtx_df[i]["Name"].str.split(":").str[0] nvtx_df[i].rename(columns={"PID":"Pid", "TID":"Tid"}, 
inplace=True) else: @@ -205,6 +209,7 @@ def main(): nvtx_df = pd.DataFrame() if t_nvtx_startend: + nvtx_startend_df = [] if MULTIREPORT: for REPORT_FILE_I in REPORTS_LIST: ksi = NVTXStartEndSemantic(REPORT_FILE_I) @@ -245,7 +250,7 @@ def main(): kioi.Setup() kioi.load_data() - mpi_df.append(pd.concat([kp2pi.get_df(), kcolli.get_df(), kotheri.get_df(), kotheri.get_df(), krmai.get_df(), kioi.get_df()])) + mpi_df.append(pd.concat([kp2pi.get_df(), kcolli.get_df(), kotheri.get_df(), kotheri.get_df(), krmai.get_df(), kioi.get_df()], ignore_index=True)) del kp2pi, kcolli, kotheri, krmai, kioi else: kmpi = MPIP2PSemantic(REPORT_FILE) @@ -272,7 +277,7 @@ def main(): kmpi.Setup() kmpi.load_data() mpi_io_df = kmpi.get_df() - mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df]) + mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df], ignore_index=True) del kmpi, mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df else: mpi_df = pd.DataFrame() @@ -304,7 +309,7 @@ def main(): openacc_event_kind = pd.read_sql_table("ENUM_OPENACC_EVENT_KIND", conn) - # Obtain context Info + # MARK: CONTEXT INFO list_contexts = [] if MULTIREPORT: for REPORT_FILE_I in REPORTS_LIST: @@ -315,7 +320,7 @@ def main(): context_info = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") context_info.sort_values(["processId"], inplace=True) - ## CONTEXT INFO CHECK FOR MULTIREPORT + # CONTEXT INFO CHECK FOR MULTIREPORT if context_info["deviceId"].unique().size == 0: print(f"\033[93m Warning! Only one unique device ID can be detected in resource identification. If this is not intended, some features will not be available. Please, make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. 
\033[00m") @@ -330,33 +335,57 @@ def main(): rank_info = pd.concat(list_ranks) - # MERGING AND SYNCHRONIZATION + # MARK: MERGING AND ALIGNING if MULTIREPORT: - kernels_df = pd.concat(kernels_df) + # Find delta between earliest trace start and the others + session_time = [] + for REPORT_FILE_I in REPORTS_LIST: + session_time.append(pd.read_sql_table("TARGET_INFO_SESSION_START_TIME", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite")) + + session_time = [x.iloc[0,0] for x in session_time] # Get the utcEpochNs + earliest_time = reduce(lambda x, y: min(x, y), session_time, float('inf')) + deltas = [start - earliest_time for start in session_time] + for i, df in enumerate(kernels_df): + df['Start (ns)'] += deltas[i] + kernels_df = pd.concat(kernels_df, ignore_index=True) + print(f"After concat: {kernels_df.shape}") + if t_apicalls: - cuda_api_df = pd.concat(cuda_api_df) + for i, df in enumerate(cuda_api_df): + df['Start (ns)'] += deltas[i] + + cuda_api_df = pd.concat(cuda_api_df, ignore_index=True) if t_nvtx: - nvtx_df = pd.concat(nvtx_df) + for i, df in enumerate(nvtx_df): + df['Start (ns)'] += deltas[i] + df['End (ns)'] += deltas[i] + nvtx_df = pd.concat(nvtx_df, ignore_index=True) if t_nvtx_startend: - nvtx_startend_df = pd.concat(nvtx_startend_df) + for i, df in enumerate(nvtx_startend_df): + df['Start (ns)'] += deltas[i] + df['End (ns)'] += deltas[i] + nvtx_startend_df = pd.concat(nvtx_startend_df, ignore_index=True) if t_mpi: - mpi_df = pd.concat(mpi_df) + for i, df in enumerate(mpi_df): + df['Start:ts_ns'] += deltas[i] + df['End:ts_ns'] += deltas[i] + mpi_df = pd.concat(mpi_df, ignore_index=True) #if t_metrics: #if t_openacc: - - - # # Building object model + + + # MARK: PROCESS MODEL # ## Tasks and threads # Now, find unique appearences of ThreadID and ProcessID if t_apicalls: print("CUDA calls unique processes: {}, and unique threads: {}".format(cuda_api_df["Pid"].unique(), cuda_api_df["Tid"].unique())) - if t_nvtx: print("NVTX ranges unique processes: 
{}, and unique threads: {}".format(nvtx_df["PID"].unique(), nvtx_df["TID"].unique())) + if t_nvtx: print("NVTX ranges unique processes: {}, and unique threads: {}".format(nvtx_df["Pid"].unique(), nvtx_df["Tid"].unique())) if t_nvtx_startend: print("NVTX startend unique processes: {}, and unique threads: {}".format(nvtx_startend_df["Pid"].unique(), nvtx_startend_df["Tid"].unique())) if t_mpi: print("MPI calls unique processes: {}, and unique threads: {}".format(mpi_df["Pid"].unique(), mpi_df["Tid"].unique())) if t_openacc: print("OpenACC calls unique processes: {}, and unique threads: {}".format(openacc_other_df["Pid"].unique(), openacc_other_df["Tid"].unique())) @@ -482,7 +511,6 @@ def main(): kernels_df['thread'] = result_df['thread'].to_numpy() kernels_df['task'] = result_df['task'].to_numpy() - # Add auxiliary stream to streams dataframe if t_metrics: aux_streams = devices_set.reset_index()[["deviceid", "Device", "thread", "task"]] @@ -515,7 +543,7 @@ def main(): row_file.write("\n") - # # Collecting event values + # MARK: EVENT NAMES # Second step is collect all different event values for CUDA API calls, kernel names, and NVTX ranges. Each of these define a different event type, and will need unique identifiers to be used as a event values. Finally these needs to be dumped to the PCF file. 
print("Collecting event names and information...") @@ -545,7 +573,7 @@ def main(): kernel_names["Name"] = kernel_names["Name"].apply(lambda x: x.replace("[", "").replace("]", "")) if t_nvtx: - nvtx_df_subset = nvtx_df + nvtx_df_subset = nvtx_df.reset_index() lower_level = max(nvtx_df["Lvl"]) if nvtx_select_frames: @@ -846,9 +874,9 @@ GRADIENT_NAMES pcf_file.write("{} {}\n".format(row["func_value"], row["func"])) pcf_file.write("\n") + # MARK: MEMORY # # Split of kernel execution between compute and memory - memops_names = ["[CUDA memcpy Device-to-Device]", "[CUDA memcpy Device-to-Host]", "[CUDA memcpy Host-to-Device]", "[CUDA memset]", "[CUDA memcpy Peer-to-Peer]"] memops_df = kernels_df.loc[kernels_df["Name"].isin(memops_names)] mask = ~kernels_df.index.isin(memops_df.index) @@ -860,6 +888,7 @@ GRADIENT_NAMES comm_memory_df = cuda_api_df.merge(memops_df, how="inner", left_on=["CorrID", "task"], right_on=["CorrID", "task"], suffixes=("_call", "_mem"), validate="one_to_one") + # MARK: TIMELINE RECONS # # Timeline reconstruction print("Reconstructing timeline...") @@ -979,6 +1008,8 @@ GRADIENT_NAMES print(f"Congratulations! 
Trace {trace_name}.prv correctly translated.") + + # MARK: POSTPROCESSING # ## Postprocessing # - Reorder trace # - GZip trace diff --git a/parser-playground.ipynb b/parser-playground.ipynb index 10a2412..cb91839 100644 --- a/parser-playground.ipynb +++ b/parser-playground.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -14,6 +14,10 @@ "import locale\n", "import sqlite3\n", "from sqlalchemy import create_engine, text\n", + "from nsys2prv.EventWriter import event_writer as ewr\n", + "from nsys2prv.NSYSInterface import NSYSInterface\n", + "from nsys2prv.semantics.mpi_event_encoding import *\n", + "from nsys2prv.semantics import *\n", "\n", "NSYS_HOME = os.path.abspath(\"/home/mclasca/Apps/nsight-system/2024.5.1/\")\n", "#NSIGHT_HOME = os.getenv('NSIGHT_HOME')\n", @@ -27,6 +31,12 @@ "#REPORT_NAME=\"heka-step53+accum1-profile-2023.4-5721957\"\n", "#REPORT_NAME=\"heka-axolotl-Mistral7B0.1-profile-2110598\"\n", "\n", + "MULTIREPORT = True\n", + "if MULTIREPORT:\n", + " REPORTS_LIST = [os.path.abspath(x) for x in [\"/home/mclasca/Documents/BePPP/heka/proves/multi_2nodes/sod2d_0.nsys-rep\", \"/home/mclasca/Documents/BePPP/heka/proves/multi_2nodes/sod2d_1.nsys-rep\", \"/home/mclasca/Documents/BePPP/heka/proves/multi_2nodes/sod2d_2.nsys-rep\"]]\n", + " REPORT_DIRS_LIST = [os.path.dirname(x) for x in REPORTS_LIST]\n", + " REPORT_FILE = REPORTS_LIST[0] # For fast checks, it's best to have a reference report\n", + "\n", "locale.setlocale(locale.LC_ALL, '')\n", "\n", "trace_name = \"test-heka\"\n", @@ -69,6 +79,7 @@ "nvtx_stack_bottom = 4\n", "\n", "reports = [\"cuda_api_trace\", \"cuda_gpu_trace\"]\n", + "nsi = NSYSInterface(reports, False, NVTX_RANGE, False)\n", "\n", "def build_nsys_stats_name(report_name):\n", " base_name = os.path.splitext(os.path.basename(REPORT_FILE))[0]\n", @@ -684,6 +695,43 @@ " raise ChildProcessError(p.returncode, p.args)" ] }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cuda_api_df = []\n", + "if MULTIREPORT:\n", + " for i, REPORT_FILE_I in enumerate(REPORTS_LIST):\n", + " cuda_api_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], \"cuda_api_trace\")))\n", + "else:\n", + " cuda_api_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, \"cuda_api_trace\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "kernels_df = []\n", + "if MULTIREPORT:\n", + " sum = 0\n", + " for REPORT_FILE_I in REPORTS_LIST:\n", + " ksi = KernelsSemantic(REPORT_FILE_I)\n", + " ksi.Setup()\n", + " ksi.load_data()\n", + " kernels_df.append(ksi.get_df())\n", + " sum += ksi.get_df().shape[0]\n", + " del ksi\n", + "else:\n", + " ks = KernelsSemantic(REPORT_FILE)\n", + " ks.Setup()\n", + " ks.load_data()\n", + " kernels_df = ks.get_df()" + ] + }, { "cell_type": "code", "execution_count": 8, @@ -711,7 +759,7 @@ " \n", " \n", " Start (ns)\n", - " Duration (ns)\n", + " Duration:dur_ns\n", " CorrID\n", " GrdX\n", " GrdY\n", @@ -721,12 +769,12 @@ " BlkZ\n", " Reg/Trd\n", " ...\n", - " DymSMem (MB)\n", - " Bytes (MB)\n", - " Throughput (MB/s)\n", + " Throughput:thru_B\n", " SrcMemKd\n", " DstMemKd\n", " Device\n", + " deviceid\n", + " Pid\n", " Ctx\n", " GreenCtx\n", " Strm\n", @@ -736,8 +784,8 @@ " \n", " \n", " 0\n", - " 1307261431\n", - " 992\n", + " 1271556323\n", + " 960\n", " 688\n", " NaN\n", " NaN\n", @@ -747,21 +795,21 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 32,258\n", + " 33333312.0\n", " Pageable\n", " Device\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Host-to-Device]\n", " \n", " \n", " 1\n", - " 1662416719\n", - " 960\n", + " 1626687401\n", + " 896\n", " 1285\n", " NaN\n", " NaN\n", @@ -771,20 +819,20 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - 
" 0,000\n", - " 133,333\n", + " 142857088.0\n", " Pageable\n", " Device\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Host-to-Device]\n", " \n", " \n", " 2\n", - " 1662448494\n", + " 1626718377\n", " 736\n", " 1291\n", " NaN\n", @@ -795,21 +843,21 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 173,913\n", + " 173912960.0\n", " Pageable\n", " Device\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Host-to-Device]\n", " \n", " \n", " 3\n", - " 1662469326\n", - " 768\n", + " 1626741640\n", + " 704\n", " 1297\n", " NaN\n", " NaN\n", @@ -819,21 +867,21 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 166,667\n", + " 181818112.0\n", " Pageable\n", " Device\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Host-to-Device]\n", " \n", " \n", " 4\n", - " 1662702764\n", - " 704\n", + " 1626968327\n", + " 736\n", " 1327\n", " NaN\n", " NaN\n", @@ -843,14 +891,14 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 34,091\n", + " 38043460.0\n", " Pageable\n", " Device\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Host-to-Device]\n", " \n", @@ -879,10 +927,10 @@ " ...\n", " \n", " \n", - " 6645\n", - " 11824673327\n", - " 1378516\n", - " 68613\n", + " 7317\n", + " 11788939184\n", + " 1375193\n", + " 78406\n", " 65535.0\n", " 1.0\n", " 1.0\n", @@ -891,22 +939,22 @@ " 1.0\n", " 36.0\n", " ...\n", - " 0,001\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NVIDIA H100 (1)\n", + " None\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " mod_time_ops_adapt_dt_cfl_32_gpu\n", " \n", " \n", - " 6646\n", - " 11826052707\n", - " 99903\n", - " 
68614\n", + " 7318\n", + " 11790315529\n", + " 99840\n", + " 78407\n", " 3.0\n", " 1.0\n", " 1.0\n", @@ -915,22 +963,22 @@ " 1.0\n", " 18.0\n", " ...\n", - " 0,001\n", - " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NVIDIA H100 (1)\n", + " None\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " mod_time_ops_adapt_dt_cfl_32_gpu__red\n", " \n", " \n", - " 6647\n", - " 11826167106\n", - " 2176\n", - " 68616\n", + " 7319\n", + " 11790426664\n", + " 2272\n", + " 78409\n", " NaN\n", " NaN\n", " NaN\n", @@ -939,22 +987,22 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 1,838\n", + " 1760560.0\n", " Device\n", " Pageable\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Device-to-Host]\n", " \n", " \n", - " 6648\n", - " 11826178754\n", - " 2176\n", - " 68617\n", + " 7320\n", + " 11790437737\n", + " 2304\n", + " 78410\n", " NaN\n", " NaN\n", " NaN\n", @@ -963,22 +1011,22 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 1,838\n", + " 1736108.0\n", " Device\n", " Pageable\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Device-to-Host]\n", " \n", " \n", - " 6649\n", - " 11826190114\n", - " 2176\n", - " 68618\n", + " 7321\n", + " 11790448584\n", + " 2240\n", + " 78411\n", " NaN\n", " NaN\n", " NaN\n", @@ -987,61 +1035,61 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 1,838\n", + " 1785712.0\n", " Device\n", " Pageable\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Device-to-Host]\n", " \n", " \n", "\n", - "

6650 rows × 21 columns

\n", + "

7322 rows × 23 columns

\n", "" ], "text/plain": [ - " Start (ns) Duration (ns) CorrID GrdX GrdY GrdZ BlkX BlkY \\\n", - "0 1307261431 992 688 NaN NaN NaN NaN NaN \n", - "1 1662416719 960 1285 NaN NaN NaN NaN NaN \n", - "2 1662448494 736 1291 NaN NaN NaN NaN NaN \n", - "3 1662469326 768 1297 NaN NaN NaN NaN NaN \n", - "4 1662702764 704 1327 NaN NaN NaN NaN NaN \n", - "... ... ... ... ... ... ... ... ... \n", - "6645 11824673327 1378516 68613 65535.0 1.0 1.0 32.0 1.0 \n", - "6646 11826052707 99903 68614 3.0 1.0 1.0 256.0 1.0 \n", - "6647 11826167106 2176 68616 NaN NaN NaN NaN NaN \n", - "6648 11826178754 2176 68617 NaN NaN NaN NaN NaN \n", - "6649 11826190114 2176 68618 NaN NaN NaN NaN NaN \n", + " Start (ns) Duration:dur_ns CorrID GrdX GrdY GrdZ BlkX BlkY \\\n", + "0 1271556323 960 688 NaN NaN NaN NaN NaN \n", + "1 1626687401 896 1285 NaN NaN NaN NaN NaN \n", + "2 1626718377 736 1291 NaN NaN NaN NaN NaN \n", + "3 1626741640 704 1297 NaN NaN NaN NaN NaN \n", + "4 1626968327 736 1327 NaN NaN NaN NaN NaN \n", + "... ... ... ... ... ... ... ... ... \n", + "7317 11788939184 1375193 78406 65535.0 1.0 1.0 32.0 1.0 \n", + "7318 11790315529 99840 78407 3.0 1.0 1.0 256.0 1.0 \n", + "7319 11790426664 2272 78409 NaN NaN NaN NaN NaN \n", + "7320 11790437737 2304 78410 NaN NaN NaN NaN NaN \n", + "7321 11790448584 2240 78411 NaN NaN NaN NaN NaN \n", "\n", - " BlkZ Reg/Trd ... DymSMem (MB) Bytes (MB) Throughput (MB/s) SrcMemKd \\\n", - "0 NaN NaN ... NaN 0,000 32,258 Pageable \n", - "1 NaN NaN ... NaN 0,000 133,333 Pageable \n", - "2 NaN NaN ... NaN 0,000 173,913 Pageable \n", - "3 NaN NaN ... NaN 0,000 166,667 Pageable \n", - "4 NaN NaN ... NaN 0,000 34,091 Pageable \n", - "... ... ... ... ... ... ... ... \n", - "6645 1.0 36.0 ... 0,001 NaN NaN NaN \n", - "6646 1.0 18.0 ... 0,001 NaN NaN NaN \n", - "6647 NaN NaN ... NaN 0,000 1,838 Device \n", - "6648 NaN NaN ... NaN 0,000 1,838 Device \n", - "6649 NaN NaN ... NaN 0,000 1,838 Device \n", + " BlkZ Reg/Trd ... 
Throughput:thru_B SrcMemKd DstMemKd \\\n", + "0 NaN NaN ... 33333312.0 Pageable Device \n", + "1 NaN NaN ... 142857088.0 Pageable Device \n", + "2 NaN NaN ... 173912960.0 Pageable Device \n", + "3 NaN NaN ... 181818112.0 Pageable Device \n", + "4 NaN NaN ... 38043460.0 Pageable Device \n", + "... ... ... ... ... ... ... \n", + "7317 1.0 36.0 ... NaN None None \n", + "7318 1.0 18.0 ... NaN None None \n", + "7319 NaN NaN ... 1760560.0 Device Pageable \n", + "7320 NaN NaN ... 1736108.0 Device Pageable \n", + "7321 NaN NaN ... 1785712.0 Device Pageable \n", "\n", - " DstMemKd Device Ctx GreenCtx Strm \\\n", - "0 Device NVIDIA H100 (1) 1 NaN 16 \n", - "1 Device NVIDIA H100 (1) 1 NaN 16 \n", - "2 Device NVIDIA H100 (1) 1 NaN 16 \n", - "3 Device NVIDIA H100 (1) 1 NaN 16 \n", - "4 Device NVIDIA H100 (1) 1 NaN 16 \n", - "... ... ... .. ... ... \n", - "6645 NaN NVIDIA H100 (1) 1 NaN 16 \n", - "6646 NaN NVIDIA H100 (1) 1 NaN 16 \n", - "6647 Pageable NVIDIA H100 (1) 1 NaN 16 \n", - "6648 Pageable NVIDIA H100 (1) 1 NaN 16 \n", - "6649 Pageable NVIDIA H100 (1) 1 NaN 16 \n", + " Device deviceid Pid Ctx GreenCtx Strm \\\n", + "0 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "1 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "2 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "3 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "4 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "... ... ... ... .. ... ... \n", + "7317 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "7318 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "7319 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "7320 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "7321 NVIDIA H100 (2) 2 2308062 1 None 16 \n", "\n", " Name \n", "0 [CUDA memcpy Host-to-Device] \n", @@ -1050,13 +1098,13 @@ "3 [CUDA memcpy Host-to-Device] \n", "4 [CUDA memcpy Host-to-Device] \n", "... ... 
\n", - "6645 mod_time_ops_adapt_dt_cfl_32_gpu \n", - "6646 mod_time_ops_adapt_dt_cfl_32_gpu__red \n", - "6647 [CUDA memcpy Device-to-Host] \n", - "6648 [CUDA memcpy Device-to-Host] \n", - "6649 [CUDA memcpy Device-to-Host] \n", + "7317 mod_time_ops_adapt_dt_cfl_32_gpu \n", + "7318 mod_time_ops_adapt_dt_cfl_32_gpu__red \n", + "7319 [CUDA memcpy Device-to-Host] \n", + "7320 [CUDA memcpy Device-to-Host] \n", + "7321 [CUDA memcpy Device-to-Host] \n", "\n", - "[6650 rows x 21 columns]" + "[7322 rows x 23 columns]" ] }, "execution_count": 8, @@ -1065,9 +1113,7 @@ } ], "source": [ - "kernels_df = pd.read_csv(build_nsys_stats_name(\"cuda_gpu_trace\"))\n", - "kernels_df.rename(columns={\"CorrId\": \"CorrID\"}, inplace=True)\n", - "kernels_df" + "kernels_df[2]" ] }, { -- GitLab From 59ea3ffdb70d7b58ed7f8ed532c0e93b231e1341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Fri, 27 Sep 2024 14:26:13 +0200 Subject: [PATCH 5/9] Adds OpenACC to multireport construction --- nsys2prv/parse_nsys_stats.py | 56 +- nsys2prv/semantics/__init__.py | 3 +- nsys2prv/semantics/openacc_semantic.py | 27 + parser-playground.ipynb | 1368 ++++++++++++++++++------ 4 files changed, 1116 insertions(+), 338 deletions(-) create mode 100644 nsys2prv/semantics/openacc_semantic.py diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 0490046..80f763b 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -299,14 +299,39 @@ def main(): del ks if t_openacc: - with engine.connect() as conn, conn.begin(): - with open(os.path.join(os.path.dirname(__file__), 'scripts/openacc_other.sql'), 'r') as query: - openacc_other_df = pd.read_sql_query(text(query.read()), conn) - with open(os.path.join(os.path.dirname(__file__), 'scripts/openacc_launch.sql'), 'r') as query: - openacc_launch_df = pd.read_sql_query(text(query.read()), conn) - with open(os.path.join(os.path.dirname(__file__), 'scripts/openacc_data.sql'), 'r') as query: - openacc_data_df = 
pd.read_sql_query(text(query.read()), conn) - openacc_event_kind = pd.read_sql_table("ENUM_OPENACC_EVENT_KIND", conn) + if MULTIREPORT: + openacc_other_df = [] + openacc_launch_df = [] + openacc_data_df = [] + for REPORT_FILE_I in REPORTS_LIST: + ksio = OpenACCOtherSemantic(REPORT_FILE_I) + ksio.Setup() + ksio.load_data() + openacc_other_df.append(ksio.get_df()) + ksil = OpenACCLaunchSemantic(REPORT_FILE_I) + ksil.Setup() + ksil.load_data() + openacc_launch_df.append(ksil.get_df()) + ksid = OpenACCDataSemantic(REPORT_FILE_I) + ksid.Setup() + ksid.load_data() + openacc_data_df.append(ksid.get_df()) + del ksio, ksil, ksid + else: + kso = OpenACCOtherSemantic(REPORT_FILE_I) + kso.Setup() + kso.load_data() + openacc_other_df = kso.get_df() + ksl = OpenACCLaunchSemantic(REPORT_FILE_I) + ksl.Setup() + ksl.load_data() + openacc_launch_df = ksl.get_df() + ksd = OpenACCDataSemantic(REPORT_FILE_I) + ksd.Setup() + ksd.load_data() + openacc_data_df = ksd.get_df() + del kso, ksl, ksd + openacc_event_kind = pd.read_sql_table("ENUM_OPENACC_EVENT_KIND", f"sqlite:///{os.path.splitext(REPORTS_LIST[0])[0]}.sqlite") # MARK: CONTEXT INFO @@ -374,9 +399,22 @@ def main(): df['End:ts_ns'] += deltas[i] mpi_df = pd.concat(mpi_df, ignore_index=True) + if t_openacc: + for i, df in enumerate(openacc_other_df): + df['start'] += deltas[i] + df['end'] += deltas[i] + for i, df in enumerate(openacc_launch_df): + df['start'] += deltas[i] + df['end'] += deltas[i] + for i, df in enumerate(openacc_data_df): + df['start'] += deltas[i] + df['end'] += deltas[i] + openacc_other_df = pd.concat(openacc_other_df, ignore_index=True) + openacc_launch_df = pd.concat(openacc_launch_df, ignore_index=True) + openacc_data_df = pd.concat(openacc_data_df, ignore_index=True) + #if t_metrics: - #if t_openacc: # MARK: PROCESS MODEL diff --git a/nsys2prv/semantics/__init__.py b/nsys2prv/semantics/__init__.py index 460fd05..1976280 100644 --- a/nsys2prv/semantics/__init__.py +++ b/nsys2prv/semantics/__init__.py @@ -1,4 
+1,5 @@ from .kernels_semantic import KernelsSemantic from .mpi_semantic import * from .nvtx_startend_semantic import NVTXStartEndSemantic -from .gpu_metrics_semantic import GPUMetricsSemantic \ No newline at end of file +from .gpu_metrics_semantic import GPUMetricsSemantic +from .openacc_semantic import * \ No newline at end of file diff --git a/nsys2prv/semantics/openacc_semantic.py b/nsys2prv/semantics/openacc_semantic.py new file mode 100644 index 0000000..1059719 --- /dev/null +++ b/nsys2prv/semantics/openacc_semantic.py @@ -0,0 +1,27 @@ +from .nsys_event import NsysEvent +import os.path +from sqlalchemy import text + +class OpenACCOtherSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + with open(os.path.join(os.path.dirname(__file__), '../scripts/openacc_other.sql'), 'r') as query: + self.query = text(query.read()) + +class OpenACCLaunchSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + with open(os.path.join(os.path.dirname(__file__), '../scripts/openacc_launch.sql'), 'r') as query: + self.query = text(query.read()) + +class OpenACCDataSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + with open(os.path.join(os.path.dirname(__file__), '../scripts/openacc_data.sql'), 'r') as query: + self.query = text(query.read()) \ No newline at end of file diff --git a/parser-playground.ipynb b/parser-playground.ipynb index cb91839..c4b2ad5 100644 --- a/parser-playground.ipynb +++ b/parser-playground.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -697,10 +697,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "list_contexts = []\n", + "if MULTIREPORT:\n", + " for REPORT_FILE_I in REPORTS_LIST:\n", + " context_info_i = 
pd.read_sql_table(\"TARGET_INFO_CUDA_CONTEXT_INFO\", f\"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite\")\n", + " list_contexts.append(context_info_i)\n", + " context_info = pd.concat(list_contexts)\n", + "else:\n", + " context_info = pd.read_sql_table(\"TARGET_INFO_CUDA_CONTEXT_INFO\", f\"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite\")\n", + "context_info.sort_values([\"processId\"], inplace=True)\n", + "\n", + "# CONTEXT INFO CHECK FOR MULTIREPORT\n", + "if context_info[\"deviceId\"].unique().size == 0:\n", + " print(f\"\\033[93m Warning! Only one unique device ID can be detected in resource identification. If this is not intended, some features will not be available. Please, make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. \\033[00m\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ + "t_apicalls = True\n", "cuda_api_df = []\n", "if MULTIREPORT:\n", " for i, REPORT_FILE_I in enumerate(REPORTS_LIST):\n", @@ -734,7 +756,66 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "After concat: (21299, 23)\n" + ] + } + ], + "source": [ + "from functools import reduce\n", + "# MARK: MERGING AND ALIGNING\n", + "if MULTIREPORT:\n", + " # Find delta between earliest trace start and the others\n", + " session_time = []\n", + " for REPORT_FILE_I in REPORTS_LIST:\n", + " session_time.append(pd.read_sql_table(\"TARGET_INFO_SESSION_START_TIME\", f\"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite\"))\n", + " \n", + " session_time = [x.iloc[0,0] for x in session_time] # Get the utcEpochNs\n", + " earliest_time = reduce(lambda x, y: min(x, y), session_time, float('inf'))\n", + " deltas = [start - earliest_time for start in session_time]\n", + " for i, df in enumerate(kernels_df):\n", + " df['Start 
(ns)'] += deltas[i]\n", + " kernels_df = pd.concat(kernels_df)\n", + " print(f\"After concat: {kernels_df.shape}\")\n", + "\n", + " if t_apicalls:\n", + " for i, df in enumerate(cuda_api_df):\n", + " df['Start (ns)'] += deltas[i]\n", + "\n", + " cuda_api_df = pd.concat(cuda_api_df)\n", + "\n", + " # if t_nvtx:\n", + " # for i, df in enumerate(nvtx_df):\n", + " # df['Start (ns)'] += deltas[i]\n", + " # df['End (ns)'] += deltas[i]\n", + " # nvtx_df = pd.concat(nvtx_df)\n", + "\n", + " # if t_nvtx_startend:\n", + " # for i, df in enumerate(nvtx_startend_df):\n", + " # df['Start (ns)'] += deltas[i]\n", + " # df['End (ns)'] += deltas[i]\n", + " # nvtx_startend_df = pd.concat(nvtx_startend_df)\n", + "\n", + " # if t_mpi:\n", + " # for i, df in enumerate(mpi_df):\n", + " # df['Start:ts_ns'] += deltas[i]\n", + " # df['End:ts_ns'] += deltas[i]\n", + " # mpi_df = pd.concat(mpi_df)\n", + " \n", + " #if t_metrics:\n", + "\n", + " #if t_openacc:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -758,227 +839,628 @@ " \n", " \n", " \n", - " Start (ns)\n", - " Duration:dur_ns\n", - " CorrID\n", - " GrdX\n", - " GrdY\n", - " GrdZ\n", - " BlkX\n", - " BlkY\n", - " BlkZ\n", - " Reg/Trd\n", - " ...\n", - " Throughput:thru_B\n", - " SrcMemKd\n", - " DstMemKd\n", - " Device\n", - " deviceid\n", " Pid\n", - " Ctx\n", - " GreenCtx\n", - " Strm\n", - " Name\n", + " Tid\n", + " thread\n", + " device\n", " \n", - " \n", - " \n", " \n", - " 0\n", - " 1271556323\n", - " 960\n", - " 688\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " 33333312.0\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (2)\n", - " 2\n", - " 2308062\n", - " 1\n", - " None\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", + " task\n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", " 1\n", - " 1626687401\n", - " 896\n", - " 1285\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 
NaN\n", - " NaN\n", - " ...\n", - " 142857088.0\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (2)\n", - " 2\n", - " 2308062\n", + " 2308061\n", + " {2308061}\n", " 1\n", - " None\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", + " 0\n", " \n", " \n", " 2\n", - " 1626718377\n", - " 736\n", - " 1291\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " 173912960.0\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (2)\n", - " 2\n", " 2308062\n", + " {2308062}\n", " 1\n", - " None\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", + " 2\n", " \n", " \n", " 3\n", - " 1626741640\n", - " 704\n", - " 1297\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " 181818112.0\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (2)\n", - " 2\n", - " 2308062\n", + " 2308065\n", + " {2308065}\n", " 1\n", - " None\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", - " \n", - " \n", - " 4\n", - " 1626968327\n", - " 736\n", - " 1327\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " 38043460.0\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (2)\n", - " 2\n", - " 2308062\n", " 1\n", - " None\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " Pid Tid thread device\n", + "task \n", + "1 2308061 {2308061} 1 0\n", + "2 2308062 {2308062} 1 2\n", + "3 2308065 {2308065} 1 1" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "compute_threads_with = []\n", + "compute_threads_with.append(cuda_api_df[['Pid', 'Tid']])\n", + "# if t_nvtx: compute_threads_with.append(nvtx_df[[\"Pid\", \"Tid\"]])\n", + "# if t_nvtx_startend: compute_threads_with.append(nvtx_startend_df[[\"Pid\", \"Tid\"]])\n", + "# if t_mpi: compute_threads_with.append(mpi_df[[\"Pid\", \"Tid\"]])\n", + "# if t_openacc: 
compute_threads_with.append(openacc_other_df[[\"Pid\", \"Tid\"]])\n", + "\n", + "t_mpi = False\n", + "threads = pd.concat(compute_threads_with).drop_duplicates()\n", + "if t_mpi:\n", + " threads[\"Rank\"] = threads[\"Pid\"].map(rank_info.set_index(\"Pid\")[\"rank\"])\n", + " threads.sort_values([\"Rank\"], inplace=True)\n", + "else:\n", + " threads.sort_values([\"Pid\"], inplace=True)\n", + "threads[\"thread\"] = threads.groupby([\"Pid\"]).cumcount() + 1\n", + "threads[\"task\"] = threads.groupby([\"Pid\"]).ngroup() + 1\n", + "threads[\"device\"] = threads[\"Pid\"].map(context_info[context_info[\"contextId\"] == 1].set_index(\"processId\")[\"deviceId\"])\n", + "#threads.sort_values([\"task\", \"thread\"], inplace=True)\n", + "threads.reset_index()\n", + "\n", + "tasks_set = threads.groupby([\"task\"]).agg({'Pid': 'first',\n", + " 'Tid': lambda x: set(x),\n", + " 'thread': 'count',\n", + " 'device': 'first' })\n", + "tasks_set" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DeviceStrmthreadtaskPid
..................................................................Piddeviceid
73171178893918413751937840665535.01.01.032.01.01.036.0...NaNNoneNoneNVIDIA H100 (2)2230806223080610NVIDIA H100 (0){16, 17, 18, 19}41None16mod_time_ops_adapt_dt_cfl_32_gpu2308061
73181179031552999840784073.01.01.0256.01.01.018.0...NaNNoneNone23080622NVIDIA H100 (2){16, 17, 18, 19}4223080621
23080651NVIDIA H100 (1){16, 17, 18}332308065
\n", + "
" + ], + "text/plain": [ + " Device Strm thread task Pid\n", + "Pid deviceid \n", + "2308061 0 NVIDIA H100 (0) {16, 17, 18, 19} 4 1 2308061\n", + "2308062 2 NVIDIA H100 (2) {16, 17, 18, 19} 4 2 2308062\n", + "2308065 1 NVIDIA H100 (1) {16, 17, 18} 3 3 2308065" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "streams = kernels_df[['Device', 'Strm', 'deviceid', 'Pid']].drop_duplicates()\n", + "streams[\"thread\"] = streams.groupby([\"Pid\", \"Device\"]).cumcount() + 1\n", + "#streams[\"deviceid\"] = streams.sort_values(\"Device\").groupby([\"Device\"]).ngroup()\n", + "#streams[\"Pid\"] = streams[\"deviceid\"].map(tasks_set.set_index(\"device\")[\"Pid\"])\n", + "streams[\"task\"] = streams[\"Pid\"].map(tasks_set.reset_index().set_index(\"Pid\")[\"task\"])\n", + "\n", + "streams['row_name'] = 'CUDA-D'+streams['deviceid'].astype(str) + '.S' + streams['Strm'].astype(str)\n", + "num_streams = streams.count().iloc[0]\n", + "streams.sort_values([\"Pid\", \"thread\"], inplace=True)\n", + "streams.reset_index(inplace=True)\n", + "\n", + "devices_set = streams.groupby([\"Pid\", \"deviceid\"]).agg({'Device': 'first',\n", + " 'Strm': lambda x: set(x),\n", + " 'thread': 'count',\n", + " 'task': 'first',\n", + " 'Pid': 'last'})\n", + "devices_set" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexDeviceStrmdeviceidPidthreadtaskrow_name
00NVIDIA H100 (0)160230806121CUDA-D0.S16
1190NVIDIA H100 (0)170230806131CUDA-D0.S17
2191NVIDIA H100 (0)180230806141CUDA-D0.S18
3204NVIDIA H100 (0)190230806151CUDA-D0.S19
40NVIDIA H100 (2)162230806222CUDA-D2.S16
5190NVIDIA H100 (2)172230806232CUDA-D2.S17
6191NVIDIA H100 (2)182230806242CUDA-D2.S18
7203NVIDIA H100 (2)192230806252CUDA-D2.S19
80NVIDIA H100 (1)161230806523CUDA-D1.S16
9190NVIDIA H100 (1)171230806533CUDA-D1.S17
10191NVIDIA H100 (1)181230806543CUDA-D1.S18
\n", + "
" + ], + "text/plain": [ + " index Device Strm deviceid Pid thread task row_name\n", + "0 0 NVIDIA H100 (0) 16 0 2308061 2 1 CUDA-D0.S16\n", + "1 190 NVIDIA H100 (0) 17 0 2308061 3 1 CUDA-D0.S17\n", + "2 191 NVIDIA H100 (0) 18 0 2308061 4 1 CUDA-D0.S18\n", + "3 204 NVIDIA H100 (0) 19 0 2308061 5 1 CUDA-D0.S19\n", + "4 0 NVIDIA H100 (2) 16 2 2308062 2 2 CUDA-D2.S16\n", + "5 190 NVIDIA H100 (2) 17 2 2308062 3 2 CUDA-D2.S17\n", + "6 191 NVIDIA H100 (2) 18 2 2308062 4 2 CUDA-D2.S18\n", + "7 203 NVIDIA H100 (2) 19 2 2308062 5 2 CUDA-D2.S19\n", + "8 0 NVIDIA H100 (1) 16 1 2308065 2 3 CUDA-D1.S16\n", + "9 190 NVIDIA H100 (1) 17 1 2308065 3 3 CUDA-D1.S17\n", + "10 191 NVIDIA H100 (1) 18 1 2308065 4 3 CUDA-D1.S18" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_normal_threads = tasks_set['thread']\n", + "num_normal_threads_repeated = num_normal_threads.repeat(devices_set[\"thread\"]).reset_index()[[\"thread\"]]\n", + "streams['thread'] = streams['thread'] + num_normal_threads_repeated[\"thread\"]\n", + "streams" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of original kernels_df: (21299, 25). 
Shape of result_df: (21299, 27).\n" + ] + }, + { + "ename": "KeyError", + "evalue": "'thread'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/Documents/BePPP/heka/tooling/env/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'thread'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[19], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShape of original kernels_df: 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkernels_df\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Shape of result_df: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult_df\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Copy the results back to kernels_df\u001b[39;00m\n\u001b[0;32m----> 7\u001b[0m kernels_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mthread\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mresult_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mthread\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mto_numpy()\n\u001b[1;32m 8\u001b[0m kernels_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtask\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m result_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtask\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mto_numpy()\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShape of new kernels_df: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkernels_df\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Documents/BePPP/heka/tooling/env/lib/python3.12/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m~/Documents/BePPP/heka/tooling/env/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'thread'" + ] + } + ], + "source": [ + "filtered_streams = streams.groupby([\"Pid\", \"Strm\"]).agg({'thread':'first', 'task':'first'}).reset_index()\n", + "# Now, merge the filtered streams DataFrame with kernels_df\n", + "result_df = kernels_df.merge(filtered_streams, how='left', on=[\"Pid\", 'Strm'])\n", + "print(f\"Shape of original kernels_df: {kernels_df.shape}. Shape of result_df: {result_df.shape}.\")\n", + "\n", + "# Copy the results back to kernels_df\n", + "kernels_df['thread'] = result_df['thread'].to_numpy()\n", + "kernels_df['task'] = result_df['task'].to_numpy()\n", + "print(f\"Shape of new kernels_df: {kernels_df.shape}.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -987,8 +1469,6 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -997,12 +1477,14 @@ " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1011,22 +1493,70 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1035,8 +1565,6 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -1045,145 +1573,329 @@ " \n", " \n", " \n", + " \n", + " \n", " \n", - " \n", - "
Start (ns)Duration:dur_nsCorrIDGrdXGrdYGrdZBlkXBlkYBlkZReg/Trd...DstMemKdDevicedeviceidPidCtxGreenCtxStrmNamethreadtask
202310256971039227223576NaNNaNNaNNaNNaNNaNNaN...PageableNVIDIA H100 (2)223080621None16[CUDA memcpy Device-to-Host]22
20241025704355111056902359081902.01.01.0128.01.01.032.0...NoneNVIDIA H100 (2)223080621None16mod_solver_conjgrad_imex_245_gpu22
202510258164089511682359581902.01.01.0128.01.01.026.0...NoneNVIDIA H100 (2)223080621None16mod_bc_routines_bc_fix_dirichlet_residual_293_gpu22
20261025822716176823598NaNNaNNaNNaNNaNNaNNaN...NoneNVIDIA H100 (2)223080621None16[CUDA memset]22
2027102582362818975632360165535.01.01.0128.01.01.056.0...NoneNVIDIA H100 (2)223080621None16mod_solver_conjgrad_imex_271_gpu22
20281025913502897632236021.01.01.0256.01.01.016.0...NoneNVIDIA H100 (2)223080621None16mod_time_ops_adapt_dt_cfl_32_gpu__redmod_solver_conjgrad_imex_271_gpu__red22
731911790426664227278409202910259241107227323604NaNNaNNaNNaNNaN...1760560.0DevicePageableNVIDIA H100 (2)2None16[CUDA memcpy Device-to-Host]22
73201179043773723047841020301025928593976823616NaNNaNNaNNaNNaN...1736108.0DevicePageableNoneNVIDIA H100 (2)223080621None16[CUDA memcpy Device-to-Host][CUDA memset]22
2031102592948364127012361965535.01.01.0128.01.01.040.0...NoneNVIDIA H100 (2)223080621None16mod_solver_conjgrad_imex_297_gpu22
20321025970846596064236201.01.01.0256.01.01.016.0...NoneNVIDIA H100 (2)223080621None16mod_solver_conjgrad_imex_297_gpu__red22
732111790448584224078411203310259813009227223622NaNNaNNaNNaNNaN...1785712.0DevicePageableNVIDIA H100 (2)2None16[CUDA memcpy Device-to-Host]22
\n", - "

7322 rows × 23 columns

\n", - "
" - ], - "text/plain": [ - " Start (ns) Duration:dur_ns CorrID GrdX GrdY GrdZ BlkX BlkY \\\n", - "0 1271556323 960 688 NaN NaN NaN NaN NaN \n", - "1 1626687401 896 1285 NaN NaN NaN NaN NaN \n", - "2 1626718377 736 1291 NaN NaN NaN NaN NaN \n", - "3 1626741640 704 1297 NaN NaN NaN NaN NaN \n", - "4 1626968327 736 1327 NaN NaN NaN NaN NaN \n", - "... ... ... ... ... ... ... ... ... \n", - "7317 11788939184 1375193 78406 65535.0 1.0 1.0 32.0 1.0 \n", - "7318 11790315529 99840 78407 3.0 1.0 1.0 256.0 1.0 \n", - "7319 11790426664 2272 78409 NaN NaN NaN NaN NaN \n", - "7320 11790437737 2304 78410 NaN NaN NaN NaN NaN \n", - "7321 11790448584 2240 78411 NaN NaN NaN NaN NaN \n", - "\n", - " BlkZ Reg/Trd ... Throughput:thru_B SrcMemKd DstMemKd \\\n", - "0 NaN NaN ... 33333312.0 Pageable Device \n", - "1 NaN NaN ... 142857088.0 Pageable Device \n", - "2 NaN NaN ... 173912960.0 Pageable Device \n", - "3 NaN NaN ... 181818112.0 Pageable Device \n", - "4 NaN NaN ... 38043460.0 Pageable Device \n", - "... ... ... ... ... ... ... \n", - "7317 1.0 36.0 ... NaN None None \n", - "7318 1.0 18.0 ... NaN None None \n", - "7319 NaN NaN ... 1760560.0 Device Pageable \n", - "7320 NaN NaN ... 1736108.0 Device Pageable \n", - "7321 NaN NaN ... 1785712.0 Device Pageable \n", - "\n", - " Device deviceid Pid Ctx GreenCtx Strm \\\n", - "0 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "1 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "2 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "3 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "4 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "... ... ... ... .. ... ... 
\n", - "7317 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "7318 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "7319 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "7320 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "7321 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "\n", - " Name \n", - "0 [CUDA memcpy Host-to-Device] \n", - "1 [CUDA memcpy Host-to-Device] \n", - "2 [CUDA memcpy Host-to-Device] \n", - "3 [CUDA memcpy Host-to-Device] \n", - "4 [CUDA memcpy Host-to-Device] \n", - "... ... \n", - "7317 mod_time_ops_adapt_dt_cfl_32_gpu \n", - "7318 mod_time_ops_adapt_dt_cfl_32_gpu__red \n", - "7319 [CUDA memcpy Device-to-Host] \n", - "7320 [CUDA memcpy Device-to-Host] \n", - "7321 [CUDA memcpy Device-to-Host] \n", - "\n", - "[7322 rows x 23 columns]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "kernels_df[2]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
nullStreamIdhwIdvmIdprocessIddeviceIdcontextIdparentContextIdisGreenContext
2034102598408817514842363481902.01.01.0128.01.01.045.0...NoneNVIDIA H100 (2)223080621None16mod_solver_conjgrad_imex_307_gpu22
07020351026060919714927923640245704.01.01.0128.01.01.036.0...NoneNVIDIA H100 (2)2230806212308065None16elem_diffu_full_diffusion_ijk_51_gpu22
203610260759628555842364281902.01.01.0128.01.01.016.0...NoneNVIDIA H100 (2)223080621None16elem_diffu_full_diffusion_ijk_52_gpu22
2037102608296437861560236461246149.01.01.096.01.01.093.0...NoneNVIDIA H100 (2)22308062100None16elem_diffu_full_diffusion_ijk_60_gpu22
2038102687079398192236513329.01.01.0128.01.01.016.0...NoneNVIDIA H100 (2)223080621None16mod_comms_fill_sendbuffer_real_239_gpu22
2039102687273313392236553329.01.01.0128.01.01.016.0...NoneNVIDIA H100 (2)223080621None16mod_comms_fill_sendbuffer_real_246_gpu22
204010268754723467223663NaNNaNNaNNaNNaNNaNNaN...DeviceNVIDIA H100 (2)223080621None17[CUDA memcpy Peer-to-Peer]32
204110268759875332823669NaNNaNNaNNaNNaNNaNNaN...DeviceNVIDIA H100 (2)223080621None19[CUDA memcpy Peer-to-Peer]52
204210268769987556823688NaNNaNNaNNaNNaNNaNNaN...DeviceNVIDIA H100 (2)223080621None18[CUDA memcpy Peer-to-Peer]42
\n", + "

20 rows × 25 columns

\n", "
" ], "text/plain": [ - " nullStreamId hwId vmId processId deviceId contextId parentContextId \\\n", - "0 7 0 1 2308065 1 1 0 \n", + " Start (ns) Duration:dur_ns CorrID GrdX GrdY GrdZ BlkX \\\n", + "2023 10256971039 2272 23576 NaN NaN NaN NaN \n", + "2024 10257043551 1105690 23590 81902.0 1.0 1.0 128.0 \n", + "2025 10258164089 51168 23595 81902.0 1.0 1.0 128.0 \n", + "2026 10258227161 768 23598 NaN NaN NaN NaN \n", + "2027 10258236281 897563 23601 65535.0 1.0 1.0 128.0 \n", + "2028 10259135028 97632 23602 1.0 1.0 1.0 256.0 \n", + "2029 10259241107 2273 23604 NaN NaN NaN NaN \n", + "2030 10259285939 768 23616 NaN NaN NaN NaN \n", + "2031 10259294836 412701 23619 65535.0 1.0 1.0 128.0 \n", + "2032 10259708465 96064 23620 1.0 1.0 1.0 256.0 \n", + "2033 10259813009 2272 23622 NaN NaN NaN NaN \n", + "2034 10259840881 751484 23634 81902.0 1.0 1.0 128.0 \n", + "2035 10260609197 149279 23640 245704.0 1.0 1.0 128.0 \n", + "2036 10260759628 55584 23642 81902.0 1.0 1.0 128.0 \n", + "2037 10260829643 7861560 23646 1246149.0 1.0 1.0 96.0 \n", + "2038 10268707939 8192 23651 3329.0 1.0 1.0 128.0 \n", + "2039 10268727331 3392 23655 3329.0 1.0 1.0 128.0 \n", + "2040 10268754723 4672 23663 NaN NaN NaN NaN \n", + "2041 10268759875 3328 23669 NaN NaN NaN NaN \n", + "2042 10268769987 5568 23688 NaN NaN NaN NaN \n", + "\n", + " BlkY BlkZ Reg/Trd ... DstMemKd Device deviceid Pid \\\n", + "2023 NaN NaN NaN ... Pageable NVIDIA H100 (2) 2 2308062 \n", + "2024 1.0 1.0 32.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2025 1.0 1.0 26.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2026 NaN NaN NaN ... None NVIDIA H100 (2) 2 2308062 \n", + "2027 1.0 1.0 56.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2028 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2029 NaN NaN NaN ... Pageable NVIDIA H100 (2) 2 2308062 \n", + "2030 NaN NaN NaN ... None NVIDIA H100 (2) 2 2308062 \n", + "2031 1.0 1.0 40.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2032 1.0 1.0 16.0 ... 
None NVIDIA H100 (2) 2 2308062 \n", + "2033 NaN NaN NaN ... Pageable NVIDIA H100 (2) 2 2308062 \n", + "2034 1.0 1.0 45.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2035 1.0 1.0 36.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2036 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2037 1.0 1.0 93.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2038 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2039 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2040 NaN NaN NaN ... Device NVIDIA H100 (2) 2 2308062 \n", + "2041 NaN NaN NaN ... Device NVIDIA H100 (2) 2 2308062 \n", + "2042 NaN NaN NaN ... Device NVIDIA H100 (2) 2 2308062 \n", + "\n", + " Ctx GreenCtx Strm Name \\\n", + "2023 1 None 16 [CUDA memcpy Device-to-Host] \n", + "2024 1 None 16 mod_solver_conjgrad_imex_245_gpu \n", + "2025 1 None 16 mod_bc_routines_bc_fix_dirichlet_residual_293_gpu \n", + "2026 1 None 16 [CUDA memset] \n", + "2027 1 None 16 mod_solver_conjgrad_imex_271_gpu \n", + "2028 1 None 16 mod_solver_conjgrad_imex_271_gpu__red \n", + "2029 1 None 16 [CUDA memcpy Device-to-Host] \n", + "2030 1 None 16 [CUDA memset] \n", + "2031 1 None 16 mod_solver_conjgrad_imex_297_gpu \n", + "2032 1 None 16 mod_solver_conjgrad_imex_297_gpu__red \n", + "2033 1 None 16 [CUDA memcpy Device-to-Host] \n", + "2034 1 None 16 mod_solver_conjgrad_imex_307_gpu \n", + "2035 1 None 16 elem_diffu_full_diffusion_ijk_51_gpu \n", + "2036 1 None 16 elem_diffu_full_diffusion_ijk_52_gpu \n", + "2037 1 None 16 elem_diffu_full_diffusion_ijk_60_gpu \n", + "2038 1 None 16 mod_comms_fill_sendbuffer_real_239_gpu \n", + "2039 1 None 16 mod_comms_fill_sendbuffer_real_246_gpu \n", + "2040 1 None 17 [CUDA memcpy Peer-to-Peer] \n", + "2041 1 None 19 [CUDA memcpy Peer-to-Peer] \n", + "2042 1 None 18 [CUDA memcpy Peer-to-Peer] \n", + "\n", + " thread task \n", + "2023 2 2 \n", + "2024 2 2 \n", + "2025 2 2 \n", + "2026 2 2 \n", + "2027 2 2 \n", + "2028 2 2 \n", + "2029 2 2 \n", + "2030 2 2 \n", + "2031 2 2 \n", + "2032 2 2 \n", + 
"2033 2 2 \n", + "2034 2 2 \n", + "2035 2 2 \n", + "2036 2 2 \n", + "2037 2 2 \n", + "2038 2 2 \n", + "2039 2 2 \n", + "2040 3 2 \n", + "2041 5 2 \n", + "2042 4 2 \n", "\n", - " isGreenContext \n", - "0 0 " + "[20 rows x 25 columns]" ] }, - "execution_count": 7, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "context_info = pd.read_sql_table(\"TARGET_INFO_CUDA_CONTEXT_INFO\", f\"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite\")\n", - "context_info" + "kernels_df.iloc[16000:16020]" ] }, { -- GitLab From ac26f72a3ef001bc6e538b45e490b23a467292f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Mon, 7 Oct 2024 19:20:18 +0200 Subject: [PATCH 6/9] Changes for dev prerelease --- nsys2prv/parse_nsys_stats.py | 1 + pyproject.toml | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 80f763b..3fc1386 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -285,6 +285,7 @@ def main(): gpu_metrics_agg = [] if t_metrics: if MULTIREPORT: + raise NotImplementedError("Translating GPU metrics information for multi-report is not yet supported!") for REPORT_FILE_I in REPORTS_LIST: ksi = GPUMetricsSemantic(REPORT_FILE_I) ksi.Setup() diff --git a/pyproject.toml b/pyproject.toml index 5b3f160..d0827c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,13 @@ [tool.poetry] name = "nsys2prv" -version = "0.3.1" +version = "0.4.0-dev20241007" description = "Translate a NVIDIA Nsight System trace to a Paraver trace" authors = ["Marc Clascà "] readme = "README.md" license = "GPL-3.0-only" include = [ - "cfgs" + "cfgs", + "docs" ] repository = "https://pm.bsc.es/gitlab/beppp/nsys2prv" homepage = "https://pm.bsc.es/gitlab/beppp/nsys2prv" -- GitLab From 8f9423ed71871497ad63e9130367b9ba0bf5175d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Mon, 14 Oct 2024 19:13:45 +0200 Subject: [PATCH 7/9] 
Adds trycatch for MPI and solves error for single report --- nsys2prv/parse_nsys_stats.py | 109 ++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 3fc1386..54c95c8 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -57,7 +57,7 @@ def main(): REPORT_DIRS_LIST = [os.path.dirname(x) for x in REPORTS_LIST] REPORT_FILE = REPORTS_LIST[0] # For fast checks, it's best to have a reference report else: - REPORT_FILE = os.path.abspath(args.source_rep.first) + REPORT_FILE = os.path.abspath(args.source_rep[0]) REPORT_DIR = os.path.dirname(REPORT_FILE) trace_name = args.output @@ -228,57 +228,62 @@ def main(): if t_mpi: mpi_df = [] - if MULTIREPORT: - for REPORT_FILE_I in REPORTS_LIST: - kp2pi = MPIP2PSemantic(REPORT_FILE_I) - kp2pi.Setup() - kp2pi.load_data() - - kcolli = MPICollSemantic(REPORT_FILE_I) - kcolli.Setup() - kcolli.load_data() - - kotheri = MPIOtherSemantic(REPORT_FILE_I) - kotheri.Setup() - kotheri.load_data() - - krmai = MPIRMASemantic(REPORT_FILE_I) - krmai.Setup() - krmai.load_data() - - kioi = MPIIOPSemantic(REPORT_FILE_I) - kioi.Setup() - kioi.load_data() - - mpi_df.append(pd.concat([kp2pi.get_df(), kcolli.get_df(), kotheri.get_df(), kotheri.get_df(), krmai.get_df(), kioi.get_df()], ignore_index=True)) - del kp2pi, kcolli, kotheri, krmai, kioi - else: - kmpi = MPIP2PSemantic(REPORT_FILE) - kmpi.Setup() - kmpi.load_data() - mpi_p2p_df = kmpi.get_df() - - kmpi = MPICollSemantic(REPORT_FILE) - kmpi.Setup() - kmpi.load_data() - mpi_coll_df = kmpi.get_df() - - kmpi = MPIOtherSemantic(REPORT_FILE) - kmpi.Setup() - kmpi.load_data() - mpi_other_df = kmpi.get_df() - - kmpi = MPIRMASemantic(REPORT_FILE) - kmpi.Setup() - kmpi.load_data() - mpi_rma_df = kmpi.get_df() - - kmpi = MPIIOPSemantic(REPORT_FILE) - kmpi.Setup() - kmpi.load_data() - mpi_io_df = kmpi.get_df() - mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, 
mpi_rma_df, mpi_io_df], ignore_index=True) - del kmpi, mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df + try: + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + kp2pi = MPIP2PSemantic(REPORT_FILE_I) + kp2pi.Setup() + kp2pi.load_data() + + kcolli = MPICollSemantic(REPORT_FILE_I) + kcolli.Setup() + kcolli.load_data() + + kotheri = MPIOtherSemantic(REPORT_FILE_I) + kotheri.Setup() + kotheri.load_data() + + krmai = MPIRMASemantic(REPORT_FILE_I) + krmai.Setup() + krmai.load_data() + + kioi = MPIIOPSemantic(REPORT_FILE_I) + kioi.Setup() + kioi.load_data() + + mpi_df.append(pd.concat([kp2pi.get_df(), kcolli.get_df(), kotheri.get_df(), kotheri.get_df(), krmai.get_df(), kioi.get_df()], ignore_index=True)) + del kp2pi, kcolli, kotheri, krmai, kioi + else: + kmpi = MPIP2PSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_p2p_df = kmpi.get_df() + + kmpi = MPICollSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_coll_df = kmpi.get_df() + + kmpi = MPIOtherSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_other_df = kmpi.get_df() + + kmpi = MPIRMASemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_rma_df = kmpi.get_df() + + kmpi = MPIIOPSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_io_df = kmpi.get_df() + mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df], ignore_index=True) + del kmpi, mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df + except OperationalError as oe: + print("There has been a problem fetching MPI information. MPI data will be skipped.") + print(f"[ERROR]: {oe.args[0]}") + t_mpi = False else: mpi_df = pd.DataFrame() -- GitLab From 18a46502c8d6fcd2aff8a139c6ebeecaa77fb796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Wed, 6 Nov 2024 17:22:26 +0100 Subject: [PATCH 8/9] Adds support for GPU metrics multi report merging when each report contains metrics for random GPU ids not corresponding to self process model. 
This still not works for multi-node multi-report traces, where multiple devices with same id might appear. --- .vscode/launch.json | 32 ++++++++++++++++++ nsys2prv/parse_nsys_stats.py | 38 +++++++++++++--------- nsys2prv/semantics/gpu_metrics_semantic.py | 13 +++++--- 3 files changed, 64 insertions(+), 19 deletions(-) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..95c3117 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,32 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File with Arguments", + "type": "debugpy", + "request": "launch", + "program": "nsys2prv/parse_nsys_stats.py", + "console": "integratedTerminal", + "args": [ + "-t", + "cuda_api_trace,mpi_event_trace,gpu_metrics", + "-m", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_0.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_1.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_2.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_3.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_4.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_5.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_6.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_7.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_ricardo_metrics_4nodes_more" + ], + "env": { + "NSYS_HOME": "/home/mclasca/Apps/nsight-system/2024.5.1" + } + } + ] +} \ No newline at end of file diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 54c95c8..d96b3ae 100755 --- a/nsys2prv/parse_nsys_stats.py +++ 
b/nsys2prv/parse_nsys_stats.py @@ -11,10 +11,10 @@ import locale from functools import reduce from sqlalchemy import create_engine, text, dialects from sqlalchemy.exc import OperationalError -from .EventWriter import event_writer as ewr -from .NSYSInterface import NSYSInterface -from .semantics.mpi_event_encoding import * -from .semantics import * +from EventWriter import event_writer as ewr +from NSYSInterface import NSYSInterface +from semantics.mpi_event_encoding import * +from semantics import * def main(): locale.setlocale(locale.LC_ALL, '') @@ -288,20 +288,23 @@ def main(): mpi_df = pd.DataFrame() gpu_metrics_agg = [] + metrics_event_names = [] if t_metrics: if MULTIREPORT: - raise NotImplementedError("Translating GPU metrics information for multi-report is not yet supported!") + #raise NotImplementedError("Translating GPU metrics information for multi-report is not yet supported!") for REPORT_FILE_I in REPORTS_LIST: ksi = GPUMetricsSemantic(REPORT_FILE_I) ksi.Setup() ksi.load_data() gpu_metrics_agg.append(ksi.get_df()) + metrics_event_names.append(ksi.get_names()) del ksi else: ks = GPUMetricsSemantic(REPORT_FILE) ks.Setup() ks.load_data() gpu_metrics_agg = ks.get_df() + metrics_event_names = ks.get_names() del ks if t_openacc: @@ -352,7 +355,7 @@ def main(): context_info.sort_values(["processId"], inplace=True) # CONTEXT INFO CHECK FOR MULTIREPORT - if context_info["deviceId"].unique().size == 0: + if context_info["deviceId"].unique().size == 1: print(f"\033[93m Warning! Only one unique device ID can be detected in resource identification. If this is not intended, some features will not be available. Please, make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. 
\033[00m") if t_mpi: @@ -379,7 +382,6 @@ def main(): for i, df in enumerate(kernels_df): df['Start (ns)'] += deltas[i] kernels_df = pd.concat(kernels_df, ignore_index=True) - print(f"After concat: {kernels_df.shape}") if t_apicalls: for i, df in enumerate(cuda_api_df): @@ -419,9 +421,16 @@ def main(): openacc_launch_df = pd.concat(openacc_launch_df, ignore_index=True) openacc_data_df = pd.concat(openacc_data_df, ignore_index=True) - #if t_metrics: - - + if t_metrics: + for i, df in enumerate(gpu_metrics_agg): + if not df.empty: + df['timestamp'] += deltas[i] + # Complement with processId info + #df['Pid'] = list_contexts[i].iloc[0]['processId'] # Assuming one report per GPU device. This needs to be improved. + + df['Pid'] = df['deviceId'].map(context_info.set_index("deviceId")["processId"]) + gpu_metrics_agg = pd.concat(gpu_metrics_agg, ignore_index=True) + metrics_event_names = pd.concat(metrics_event_names, ignore_index=True).drop_duplicates() # MARK: PROCESS MODEL @@ -530,8 +539,7 @@ def main(): devices_set = streams.groupby(["Pid", "deviceid"]).agg({'Device': 'first', 'Strm': lambda x: set(x), 'thread': 'count', - 'task': 'first', - 'Pid': 'last'}) + 'task': 'first'}) print(devices_set) # Here we finally update the threadId we are going to put in the event record of kernel executions to respect the normal threads before CUDA streams @@ -557,12 +565,12 @@ def main(): # Add auxiliary stream to streams dataframe if t_metrics: - aux_streams = devices_set.reset_index()[["deviceid", "Device", "thread", "task"]] + aux_streams = devices_set.reset_index()[["deviceid", "Device", "thread", "task", "Pid"]] aux_streams["Strm"] = 99 aux_streams["row_name"] = "Metrics GPU"+aux_streams["deviceid"].astype(str) - aux_streams["Pid"] = aux_streams["deviceid"].map(tasks_set.set_index('device')["Pid"]) + #aux_streams["Pid"] = aux_streams["deviceid"].map(tasks_set.set_index('device')["Pid"]) aux_streams["thread"] = aux_streams["thread"] + 
aux_streams["deviceid"].map(tasks_set.set_index('device')['thread']) + 1 - gpu_metrics_agg["task"] = gpu_metrics_agg["deviceId"].map(devices_set["task"]) + gpu_metrics_agg["task"] = gpu_metrics_agg["Pid"].map(devices_set.reset_index().set_index("Pid")["task"]) gpu_metrics_agg["thread"] = gpu_metrics_agg["task"].map(aux_streams.set_index('task')["thread"]) streams = pd.concat([streams, aux_streams]).sort_values(['task', 'thread']) diff --git a/nsys2prv/semantics/gpu_metrics_semantic.py b/nsys2prv/semantics/gpu_metrics_semantic.py index 097682a..6776bb4 100644 --- a/nsys2prv/semantics/gpu_metrics_semantic.py +++ b/nsys2prv/semantics/gpu_metrics_semantic.py @@ -1,5 +1,5 @@ from .nsys_event import NsysEvent -from pandas import read_sql_table +from pandas import read_sql_table, DataFrame from sqlalchemy import text event_type_metrics_base = 9400 @@ -7,22 +7,27 @@ event_type_metrics_base = 9400 class GPUMetricsSemantic(NsysEvent): def __init__(self, report) -> None: + self.metrics_event_names = DataFrame() super().__init__(report) def Setup(self): if self.check_table("GPU_METRICS"): self.query = text("SELECT * FROM GPU_METRICS") else: + self._empty = True def _preprocess(self): metrics_description = read_sql_table("TARGET_INFO_GPU_METRICS", self._dbcon) self._df.drop(self._df[self._df["timestamp"] < 0].index, inplace=True) # drop negative time - metrics_event_names = metrics_description.groupby(["metricId"]).agg({'metricName': 'first'}).reset_index() - metrics_event_names["metricId"] = metrics_event_names["metricId"] + event_type_metrics_base + self.metrics_event_names = metrics_description.groupby(["metricId"]).agg({'metricName': 'first'}).reset_index() + self.metrics_event_names["metricId"] = self.metrics_event_names["metricId"] + event_type_metrics_base self._df["deviceId"] = self._df["typeId"].apply(lambda x: x & 0xFF) self._df = self._df.groupby(["timestamp", "typeId"]).agg({'metricId': lambda x: list(x+event_type_metrics_base), 'value': lambda x: list(x), 
'deviceId': 'first'}) self._df.reset_index(inplace=True) - return super()._preprocess() \ No newline at end of file + return super()._preprocess() + + def get_names(self): + return self.metrics_event_names \ No newline at end of file -- GitLab From 99a4c89b4e21193111744053ae9417a8d329fdf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Wed, 6 Nov 2024 21:32:19 +0100 Subject: [PATCH 9/9] Solves GPU metrics merging for multireport and multinode systems. Solves #2 --- nsys2prv/parse_nsys_stats.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index d96b3ae..48858fe 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -11,10 +11,10 @@ import locale from functools import reduce from sqlalchemy import create_engine, text, dialects from sqlalchemy.exc import OperationalError -from EventWriter import event_writer as ewr -from NSYSInterface import NSYSInterface -from semantics.mpi_event_encoding import * -from semantics import * +from .EventWriter import event_writer as ewr +from .NSYSInterface import NSYSInterface +from .semantics.mpi_event_encoding import * +from .semantics import * def main(): locale.setlocale(locale.LC_ALL, '') @@ -159,12 +159,6 @@ def main(): # MARK: IMPORT DATASETS print("Importing datasets...") - # kernels_df = pd.read_csv(build_nsys_stats_name("cuda_gpu_trace")) - # kernels_df.rename(columns={"CorrId": "CorrID"}, inplace=True) - # with engine.connect() as conn, conn.begin(): - # with open(os.path.join(os.path.dirname(__file__), 'scripts/kernels.sql'), 'r') as query: - # kernels_df = pd.read_sql_query(text(query.read()), conn) - kernels_df = [] if MULTIREPORT: sum = 0 @@ -291,7 +285,6 @@ def main(): metrics_event_names = [] if t_metrics: if MULTIREPORT: - #raise NotImplementedError("Translating GPU metrics information for multi-report is not yet supported!") for REPORT_FILE_I in REPORTS_LIST: 
ksi = GPUMetricsSemantic(REPORT_FILE_I) ksi.Setup() @@ -345,9 +338,14 @@ def main(): # MARK: CONTEXT INFO list_contexts = [] + list_hostnames = [] if MULTIREPORT: for REPORT_FILE_I in REPORTS_LIST: context_info_i = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite") + target_system_env_i = pd.read_sql_table("TARGET_INFO_SYSTEM_ENV", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite") + hostname = target_system_env_i.loc[target_system_env_i["name"] == "Hostname"]["value"].iloc[0] + context_info_i["hostname"] = hostname + list_hostnames.append(hostname) list_contexts.append(context_info_i) context_info = pd.concat(list_contexts) else: @@ -355,6 +353,7 @@ def main(): context_info.sort_values(["processId"], inplace=True) # CONTEXT INFO CHECK FOR MULTIREPORT + #if context_info.groupby(["hostname"]).agg({'deviceId': 'count'}) if context_info["deviceId"].unique().size == 1: print(f"\033[93m Warning! Only one unique device ID can be detected in resource identification. If this is not intended, some features will not be available. Please, make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. \033[00m") @@ -425,10 +424,8 @@ def main(): for i, df in enumerate(gpu_metrics_agg): if not df.empty: df['timestamp'] += deltas[i] - # Complement with processId info - #df['Pid'] = list_contexts[i].iloc[0]['processId'] # Assuming one report per GPU device. This needs to be improved. 
- - df['Pid'] = df['deviceId'].map(context_info.set_index("deviceId")["processId"]) + # Complement with processId and node info + df['Pid'] = df['deviceId'].map(context_info[context_info["hostname"] == list_hostnames[i]].set_index("deviceId")["processId"]) gpu_metrics_agg = pd.concat(gpu_metrics_agg, ignore_index=True) metrics_event_names = pd.concat(metrics_event_names, ignore_index=True).drop_duplicates() @@ -569,7 +566,7 @@ def main(): aux_streams["Strm"] = 99 aux_streams["row_name"] = "Metrics GPU"+aux_streams["deviceid"].astype(str) #aux_streams["Pid"] = aux_streams["deviceid"].map(tasks_set.set_index('device')["Pid"]) - aux_streams["thread"] = aux_streams["thread"] + aux_streams["deviceid"].map(tasks_set.set_index('device')['thread']) + 1 + aux_streams["thread"] = aux_streams["thread"] + aux_streams["Pid"].map(tasks_set.set_index('Pid')['thread']) + 1 gpu_metrics_agg["task"] = gpu_metrics_agg["Pid"].map(devices_set.reset_index().set_index("Pid")["task"]) gpu_metrics_agg["thread"] = gpu_metrics_agg["task"].map(aux_streams.set_index('task')["thread"]) streams = pd.concat([streams, aux_streams]).sort_values(['task', 'thread']) -- GitLab