From f9057840bfade6f7292dfb67d8eea7e286eaf3b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Fri, 20 Sep 2024 17:39:20 +0200 Subject: [PATCH 1/9] Adds CLI support for multinode. Adds logic to export data for multiple reports. To do this, logic to interact with the nsys binary has been refactored to a new class. --- nsys2prv/NSYSInterface.py | 60 ++++++++++++++++++++++++++ nsys2prv/parse_nsys_stats.py | 81 +++++++++++++----------------------- 2 files changed, 89 insertions(+), 52 deletions(-) create mode 100644 nsys2prv/NSYSInterface.py diff --git a/nsys2prv/NSYSInterface.py b/nsys2prv/NSYSInterface.py new file mode 100644 index 0000000..29a895c --- /dev/null +++ b/nsys2prv/NSYSInterface.py @@ -0,0 +1,60 @@ +import subprocess +import os + +class NSYSInterface(): + + def __init__(self, types, filter_nvtx, range_nvtx, force_sqlite): + self.use_path = True + self.nsys_binary = ("nsys",) + + if 'NSYS_HOME' in os.environ: + self.NSYS_HOME = os.path.abspath(os.getenv('NSYS_HOME')) + self.use_path = False + + if self.use_path: + self.nsys_binary = ("nsys",) + else: + self.nsys_binary = (os.path.join(self.NSYS_HOME, "bin/nsys"),) + + self.types = types + self.filter = filter_nvtx + self.range_nvtx = range_nvtx + self.force = force_sqlite + + def check_export_report(self, rf): + if not os.path.exists(f"{os.path.splitext(os.path.basename(rf))[0]}.sqlite") or self.force: + #Try exporting first + export_call = self.nsys_binary + ("export", "-t", "sqlite", rf) + try: + with subprocess.Popen(export_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p: + for line in p.stdout: + print(line.decode(), end='') + + if p.returncode != 0: + raise ChildProcessError(p.returncode, p.args) + except FileNotFoundError: + print("You don't have an Nsight Systems installation in your PATH. 
Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.") + exit(1) + except ChildProcessError: + print("Could not export SQLite database. Exiting.") + exit(1) + + def call_stats(self, report): + nsys_call = self.nsys_binary + ("stats", "-r", ",".join(self.types), + "--timeunit", "nsec", "-f", "csv", + "--force-overwrite", "true", "-o", ".") + if self.filter: + nsys_call += ("--filter-nvtx="+self.range_nvtx,) + + nsys_call += (report,) + + try: + with subprocess.Popen(nsys_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p: + for line in p.stdout: + print(line.decode(), end='') + + if p.returncode != 0: + raise ChildProcessError(p.returncode, p.args) + except FileNotFoundError: + print("You don't have an Nsight Systems installation in your PATH. Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.") + exit(1) \ No newline at end of file diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 8a25660..01882df 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -11,9 +11,9 @@ import locale from sqlalchemy import create_engine, text, dialects from sqlalchemy.exc import OperationalError from .EventWriter import event_writer as ewr +from .NSYSInterface import NSYSInterface from .semantics.mpi_event_encoding import * - def main(): locale.setlocale(locale.LC_ALL, '') @@ -31,6 +31,7 @@ def main(): parser.add_argument("-v", "--version", nargs=0, help="Show version and exit.", action=ShowVersion) parser.add_argument("-f", "--filter-nvtx", help="Filter by this NVTX range") parser.add_argument("-t", "--trace", help="Comma separated names of events to translate: [mpi_event_trace, nvtx_pushpop_trace, nvtx_startend_trace, cuda_api_trace, gpu_metrics, openacc]") + parser.add_argument("-m", "--multi-report", action="store_true", help="Translate multiple reports of the same execution into one trace.") 
parser.add_argument("--force-sqlite", action="store_true", help="Force Nsight System to export SQLite database") @@ -39,23 +40,24 @@ def main(): #parser.add_argument("-n", "--nvtx-stack-range", nargs=2, type=int) - parser.add_argument("source_rep", help="Nsight source report file") + parser.add_argument("source_rep", nargs="+", help="Nsight source report file") parser.add_argument("output", help="Paraver output trace name") args = parser.parse_args() # # Trace configuration and setup - - use_path = True - - if 'NSYS_HOME' in os.environ: - NSYS_HOME = os.path.abspath(os.getenv('NSYS_HOME')) - use_path = False PARAVER_HOME = os.getenv('PARAVER_HOME') - REPORT_FILE = os.path.abspath(args.source_rep) - REPORT_DIR = os.path.dirname(REPORT_FILE) + MULTIREPORT = args.multi_report + if MULTIREPORT: + REPORTS_LIST = [os.path.abspath(x) for x in args.source_rep] + REPORT_DIRS_LIST = [os.path.dirname(x) for x in REPORTS_LIST] + REPORT_FILE = REPORTS_LIST[0] # For fast checks, it's best to have a reference report + else: + REPORT_FILE = os.path.abspath(args.source_rep.first) + REPORT_DIR = os.path.dirname(REPORT_FILE) + trace_name = args.output NVTX_FILTER = args.filter_nvtx != None @@ -135,28 +137,18 @@ def main(): return os.path.join(REPORT_DIR, base_name+"_{}.csv".format(report_name)) + nsi = NSYSInterface(reports, NVTX_FILTER, NVTX_RANGE, args.force_sqlite) + + if MULTIREPORT: + print(f"Multiple reports provided: {REPORTS_LIST}") print("Extracting reports for: {}".format(reports_og)) - if use_path: - nsys_binary = ("nsys",) - else: - nsys_binary = (os.path.join(NSYS_HOME, "bin/nsys"),) - if not os.path.exists(f"{os.path.splitext(os.path.basename(REPORT_FILE))[0]}.sqlite"): - #Try exporting first - export_call = nsys_binary + ("export", "-t", "sqlite", REPORT_FILE) - try: - with subprocess.Popen(export_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p: - for line in p.stdout: - print(line.decode(), end='') - - if p.returncode != 0: - raise 
ChildProcessError(p.returncode, p.args) - except FileNotFoundError: - print("You don't have an Nsight Systems installation in your PATH. Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.") - exit(1) - except ChildProcessError: - print("Could not export SQLite database. Exiting.") - exit(1) + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + print(f"Exporting SQLite databse for {os.path.basename(REPORT_FILE_I)}") + nsi.check_export_report(REPORT_FILE_I) + else: + nsi.check_export_report(REPORT_FILE) engine = create_engine(f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") metadata = pd.read_sql_table("META_DATA_EXPORT", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") @@ -164,29 +156,14 @@ def main(): if int(minor_version["value"].iloc[0]) > 11: print(f"\033[93m Warning! The SQLite schema version {int(minor_version["value"].iloc[0])} is greater than the one supported (11). If unexpected behaviour occurs, please report it. \033[00m") - nsys_call = nsys_binary + ("stats", "-r", ",".join(reports), - "--timeunit", "nsec", "-f", "csv", - "--force-overwrite", "true", "-o", ".") - - if NVTX_FILTER: - nsys_call += ("--filter-nvtx="+NVTX_RANGE,) - - if args.force_sqlite: - nsys_call += ("--force-export", "true") - - nsys_call += (REPORT_FILE,) - - try: - with subprocess.Popen(nsys_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p: - for line in p.stdout: - print(line.decode(), end='') - - if p.returncode != 0: - raise ChildProcessError(p.returncode, p.args) - except FileNotFoundError: - print("You don't have an Nsight Systems installation in your PATH. 
Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.") - exit(1) + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + print(f"Processing stats for {os.path.basename(REPORT_FILE_I)}") + nsi.call_stats(REPORT_FILE_I) + else: + nsi.call_stats(REPORT_FILE) + assert(False) print("Importing datasets...") # kernels_df = pd.read_csv(build_nsys_stats_name("cuda_gpu_trace")) -- GitLab From 7e8dc181eb6776dbbd8d085aeaf9d4ecb8093873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Mon, 23 Sep 2024 18:59:30 +0200 Subject: [PATCH 2/9] Collection of events dataframes now is offloaded to custom classes that contain a setup function for SQL data loading and data preprocessing. Added collection of events for multiple reports (WIP). --- nsys2prv/NSYSInterface.py | 9 +- nsys2prv/parse_nsys_stats.py | 220 ++++++++++++------- nsys2prv/semantics/__init__.py | 4 + nsys2prv/semantics/gpu_metrics_semantic.py | 28 +++ nsys2prv/semantics/kernels_semantic.py | 12 + nsys2prv/semantics/mpi_semantic.py | 78 +++++++ nsys2prv/semantics/nsys_event.py | 68 ++++++ nsys2prv/semantics/nvtx_startend_semantic.py | 11 + 8 files changed, 345 insertions(+), 85 deletions(-) create mode 100644 nsys2prv/semantics/__init__.py create mode 100644 nsys2prv/semantics/gpu_metrics_semantic.py create mode 100644 nsys2prv/semantics/kernels_semantic.py create mode 100644 nsys2prv/semantics/mpi_semantic.py create mode 100644 nsys2prv/semantics/nsys_event.py create mode 100644 nsys2prv/semantics/nvtx_startend_semantic.py diff --git a/nsys2prv/NSYSInterface.py b/nsys2prv/NSYSInterface.py index 29a895c..9034cf7 100644 --- a/nsys2prv/NSYSInterface.py +++ b/nsys2prv/NSYSInterface.py @@ -57,4 +57,11 @@ class NSYSInterface(): raise ChildProcessError(p.returncode, p.args) except FileNotFoundError: print("You don't have an Nsight Systems installation in your PATH. 
Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.") - exit(1) \ No newline at end of file + exit(1) + + def build_nsys_stats_name(self, rf, rd, report_name): + base_name = os.path.splitext(os.path.basename(rf))[0] + if self.filter: + return os.path.join(rd, base_name+"_{}_nvtx={}.csv".format(report_name, self.range_nvtx)) + else: + return os.path.join(rd, base_name+"_{}.csv".format(report_name)) \ No newline at end of file diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 01882df..5d108e0 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -13,6 +13,7 @@ from sqlalchemy.exc import OperationalError from .EventWriter import event_writer as ewr from .NSYSInterface import NSYSInterface from .semantics.mpi_event_encoding import * +from .semantics import * def main(): locale.setlocale(locale.LC_ALL, '') @@ -128,15 +129,6 @@ def main(): nvtx_stack_top = 1 nvtx_stack_bottom = 4 - - def build_nsys_stats_name(report_name): - base_name = os.path.splitext(os.path.basename(REPORT_FILE))[0] - if NVTX_FILTER: - return os.path.join(REPORT_DIR, base_name+"_{}_nvtx={}.csv".format(report_name, NVTX_RANGE)) - else: - return os.path.join(REPORT_DIR, base_name+"_{}.csv".format(report_name)) - - nsi = NSYSInterface(reports, NVTX_FILTER, NVTX_RANGE, args.force_sqlite) if MULTIREPORT: @@ -163,103 +155,139 @@ def main(): else: nsi.call_stats(REPORT_FILE) - assert(False) + #assert(False) print("Importing datasets...") # kernels_df = pd.read_csv(build_nsys_stats_name("cuda_gpu_trace")) # kernels_df.rename(columns={"CorrId": "CorrID"}, inplace=True) - with engine.connect() as conn, conn.begin(): - with open(os.path.join(os.path.dirname(__file__), 'scripts/kernels.sql'), 'r') as query: - kernels_df = pd.read_sql_query(text(query.read()), conn) + # with engine.connect() as conn, conn.begin(): + # with open(os.path.join(os.path.dirname(__file__), 'scripts/kernels.sql'), 'r') as 
query: + # kernels_df = pd.read_sql_query(text(query.read()), conn) + + kernels_df = [] + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + ksi = KernelsSemantic(REPORT_FILE_I) + ksi.Setup() + ksi.load_data() + kernels_df.append(ksi.get_df()) + del ksi + else: + ks = KernelsSemantic(REPORT_FILE) + ks.Setup() + ks.load_data() + kernels_df = ks.get_df() if t_apicalls: - cuda_api_df = pd.read_csv(build_nsys_stats_name("cuda_api_trace")) + cuda_api_df = [] + if MULTIREPORT: + for i, REPORT_FILE_I in enumerate(REPORTS_LIST): + cuda_api_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], "cuda_api_trace"))) + else: + cuda_api_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, "cuda_api_trace")) else: cuda_api_df = pd.DataFrame() if t_nvtx: - nvtx_df = pd.read_csv(build_nsys_stats_name("nvtx_pushpop_trace")) - nvtx_df["domain"] = nvtx_df["Name"].str.split(":").str[0] + if MULTIREPORT: + for i, REPORT_FILE_I in enumerate(REPORTS_LIST): + nvtx_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], "nvtx_pushpop_trace"))) + nvtx_df[i]["domain"] = nvtx_df["Name"].str.split(":").str[0] + else: + nvtx_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, "nvtx_pushpop_trace")) + nvtx_df["domain"] = nvtx_df["Name"].str.split(":").str[0] else: nvtx_df = pd.DataFrame() if t_nvtx_startend: - with engine.connect() as conn, conn.begin(): - with open(os.path.join(os.path.dirname(__file__), 'scripts/nvtx_startend_trace.sql'), 'r') as query: - nvtx_startend_df = pd.read_sql_query(text(query.read()), conn) + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + ksi = NVTXStartEndSemantic(REPORT_FILE_I) + ksi.Setup() + ksi.load_data() + nvtx_startend_df.append(ksi.get_df()) + del ksi + else: + ks = NVTXStartEndSemantic(REPORT_FILE) + ks.Setup() + ks.load_data() + nvtx_startend_df = ks.get_df() + del ks else: nvtx_startend_df = pd.DataFrame() if t_mpi: - with engine.connect() as conn, 
conn.begin(): - try: - with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_p2p.sql'), 'r') as query: - if conn.dialect.has_table(connection=conn, table_name='MPI_P2P_EVENTS') and conn.dialect.has_table(connection=conn, table_name='MPI_START_WAIT_EVENTS'): - mpi_p2p_df = pd.read_sql_query(text(query.read()), conn) - mpi_p2p_df["event_type"] = MPITYPE_PTOP - else: mpi_p2p_df = pd.DataFrame() - with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_coll.sql'), 'r') as query: - if conn.dialect.has_table(connection=conn, table_name='MPI_COLLECTIVES_EVENTS'): - mpi_coll_df = pd.read_sql_query(text(query.read()), conn) - mpi_coll_df = mpi_coll_df.drop(mpi_coll_df[mpi_coll_df["Event"].str.contains("File") ].index) - mpi_coll_df["event_type"] = MPITYPE_COLLECTIVE - else: mpi_coll_df = pd.DataFrame() - with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_other.sql'), 'r') as query: - if conn.dialect.has_table(connection=conn, table_name='MPI_OTHER_EVENTS'): - mpi_other_df = pd.read_sql_query(text(query.read()), conn) - mpi_other_df = mpi_other_df.drop(mpi_other_df[mpi_other_df["Event"].str.contains("File") ].index) - mpi_other_df = mpi_other_df.drop(mpi_other_df[mpi_other_df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate") ].index) - mpi_other_df["event_type"] = MPITYPE_OTHER - else: mpi_other_df = pd.DataFrame() - with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_other.sql'), 'r') as query: - if conn.dialect.has_table(connection=conn, table_name='MPI_OTHER_EVENTS'): - mpi_rma_df = pd.read_sql_query(text(query.read()), conn) - mpi_rma_df = mpi_rma_df[mpi_rma_df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate")] - mpi_rma_df["event_type"] = MPITYPE_RMA - else: mpi_rma_df = pd.DataFrame() - with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_io.sql'), 'r') as query: - if conn.dialect.has_table(connection=conn, table_name='MPI_OTHER_EVENTS') and conn.dialect.has_table(connection=conn, 
table_name='MPI_COLLECTIVES_EVENTS'): - mpi_io_df = pd.read_sql_query(text(query.read()), conn) - mpi_io_df = mpi_io_df[mpi_io_df["Event"].str.contains("File")] - mpi_io_df["event_type"] = MPITYPE_IO - else: mpi_io_df = pd.DataFrame() - mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df]) - except OperationalError as oe: - print("There has been a problem fetching MPI information. MPI data will be skipped.") - print(f"[ERROR]: {oe.detail}") - t_mpi = False - #mpi_df = pd.read_csv(build_nsys_stats_name("mpi_event_trace")) + mpi_df = [] + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + kp2pi = MPIP2PSemantic(REPORT_FILE_I) + kp2pi.Setup() + kp2pi.load_data() + + kcolli = MPICollSemantic(REPORT_FILE_I) + kcolli.Setup() + kcolli.load_data() + + kotheri = MPIOtherSemantic(REPORT_FILE_I) + kotheri.Setup() + kotheri.load_data() + + krmai = MPIRMASemantic(REPORT_FILE_I) + krmai.Setup() + krmai.load_data() + + kioi = MPIIOPSemantic(REPORT_FILE_I) + kioi.Setup() + kioi.load_data() + + mpi_df.append(pd.concat([kp2pi.get_df(), kcolli.get_df(), kotheri.get_df(), krmai.get_df(), kioi.get_df()])) + del kp2pi, kcolli, kotheri, krmai, kioi + else: + kmpi = MPIP2PSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_p2p_df = kmpi.get_df() + + kmpi = MPICollSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_coll_df = kmpi.get_df() + + kmpi = MPIOtherSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_other_df = kmpi.get_df() + + kmpi = MPIRMASemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_rma_df = kmpi.get_df() + + kmpi = MPIIOPSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_io_df = kmpi.get_df() + mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df]) + del kmpi, mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df else: - #mpi_df = pd.DataFrame() - mpi_p2p_df = pd.DataFrame() - mpi_coll_df = pd.DataFrame() - mpi_other_df = pd.DataFrame() 
- mpi_rma_df = pd.DataFrame() - mpi_io_df = pd.DataFrame() - - # Obtain context Info - context_info = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") - if t_mpi: - mpi_query = "SELECT globalTid / 0x1000000 % 0x1000000 AS Pid, globalTid % 0x1000000 AS Tid, rank FROM MPI_RANKS;" - with engine.connect() as conn, conn.begin(): - rank_info = pd.read_sql_query(mpi_query, conn) - - context_info.sort_values(["processId"], inplace=True) + mpi_df = pd.DataFrame() + gpu_metrics_agg = [] if t_metrics: - gpu_metrics = pd.read_sql_table("GPU_METRICS", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") - metrics_description = pd.read_sql_table("TARGET_INFO_GPU_METRICS", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") - gpu_metrics.drop(gpu_metrics[gpu_metrics["timestamp"] < 0].index, inplace=True) # drop negative time - metrics_event_names = metrics_description.groupby(["metricId"]).agg({'metricName': 'first'}).reset_index() - metrics_event_names["metricId"] = metrics_event_names["metricId"] + event_type_metrics_base - #gpu_metrics["task"] = gpu_metrics.groupby(["typeId"]).ngroup() + 1 - gpu_metrics["deviceId"] = gpu_metrics["typeId"].apply(lambda x: x & 0xFF) - gpu_metrics_agg = gpu_metrics.groupby(["timestamp", "typeId"]).agg({'metricId': lambda x: list(x+event_type_metrics_base), - 'value': lambda x: list(x), - 'deviceId': 'first'}) - gpu_metrics_agg.reset_index(inplace=True) - + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + ksi = GPUMetricsSemantic(REPORT_FILE_I) + ksi.Setup() + ksi.load_data() + gpu_metrics_agg.append(ksi.get_df()) + del ksi + else: + ks = GPUMetricsSemantic(REPORT_FILE) + ks.Setup() + ks.load_data() + gpu_metrics_agg = ks.get_df() + del ks if t_openacc: with engine.connect() as conn, conn.begin(): @@ -272,6 +300,30 @@ def main(): openacc_event_kind = pd.read_sql_table("ENUM_OPENACC_EVENT_KIND", conn) + # Obtain context Info + list_contexts = [] + if MULTIREPORT: + for 
REPORT_FILE_I in REPORTS_LIST: + context_info_i = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite") + list_contexts.append(context_info_i) + context_info = pd.concat(list_contexts) + print(context_info) + else: + context_info = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") + context_info.sort_values(["processId"], inplace=True) + + if t_mpi: + if MULTIREPORT: + list_ranks = [] + for REPORT_FILE_I in REPORTS_LIST: + mpi_query = "SELECT globalTid / 0x1000000 % 0x1000000 AS Pid, globalTid % 0x1000000 AS Tid, rank FROM MPI_RANKS;" + engine = create_engine(f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite") + with engine.connect() as conn, conn.begin(): + list_ranks.append(pd.read_sql_query(mpi_query, conn)) + rank_info = pd.concat(list_ranks) + print(rank_info) + + assert(False) # # Building object model # ## Tasks and threads diff --git a/nsys2prv/semantics/__init__.py b/nsys2prv/semantics/__init__.py new file mode 100644 index 0000000..460fd05 --- /dev/null +++ b/nsys2prv/semantics/__init__.py @@ -0,0 +1,4 @@ +from .kernels_semantic import KernelsSemantic +from .mpi_semantic import * +from .nvtx_startend_semantic import NVTXStartEndSemantic +from .gpu_metrics_semantic import GPUMetricsSemantic \ No newline at end of file diff --git a/nsys2prv/semantics/gpu_metrics_semantic.py b/nsys2prv/semantics/gpu_metrics_semantic.py new file mode 100644 index 0000000..097682a --- /dev/null +++ b/nsys2prv/semantics/gpu_metrics_semantic.py @@ -0,0 +1,28 @@ +from .nsys_event import NsysEvent +from pandas import read_sql_table +from sqlalchemy import text + +event_type_metrics_base = 9400 + + +class GPUMetricsSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("GPU_METRICS"): + self.query = text("SELECT * FROM GPU_METRICS") + else: + self._empty = True + + def _preprocess(self): + 
metrics_description = read_sql_table("TARGET_INFO_GPU_METRICS", self._dbcon) + self._df.drop(self._df[self._df["timestamp"] < 0].index, inplace=True) # drop negative time + metrics_event_names = metrics_description.groupby(["metricId"]).agg({'metricName': 'first'}).reset_index() + metrics_event_names["metricId"] = metrics_event_names["metricId"] + event_type_metrics_base + self._df["deviceId"] = self._df["typeId"].apply(lambda x: x & 0xFF) + self._df = self._df.groupby(["timestamp", "typeId"]).agg({'metricId': lambda x: list(x+event_type_metrics_base), + 'value': lambda x: list(x), + 'deviceId': 'first'}) + self._df.reset_index(inplace=True) + return super()._preprocess() \ No newline at end of file diff --git a/nsys2prv/semantics/kernels_semantic.py b/nsys2prv/semantics/kernels_semantic.py new file mode 100644 index 0000000..804128e --- /dev/null +++ b/nsys2prv/semantics/kernels_semantic.py @@ -0,0 +1,12 @@ +from .nsys_event import NsysEvent +import os.path +from sqlalchemy import text + +class KernelsSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + with open(os.path.join(os.path.dirname(__file__), '../scripts/kernels.sql'), 'r') as query: + self.query = text(query.read()) + \ No newline at end of file diff --git a/nsys2prv/semantics/mpi_semantic.py b/nsys2prv/semantics/mpi_semantic.py new file mode 100644 index 0000000..a476376 --- /dev/null +++ b/nsys2prv/semantics/mpi_semantic.py @@ -0,0 +1,78 @@ +from .nsys_event import NsysEvent +import os.path +from .mpi_event_encoding import * +from sqlalchemy import text + +class MPIP2PSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table('MPI_P2P_EVENTS') and self.check_table('MPI_START_WAIT_EVENTS'): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_p2p.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + def _preprocess(self): + 
self._df["event_type"] = MPITYPE_PTOP + return super()._preprocess() + +class MPICollSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("MPI_COLLECTIVES_EVENTS"): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_coll.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + + def _preprocess(self): + self._df = self._df.drop(self._df[self._df["Event"].str.contains("File") ].index) + self._df["event_type"] = MPITYPE_COLLECTIVE + +class MPIOtherSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("MPI_OTHER_EVENTS"): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_other.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + + def _preprocess(self): + self._df = self._df.drop(self._df[self._df["Event"].str.contains("File") ].index) + self._df = self._df.drop(self._df[self._df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate") ].index) + self._df["event_type"] = MPITYPE_OTHER + +class MPIRMASemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("MPI_OTHER_EVENTS"): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_other.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + def _preprocess(self): + self._df = self._df[self._df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate")] + self._df["event_type"] = MPITYPE_RMA + +class MPIIOPSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + if self.check_table("MPI_OTHER_EVENTS") and self.check_table("MPI_COLLECTIVES_EVENTS"): + with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_io.sql'), 'r') as query: + self.query = text(query.read()) + else: + self._empty = True + + def 
_preprocess(self): + self._df = self._df[self._df["Event"].str.contains("File")] + self._df["event_type"] = MPITYPE_IO \ No newline at end of file diff --git a/nsys2prv/semantics/nsys_event.py b/nsys2prv/semantics/nsys_event.py new file mode 100644 index 0000000..9813a14 --- /dev/null +++ b/nsys2prv/semantics/nsys_event.py @@ -0,0 +1,68 @@ +from sqlalchemy import create_engine, exc, inspect +import pandas as pd +import os.path + +class NsysEvent: + + class MissingDatabaseFile(Exception): + def __init__(self, filename): + super().__init__(f'Database file (unknown) does not exist.') + + class InvalidDatabaseFile(Exception): + def __init__(self, filename): + super().__init__(f'Database file (unknown) could not be opened and appears to be invalid.') + + class InvalidSQL(Exception): + def __init__(self, sql): + super().__init__(f'Bad SQL statement: {sql}') + + query = "SELECT 1 AS 'ONE'" + + def __init__(self, report) -> None: + self._dbcon = None + self._dbfile = f"{os.path.splitext(report)[0]}.sqlite" + self._df = pd.DataFrame() + self._empty = False + + if not os.path.exists(self._dbfile): + raise self.MissingDatabaseFile(self._dbfile) + + try: + self._dbcon = create_engine(f"sqlite:///{self._dbfile}") + except exc.SQLAlchemyError: + self._dbcon = None + raise self.InvalidDatabaseFile(self._dbfile) + + def check_table(self, table_name): + insp = inspect(self._dbcon) + return insp.has_table(table_name) + + def Setup(self): + pass + + def _preprocess(self): + pass + + def postprocess(self): + pass + + def load_data(self): + if not self._empty: + try: + self._df = pd.read_sql_query(self.query, self._dbcon) + except pd.errors.DatabaseError: + raise self.InvalidSQL(self.query) + self._preprocess() + + def apply_process_model(self, threads=pd.DataFrame, streams=pd.DataFrame): + self._df["thread"] = self._df["Tid"].map(threads.set_index('Tid')["thread"]) + self._df["task"] = self._df["Tid"].map(threads.set_index('Tid')["task"]) + if 'Rank' in threads.columns: + 
self._df["Rank"] = self._df["Tid"].map(threads.set_index('Tid')["Rank"]) + pass + + def get_threads(self): + return self._df[['Pid', 'Tid']].drop_duplicates() + + def get_df(self): + return self._df.copy() \ No newline at end of file diff --git a/nsys2prv/semantics/nvtx_startend_semantic.py b/nsys2prv/semantics/nvtx_startend_semantic.py new file mode 100644 index 0000000..88241b7 --- /dev/null +++ b/nsys2prv/semantics/nvtx_startend_semantic.py @@ -0,0 +1,11 @@ +from .nsys_event import NsysEvent +import os.path +from sqlalchemy import text + +class NVTXStartEndSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + with open(os.path.join(os.path.dirname(__file__), '../scripts/nvtx_startend_trace.sql'), 'r') as query: + self.query = text(query.read()) \ No newline at end of file -- GitLab From e512afd58ee86209f6153b912d2c8810d244c6a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Wed, 25 Sep 2024 16:56:26 +0200 Subject: [PATCH 3/9] Merges dataframes of multiple reports for different events into single one. Adapts process model construction to support multiple devices on different machines with same device ids. PROVISIONAL TRANSLATION OF MULTIREPORT, MULTINODE TRACE WORKING. 
--- nsys2prv/parse_nsys_stats.py | 56 ++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 5d108e0..4368380 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -194,9 +194,13 @@ def main(): for i, REPORT_FILE_I in enumerate(REPORTS_LIST): nvtx_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], "nvtx_pushpop_trace"))) nvtx_df[i]["domain"] = nvtx_df["Name"].str.split(":").str[0] + nvtx_df[i].rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) + else: nvtx_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, "nvtx_pushpop_trace")) nvtx_df["domain"] = nvtx_df["Name"].str.split(":").str[0] + nvtx_df.rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) + else: nvtx_df = pd.DataFrame() @@ -307,11 +311,14 @@ def main(): context_info_i = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite") list_contexts.append(context_info_i) context_info = pd.concat(list_contexts) - print(context_info) else: context_info = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") context_info.sort_values(["processId"], inplace=True) + ## CONTEXT INFO CHECK FOR MULTIREPORT + if context_info["deviceId"].unique().size == 0: + print(f"\033[93m Warning! Only one unique device ID can be detected in resource identification. If this is not intended, some features will not be available. Please, make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. 
\033[00m") + if t_mpi: if MULTIREPORT: list_ranks = [] @@ -321,9 +328,28 @@ def main(): with engine.connect() as conn, conn.begin(): list_ranks.append(pd.read_sql_query(mpi_query, conn)) rank_info = pd.concat(list_ranks) - print(rank_info) - assert(False) + + # MERGING AND SYNCHRONIZATION + if MULTIREPORT: + kernels_df = pd.concat(kernels_df) + if t_apicalls: + cuda_api_df = pd.concat(cuda_api_df) + + if t_nvtx: + nvtx_df = pd.concat(nvtx_df) + + if t_nvtx_startend: + nvtx_startend_df = pd.concat(nvtx_startend_df) + + if t_mpi: + mpi_df = pd.concat(mpi_df) + + #if t_metrics: + + #if t_openacc: + + # # Building object model # ## Tasks and threads @@ -335,7 +361,6 @@ def main(): if t_mpi: print("MPI calls unique processes: {}, and unique threads: {}".format(mpi_df["Pid"].unique(), mpi_df["Tid"].unique())) if t_openacc: print("OpenACC calls unique processes: {}, and unique threads: {}".format(openacc_other_df["Pid"].unique(), openacc_other_df["Tid"].unique())) - if t_nvtx: nvtx_df.rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True) compute_threads_with = [] if t_apicalls: compute_threads_with.append(cuda_api_df[['Pid', 'Tid']]) @@ -361,6 +386,7 @@ def main(): 'Tid': lambda x: set(x), 'thread': 'count', 'device': 'first' }) + print(tasks_set) cuda_api_df["thread"] = 0 cuda_api_df["task"] = 0 @@ -418,41 +444,44 @@ def main(): streams = kernels_df[['Device', 'Strm', 'deviceid', 'Pid']].drop_duplicates() - streams["thread"] = streams.groupby(["Device"]).cumcount() + 1 + streams["thread"] = streams.groupby(["Pid", "Device"]).cumcount() + 1 #streams["deviceid"] = streams.sort_values("Device").groupby(["Device"]).ngroup() #streams["Pid"] = streams["deviceid"].map(tasks_set.set_index("device")["Pid"]) - streams["task"] = streams["deviceid"].map(tasks_set.reset_index().set_index("device")["task"]) + streams["task"] = streams["Pid"].map(tasks_set.reset_index().set_index("Pid")["task"]) streams['row_name'] = 'CUDA-D'+streams['deviceid'].astype(str) + '.S' + 
streams['Strm'].astype(str) num_streams = streams.count().iloc[0] streams.sort_values(["Pid", "thread"], inplace=True) streams.reset_index(inplace=True) - devices_set = streams.groupby(["deviceid"]).agg({'Device': 'first', + devices_set = streams.groupby(["Pid", "deviceid"]).agg({'Device': 'first', 'Strm': lambda x: set(x), 'thread': 'count', 'task': 'first', 'Pid': 'last'}) + print(devices_set) # Here we finally update the threadId we are going to put in the event record of kernel executions to respect the normal threads before CUDA streams num_normal_threads = tasks_set['thread'] num_normal_threads_repeated = num_normal_threads.repeat(devices_set["thread"]).reset_index()[["thread"]] - streams['thread'] = streams['thread'] + num_normal_threads_repeated["thread"] + # for index,row in kernels_df.iterrows(): # kernels_df.at[index, "thread"] = streams.at[(streams["Strm"] == row["Strm"]).idxmax(), "thread"] # kernels_df.at[index, "deviceid"] = streams.at[(streams["Device"] == row["Device"]).idxmax(), "deviceid"] # More efficient way by chatgpt # First, let's filter streams DataFrame based on conditions - filtered_streams = streams.groupby(["Device", "Strm"]).agg({'thread':'first', 'task':'first'}).reset_index() + filtered_streams = streams.groupby(["Pid", "Strm"]).agg({'thread':'first', 'task':'first'}).reset_index() # Now, merge the filtered streams DataFrame with kernels_df - result_df = kernels_df.merge(filtered_streams, how='left', on=['Device', 'Strm']) + result_df = kernels_df.merge(filtered_streams, how='left', on=["Pid", 'Strm']) + # Copy the results back to kernels_df - kernels_df['thread'] = result_df['thread'] - kernels_df['task'] = result_df['task'] + kernels_df['thread'] = result_df['thread'].to_numpy() + kernels_df['task'] = result_df['task'].to_numpy() + # Add auxiliary stream to streams dataframe if t_metrics: @@ -470,9 +499,6 @@ def main(): # ## Writing ROW file # Now we can write the _row_ file with this information - print(tasks_set) - 
print(devices_set) - print(" -Writing resource model to row file...") row_df = pd.concat([threads[["thread", "task", "row_name"]], streams[["thread", "task", "row_name"]]]) -- GitLab From 1dc011a98bc236cf4900c13920d3304e7a07a88b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Fri, 27 Sep 2024 13:39:35 +0200 Subject: [PATCH 4/9] Solves problem that removed some kernels when merging different reports. Now concatenating dataframes without indexes. --- nsys2prv/parse_nsys_stats.py | 71 +++++--- parser-playground.ipynb | 310 ++++++++++++++++++++--------------- 2 files changed, 229 insertions(+), 152 deletions(-) diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 4368380..0490046 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -8,6 +8,7 @@ import time import subprocess import os import locale +from functools import reduce from sqlalchemy import create_engine, text, dialects from sqlalchemy.exc import OperationalError from .EventWriter import event_writer as ewr @@ -155,7 +156,7 @@ def main(): else: nsi.call_stats(REPORT_FILE) - #assert(False) + # MARK: IMPORT DATASETS print("Importing datasets...") # kernels_df = pd.read_csv(build_nsys_stats_name("cuda_gpu_trace")) @@ -166,11 +167,13 @@ def main(): kernels_df = [] if MULTIREPORT: + sum = 0 for REPORT_FILE_I in REPORTS_LIST: ksi = KernelsSemantic(REPORT_FILE_I) ksi.Setup() ksi.load_data() kernels_df.append(ksi.get_df()) + sum += ksi.get_df().shape[0] del ksi else: ks = KernelsSemantic(REPORT_FILE) @@ -190,10 +193,11 @@ def main(): cuda_api_df = pd.DataFrame() if t_nvtx: + nvtx_df = [] if MULTIREPORT: for i, REPORT_FILE_I in enumerate(REPORTS_LIST): nvtx_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], "nvtx_pushpop_trace"))) - nvtx_df[i]["domain"] = nvtx_df["Name"].str.split(":").str[0] + nvtx_df[i]["domain"] = nvtx_df[i]["Name"].str.split(":").str[0] nvtx_df[i].rename(columns={"PID":"Pid", "TID":"Tid"}, 
inplace=True) else: @@ -205,6 +209,7 @@ def main(): nvtx_df = pd.DataFrame() if t_nvtx_startend: + nvtx_startend_df = [] if MULTIREPORT: for REPORT_FILE_I in REPORTS_LIST: ksi = NVTXStartEndSemantic(REPORT_FILE_I) @@ -245,7 +250,7 @@ def main(): kioi.Setup() kioi.load_data() - mpi_df.append(pd.concat([kp2pi.get_df(), kcolli.get_df(), kotheri.get_df(), kotheri.get_df(), krmai.get_df(), kioi.get_df()])) + mpi_df.append(pd.concat([kp2pi.get_df(), kcolli.get_df(), kotheri.get_df(), kotheri.get_df(), krmai.get_df(), kioi.get_df()], ignore_index=True)) del kp2pi, kcolli, kotheri, krmai, kioi else: kmpi = MPIP2PSemantic(REPORT_FILE) @@ -272,7 +277,7 @@ def main(): kmpi.Setup() kmpi.load_data() mpi_io_df = kmpi.get_df() - mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df]) + mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df], ignore_index=True) del kmpi, mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df else: mpi_df = pd.DataFrame() @@ -304,7 +309,7 @@ def main(): openacc_event_kind = pd.read_sql_table("ENUM_OPENACC_EVENT_KIND", conn) - # Obtain context Info + # MARK: CONTEXT INFO list_contexts = [] if MULTIREPORT: for REPORT_FILE_I in REPORTS_LIST: @@ -315,7 +320,7 @@ def main(): context_info = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite") context_info.sort_values(["processId"], inplace=True) - ## CONTEXT INFO CHECK FOR MULTIREPORT + # CONTEXT INFO CHECK FOR MULTIREPORT if context_info["deviceId"].unique().size == 0: print(f"\033[93m Warning! Only one unique device ID can be detected in resource identification. If this is not intended, some features will not be available. Please, make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. 
\033[00m") @@ -330,33 +335,57 @@ def main(): rank_info = pd.concat(list_ranks) - # MERGING AND SYNCHRONIZATION + # MARK: MERGING AND ALIGNING if MULTIREPORT: - kernels_df = pd.concat(kernels_df) + # Find delta between earliest trace start and the others + session_time = [] + for REPORT_FILE_I in REPORTS_LIST: + session_time.append(pd.read_sql_table("TARGET_INFO_SESSION_START_TIME", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite")) + + session_time = [x.iloc[0,0] for x in session_time] # Get the utcEpochNs + earliest_time = reduce(lambda x, y: min(x, y), session_time, float('inf')) + deltas = [start - earliest_time for start in session_time] + for i, df in enumerate(kernels_df): + df['Start (ns)'] += deltas[i] + kernels_df = pd.concat(kernels_df, ignore_index=True) + print(f"After concat: {kernels_df.shape}") + if t_apicalls: - cuda_api_df = pd.concat(cuda_api_df) + for i, df in enumerate(cuda_api_df): + df['Start (ns)'] += deltas[i] + + cuda_api_df = pd.concat(cuda_api_df, ignore_index=True) if t_nvtx: - nvtx_df = pd.concat(nvtx_df) + for i, df in enumerate(nvtx_df): + df['Start (ns)'] += deltas[i] + df['End (ns)'] += deltas[i] + nvtx_df = pd.concat(nvtx_df, ignore_index=True) if t_nvtx_startend: - nvtx_startend_df = pd.concat(nvtx_startend_df) + for i, df in enumerate(nvtx_startend_df): + df['Start (ns)'] += deltas[i] + df['End (ns)'] += deltas[i] + nvtx_startend_df = pd.concat(nvtx_startend_df, ignore_index=True) if t_mpi: - mpi_df = pd.concat(mpi_df) + for i, df in enumerate(mpi_df): + df['Start:ts_ns'] += deltas[i] + df['End:ts_ns'] += deltas[i] + mpi_df = pd.concat(mpi_df, ignore_index=True) #if t_metrics: #if t_openacc: - - - # # Building object model + + + # MARK: PROCESS MODEL # ## Tasks and threads # Now, find unique appearences of ThreadID and ProcessID if t_apicalls: print("CUDA calls unique processes: {}, and unique threads: {}".format(cuda_api_df["Pid"].unique(), cuda_api_df["Tid"].unique())) - if t_nvtx: print("NVTX ranges unique processes: 
{}, and unique threads: {}".format(nvtx_df["PID"].unique(), nvtx_df["TID"].unique())) + if t_nvtx: print("NVTX ranges unique processes: {}, and unique threads: {}".format(nvtx_df["Pid"].unique(), nvtx_df["Tid"].unique())) if t_nvtx_startend: print("NVTX startend unique processes: {}, and unique threads: {}".format(nvtx_startend_df["Pid"].unique(), nvtx_startend_df["Tid"].unique())) if t_mpi: print("MPI calls unique processes: {}, and unique threads: {}".format(mpi_df["Pid"].unique(), mpi_df["Tid"].unique())) if t_openacc: print("OpenACC calls unique processes: {}, and unique threads: {}".format(openacc_other_df["Pid"].unique(), openacc_other_df["Tid"].unique())) @@ -482,7 +511,6 @@ def main(): kernels_df['thread'] = result_df['thread'].to_numpy() kernels_df['task'] = result_df['task'].to_numpy() - # Add auxiliary stream to streams dataframe if t_metrics: aux_streams = devices_set.reset_index()[["deviceid", "Device", "thread", "task"]] @@ -515,7 +543,7 @@ def main(): row_file.write("\n") - # # Collecting event values + # MARK: EVENT NAMES # Second step is collect all different event values for CUDA API calls, kernel names, and NVTX ranges. Each of these define a different event type, and will need unique identifiers to be used as a event values. Finally these needs to be dumped to the PCF file. 
print("Collecting event names and information...") @@ -545,7 +573,7 @@ def main(): kernel_names["Name"] = kernel_names["Name"].apply(lambda x: x.replace("[", "").replace("]", "")) if t_nvtx: - nvtx_df_subset = nvtx_df + nvtx_df_subset = nvtx_df.reset_index() lower_level = max(nvtx_df["Lvl"]) if nvtx_select_frames: @@ -846,9 +874,9 @@ GRADIENT_NAMES pcf_file.write("{} {}\n".format(row["func_value"], row["func"])) pcf_file.write("\n") + # MARK: MEMORY # # Split of kernel execution between compute and memory - memops_names = ["[CUDA memcpy Device-to-Device]", "[CUDA memcpy Device-to-Host]", "[CUDA memcpy Host-to-Device]", "[CUDA memset]", "[CUDA memcpy Peer-to-Peer]"] memops_df = kernels_df.loc[kernels_df["Name"].isin(memops_names)] mask = ~kernels_df.index.isin(memops_df.index) @@ -860,6 +888,7 @@ GRADIENT_NAMES comm_memory_df = cuda_api_df.merge(memops_df, how="inner", left_on=["CorrID", "task"], right_on=["CorrID", "task"], suffixes=("_call", "_mem"), validate="one_to_one") + # MARK: TIMELINE RECONS # # Timeline reconstruction print("Reconstructing timeline...") @@ -979,6 +1008,8 @@ GRADIENT_NAMES print(f"Congratulations! 
Trace {trace_name}.prv correctly translated.") + + # MARK: POSTPROCESSING # ## Postprocessing # - Reorder trace # - GZip trace diff --git a/parser-playground.ipynb b/parser-playground.ipynb index 10a2412..cb91839 100644 --- a/parser-playground.ipynb +++ b/parser-playground.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -14,6 +14,10 @@ "import locale\n", "import sqlite3\n", "from sqlalchemy import create_engine, text\n", + "from nsys2prv.EventWriter import event_writer as ewr\n", + "from nsys2prv.NSYSInterface import NSYSInterface\n", + "from nsys2prv.semantics.mpi_event_encoding import *\n", + "from nsys2prv.semantics import *\n", "\n", "NSYS_HOME = os.path.abspath(\"/home/mclasca/Apps/nsight-system/2024.5.1/\")\n", "#NSIGHT_HOME = os.getenv('NSIGHT_HOME')\n", @@ -27,6 +31,12 @@ "#REPORT_NAME=\"heka-step53+accum1-profile-2023.4-5721957\"\n", "#REPORT_NAME=\"heka-axolotl-Mistral7B0.1-profile-2110598\"\n", "\n", + "MULTIREPORT = True\n", + "if MULTIREPORT:\n", + " REPORTS_LIST = [os.path.abspath(x) for x in [\"/home/mclasca/Documents/BePPP/heka/proves/multi_2nodes/sod2d_0.nsys-rep\", \"/home/mclasca/Documents/BePPP/heka/proves/multi_2nodes/sod2d_1.nsys-rep\", \"/home/mclasca/Documents/BePPP/heka/proves/multi_2nodes/sod2d_2.nsys-rep\"]]\n", + " REPORT_DIRS_LIST = [os.path.dirname(x) for x in REPORTS_LIST]\n", + " REPORT_FILE = REPORTS_LIST[0] # For fast checks, it's best to have a reference report\n", + "\n", "locale.setlocale(locale.LC_ALL, '')\n", "\n", "trace_name = \"test-heka\"\n", @@ -69,6 +79,7 @@ "nvtx_stack_bottom = 4\n", "\n", "reports = [\"cuda_api_trace\", \"cuda_gpu_trace\"]\n", + "nsi = NSYSInterface(reports, False, NVTX_RANGE, False)\n", "\n", "def build_nsys_stats_name(report_name):\n", " base_name = os.path.splitext(os.path.basename(REPORT_FILE))[0]\n", @@ -684,6 +695,43 @@ " raise ChildProcessError(p.returncode, p.args)" ] }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cuda_api_df = []\n", + "if MULTIREPORT:\n", + " for i, REPORT_FILE_I in enumerate(REPORTS_LIST):\n", + " cuda_api_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], \"cuda_api_trace\")))\n", + "else:\n", + " cuda_api_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, \"cuda_api_trace\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "kernels_df = []\n", + "if MULTIREPORT:\n", + " sum = 0\n", + " for REPORT_FILE_I in REPORTS_LIST:\n", + " ksi = KernelsSemantic(REPORT_FILE_I)\n", + " ksi.Setup()\n", + " ksi.load_data()\n", + " kernels_df.append(ksi.get_df())\n", + " sum += ksi.get_df().shape[0]\n", + " del ksi\n", + "else:\n", + " ks = KernelsSemantic(REPORT_FILE)\n", + " ks.Setup()\n", + " ks.load_data()\n", + " kernels_df = ks.get_df()" + ] + }, { "cell_type": "code", "execution_count": 8, @@ -711,7 +759,7 @@ " \n", " \n", " Start (ns)\n", - " Duration (ns)\n", + " Duration:dur_ns\n", " CorrID\n", " GrdX\n", " GrdY\n", @@ -721,12 +769,12 @@ " BlkZ\n", " Reg/Trd\n", " ...\n", - " DymSMem (MB)\n", - " Bytes (MB)\n", - " Throughput (MB/s)\n", + " Throughput:thru_B\n", " SrcMemKd\n", " DstMemKd\n", " Device\n", + " deviceid\n", + " Pid\n", " Ctx\n", " GreenCtx\n", " Strm\n", @@ -736,8 +784,8 @@ " \n", " \n", " 0\n", - " 1307261431\n", - " 992\n", + " 1271556323\n", + " 960\n", " 688\n", " NaN\n", " NaN\n", @@ -747,21 +795,21 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 32,258\n", + " 33333312.0\n", " Pageable\n", " Device\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Host-to-Device]\n", " \n", " \n", " 1\n", - " 1662416719\n", - " 960\n", + " 1626687401\n", + " 896\n", " 1285\n", " NaN\n", " NaN\n", @@ -771,20 +819,20 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - 
" 0,000\n", - " 133,333\n", + " 142857088.0\n", " Pageable\n", " Device\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Host-to-Device]\n", " \n", " \n", " 2\n", - " 1662448494\n", + " 1626718377\n", " 736\n", " 1291\n", " NaN\n", @@ -795,21 +843,21 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 173,913\n", + " 173912960.0\n", " Pageable\n", " Device\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Host-to-Device]\n", " \n", " \n", " 3\n", - " 1662469326\n", - " 768\n", + " 1626741640\n", + " 704\n", " 1297\n", " NaN\n", " NaN\n", @@ -819,21 +867,21 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 166,667\n", + " 181818112.0\n", " Pageable\n", " Device\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Host-to-Device]\n", " \n", " \n", " 4\n", - " 1662702764\n", - " 704\n", + " 1626968327\n", + " 736\n", " 1327\n", " NaN\n", " NaN\n", @@ -843,14 +891,14 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 34,091\n", + " 38043460.0\n", " Pageable\n", " Device\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Host-to-Device]\n", " \n", @@ -879,10 +927,10 @@ " ...\n", " \n", " \n", - " 6645\n", - " 11824673327\n", - " 1378516\n", - " 68613\n", + " 7317\n", + " 11788939184\n", + " 1375193\n", + " 78406\n", " 65535.0\n", " 1.0\n", " 1.0\n", @@ -891,22 +939,22 @@ " 1.0\n", " 36.0\n", " ...\n", - " 0,001\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NVIDIA H100 (1)\n", + " None\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " mod_time_ops_adapt_dt_cfl_32_gpu\n", " \n", " \n", - " 6646\n", - " 11826052707\n", - " 99903\n", - " 
68614\n", + " 7318\n", + " 11790315529\n", + " 99840\n", + " 78407\n", " 3.0\n", " 1.0\n", " 1.0\n", @@ -915,22 +963,22 @@ " 1.0\n", " 18.0\n", " ...\n", - " 0,001\n", - " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NVIDIA H100 (1)\n", + " None\n", + " None\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " mod_time_ops_adapt_dt_cfl_32_gpu__red\n", " \n", " \n", - " 6647\n", - " 11826167106\n", - " 2176\n", - " 68616\n", + " 7319\n", + " 11790426664\n", + " 2272\n", + " 78409\n", " NaN\n", " NaN\n", " NaN\n", @@ -939,22 +987,22 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 1,838\n", + " 1760560.0\n", " Device\n", " Pageable\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Device-to-Host]\n", " \n", " \n", - " 6648\n", - " 11826178754\n", - " 2176\n", - " 68617\n", + " 7320\n", + " 11790437737\n", + " 2304\n", + " 78410\n", " NaN\n", " NaN\n", " NaN\n", @@ -963,22 +1011,22 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 1,838\n", + " 1736108.0\n", " Device\n", " Pageable\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Device-to-Host]\n", " \n", " \n", - " 6649\n", - " 11826190114\n", - " 2176\n", - " 68618\n", + " 7321\n", + " 11790448584\n", + " 2240\n", + " 78411\n", " NaN\n", " NaN\n", " NaN\n", @@ -987,61 +1035,61 @@ " NaN\n", " NaN\n", " ...\n", - " NaN\n", - " 0,000\n", - " 1,838\n", + " 1785712.0\n", " Device\n", " Pageable\n", - " NVIDIA H100 (1)\n", + " NVIDIA H100 (2)\n", + " 2\n", + " 2308062\n", " 1\n", - " NaN\n", + " None\n", " 16\n", " [CUDA memcpy Device-to-Host]\n", " \n", " \n", "\n", - "

6650 rows × 21 columns

\n", + "

7322 rows × 23 columns

\n", "" ], "text/plain": [ - " Start (ns) Duration (ns) CorrID GrdX GrdY GrdZ BlkX BlkY \\\n", - "0 1307261431 992 688 NaN NaN NaN NaN NaN \n", - "1 1662416719 960 1285 NaN NaN NaN NaN NaN \n", - "2 1662448494 736 1291 NaN NaN NaN NaN NaN \n", - "3 1662469326 768 1297 NaN NaN NaN NaN NaN \n", - "4 1662702764 704 1327 NaN NaN NaN NaN NaN \n", - "... ... ... ... ... ... ... ... ... \n", - "6645 11824673327 1378516 68613 65535.0 1.0 1.0 32.0 1.0 \n", - "6646 11826052707 99903 68614 3.0 1.0 1.0 256.0 1.0 \n", - "6647 11826167106 2176 68616 NaN NaN NaN NaN NaN \n", - "6648 11826178754 2176 68617 NaN NaN NaN NaN NaN \n", - "6649 11826190114 2176 68618 NaN NaN NaN NaN NaN \n", + " Start (ns) Duration:dur_ns CorrID GrdX GrdY GrdZ BlkX BlkY \\\n", + "0 1271556323 960 688 NaN NaN NaN NaN NaN \n", + "1 1626687401 896 1285 NaN NaN NaN NaN NaN \n", + "2 1626718377 736 1291 NaN NaN NaN NaN NaN \n", + "3 1626741640 704 1297 NaN NaN NaN NaN NaN \n", + "4 1626968327 736 1327 NaN NaN NaN NaN NaN \n", + "... ... ... ... ... ... ... ... ... \n", + "7317 11788939184 1375193 78406 65535.0 1.0 1.0 32.0 1.0 \n", + "7318 11790315529 99840 78407 3.0 1.0 1.0 256.0 1.0 \n", + "7319 11790426664 2272 78409 NaN NaN NaN NaN NaN \n", + "7320 11790437737 2304 78410 NaN NaN NaN NaN NaN \n", + "7321 11790448584 2240 78411 NaN NaN NaN NaN NaN \n", "\n", - " BlkZ Reg/Trd ... DymSMem (MB) Bytes (MB) Throughput (MB/s) SrcMemKd \\\n", - "0 NaN NaN ... NaN 0,000 32,258 Pageable \n", - "1 NaN NaN ... NaN 0,000 133,333 Pageable \n", - "2 NaN NaN ... NaN 0,000 173,913 Pageable \n", - "3 NaN NaN ... NaN 0,000 166,667 Pageable \n", - "4 NaN NaN ... NaN 0,000 34,091 Pageable \n", - "... ... ... ... ... ... ... ... \n", - "6645 1.0 36.0 ... 0,001 NaN NaN NaN \n", - "6646 1.0 18.0 ... 0,001 NaN NaN NaN \n", - "6647 NaN NaN ... NaN 0,000 1,838 Device \n", - "6648 NaN NaN ... NaN 0,000 1,838 Device \n", - "6649 NaN NaN ... NaN 0,000 1,838 Device \n", + " BlkZ Reg/Trd ... 
Throughput:thru_B SrcMemKd DstMemKd \\\n", + "0 NaN NaN ... 33333312.0 Pageable Device \n", + "1 NaN NaN ... 142857088.0 Pageable Device \n", + "2 NaN NaN ... 173912960.0 Pageable Device \n", + "3 NaN NaN ... 181818112.0 Pageable Device \n", + "4 NaN NaN ... 38043460.0 Pageable Device \n", + "... ... ... ... ... ... ... \n", + "7317 1.0 36.0 ... NaN None None \n", + "7318 1.0 18.0 ... NaN None None \n", + "7319 NaN NaN ... 1760560.0 Device Pageable \n", + "7320 NaN NaN ... 1736108.0 Device Pageable \n", + "7321 NaN NaN ... 1785712.0 Device Pageable \n", "\n", - " DstMemKd Device Ctx GreenCtx Strm \\\n", - "0 Device NVIDIA H100 (1) 1 NaN 16 \n", - "1 Device NVIDIA H100 (1) 1 NaN 16 \n", - "2 Device NVIDIA H100 (1) 1 NaN 16 \n", - "3 Device NVIDIA H100 (1) 1 NaN 16 \n", - "4 Device NVIDIA H100 (1) 1 NaN 16 \n", - "... ... ... .. ... ... \n", - "6645 NaN NVIDIA H100 (1) 1 NaN 16 \n", - "6646 NaN NVIDIA H100 (1) 1 NaN 16 \n", - "6647 Pageable NVIDIA H100 (1) 1 NaN 16 \n", - "6648 Pageable NVIDIA H100 (1) 1 NaN 16 \n", - "6649 Pageable NVIDIA H100 (1) 1 NaN 16 \n", + " Device deviceid Pid Ctx GreenCtx Strm \\\n", + "0 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "1 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "2 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "3 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "4 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "... ... ... ... .. ... ... \n", + "7317 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "7318 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "7319 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "7320 NVIDIA H100 (2) 2 2308062 1 None 16 \n", + "7321 NVIDIA H100 (2) 2 2308062 1 None 16 \n", "\n", " Name \n", "0 [CUDA memcpy Host-to-Device] \n", @@ -1050,13 +1098,13 @@ "3 [CUDA memcpy Host-to-Device] \n", "4 [CUDA memcpy Host-to-Device] \n", "... ... 
\n", - "6645 mod_time_ops_adapt_dt_cfl_32_gpu \n", - "6646 mod_time_ops_adapt_dt_cfl_32_gpu__red \n", - "6647 [CUDA memcpy Device-to-Host] \n", - "6648 [CUDA memcpy Device-to-Host] \n", - "6649 [CUDA memcpy Device-to-Host] \n", + "7317 mod_time_ops_adapt_dt_cfl_32_gpu \n", + "7318 mod_time_ops_adapt_dt_cfl_32_gpu__red \n", + "7319 [CUDA memcpy Device-to-Host] \n", + "7320 [CUDA memcpy Device-to-Host] \n", + "7321 [CUDA memcpy Device-to-Host] \n", "\n", - "[6650 rows x 21 columns]" + "[7322 rows x 23 columns]" ] }, "execution_count": 8, @@ -1065,9 +1113,7 @@ } ], "source": [ - "kernels_df = pd.read_csv(build_nsys_stats_name(\"cuda_gpu_trace\"))\n", - "kernels_df.rename(columns={\"CorrId\": \"CorrID\"}, inplace=True)\n", - "kernels_df" + "kernels_df[2]" ] }, { -- GitLab From 59ea3ffdb70d7b58ed7f8ed532c0e93b231e1341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Fri, 27 Sep 2024 14:26:13 +0200 Subject: [PATCH 5/9] Adds OpenACC to multireport construction --- nsys2prv/parse_nsys_stats.py | 56 +- nsys2prv/semantics/__init__.py | 3 +- nsys2prv/semantics/openacc_semantic.py | 27 + parser-playground.ipynb | 1368 ++++++++++++++++++------ 4 files changed, 1116 insertions(+), 338 deletions(-) create mode 100644 nsys2prv/semantics/openacc_semantic.py diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 0490046..80f763b 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -299,14 +299,39 @@ def main(): del ks if t_openacc: - with engine.connect() as conn, conn.begin(): - with open(os.path.join(os.path.dirname(__file__), 'scripts/openacc_other.sql'), 'r') as query: - openacc_other_df = pd.read_sql_query(text(query.read()), conn) - with open(os.path.join(os.path.dirname(__file__), 'scripts/openacc_launch.sql'), 'r') as query: - openacc_launch_df = pd.read_sql_query(text(query.read()), conn) - with open(os.path.join(os.path.dirname(__file__), 'scripts/openacc_data.sql'), 'r') as query: - openacc_data_df = 
pd.read_sql_query(text(query.read()), conn) - openacc_event_kind = pd.read_sql_table("ENUM_OPENACC_EVENT_KIND", conn) + if MULTIREPORT: + openacc_other_df = [] + openacc_launch_df = [] + openacc_data_df = [] + for REPORT_FILE_I in REPORTS_LIST: + ksio = OpenACCOtherSemantic(REPORT_FILE_I) + ksio.Setup() + ksio.load_data() + openacc_other_df.append(ksio.get_df()) + ksil = OpenACCLaunchSemantic(REPORT_FILE_I) + ksil.Setup() + ksil.load_data() + openacc_launch_df.append(ksil.get_df()) + ksid = OpenACCDataSemantic(REPORT_FILE_I) + ksid.Setup() + ksid.load_data() + openacc_data_df.append(ksid.get_df()) + del ksio, ksil, ksid + else: + kso = OpenACCOtherSemantic(REPORT_FILE_I) + kso.Setup() + kso.load_data() + openacc_other_df = kso.get_df() + ksl = OpenACCLaunchSemantic(REPORT_FILE_I) + ksl.Setup() + ksl.load_data() + openacc_launch_df = ksl.get_df() + ksd = OpenACCDataSemantic(REPORT_FILE_I) + ksd.Setup() + ksd.load_data() + openacc_data_df = ksd.get_df() + del kso, ksl, ksd + openacc_event_kind = pd.read_sql_table("ENUM_OPENACC_EVENT_KIND", f"sqlite:///{os.path.splitext(REPORTS_LIST[0])[0]}.sqlite") # MARK: CONTEXT INFO @@ -374,9 +399,22 @@ def main(): df['End:ts_ns'] += deltas[i] mpi_df = pd.concat(mpi_df, ignore_index=True) + if t_openacc: + for i, df in enumerate(openacc_other_df): + df['start'] += deltas[i] + df['end'] += deltas[i] + for i, df in enumerate(openacc_launch_df): + df['start'] += deltas[i] + df['end'] += deltas[i] + for i, df in enumerate(openacc_data_df): + df['start'] += deltas[i] + df['end'] += deltas[i] + openacc_other_df = pd.concat(openacc_other_df, ignore_index=True) + openacc_launch_df = pd.concat(openacc_launch_df, ignore_index=True) + openacc_data_df = pd.concat(openacc_data_df, ignore_index=True) + #if t_metrics: - #if t_openacc: # MARK: PROCESS MODEL diff --git a/nsys2prv/semantics/__init__.py b/nsys2prv/semantics/__init__.py index 460fd05..1976280 100644 --- a/nsys2prv/semantics/__init__.py +++ b/nsys2prv/semantics/__init__.py @@ -1,4 
+1,5 @@ from .kernels_semantic import KernelsSemantic from .mpi_semantic import * from .nvtx_startend_semantic import NVTXStartEndSemantic -from .gpu_metrics_semantic import GPUMetricsSemantic \ No newline at end of file +from .gpu_metrics_semantic import GPUMetricsSemantic +from .openacc_semantic import * \ No newline at end of file diff --git a/nsys2prv/semantics/openacc_semantic.py b/nsys2prv/semantics/openacc_semantic.py new file mode 100644 index 0000000..1059719 --- /dev/null +++ b/nsys2prv/semantics/openacc_semantic.py @@ -0,0 +1,27 @@ +from .nsys_event import NsysEvent +import os.path +from sqlalchemy import text + +class OpenACCOtherSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + with open(os.path.join(os.path.dirname(__file__), '../scripts/openacc_other.sql'), 'r') as query: + self.query = text(query.read()) + +class OpenACCLaunchSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + with open(os.path.join(os.path.dirname(__file__), '../scripts/openacc_launch.sql'), 'r') as query: + self.query = text(query.read()) + +class OpenACCDataSemantic(NsysEvent): + def __init__(self, report) -> None: + super().__init__(report) + + def Setup(self): + with open(os.path.join(os.path.dirname(__file__), '../scripts/openacc_data.sql'), 'r') as query: + self.query = text(query.read()) \ No newline at end of file diff --git a/parser-playground.ipynb b/parser-playground.ipynb index cb91839..c4b2ad5 100644 --- a/parser-playground.ipynb +++ b/parser-playground.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -697,10 +697,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "list_contexts = []\n", + "if MULTIREPORT:\n", + " for REPORT_FILE_I in REPORTS_LIST:\n", + " context_info_i = 
pd.read_sql_table(\"TARGET_INFO_CUDA_CONTEXT_INFO\", f\"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite\")\n", + " list_contexts.append(context_info_i)\n", + " context_info = pd.concat(list_contexts)\n", + "else:\n", + " context_info = pd.read_sql_table(\"TARGET_INFO_CUDA_CONTEXT_INFO\", f\"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite\")\n", + "context_info.sort_values([\"processId\"], inplace=True)\n", + "\n", + "# CONTEXT INFO CHECK FOR MULTIREPORT\n", + "if context_info[\"deviceId\"].unique().size == 0:\n", + " print(f\"\\033[93m Warning! Only one unique device ID can be detected in resource identification. If this is not intended, some features will not be available. Please, make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. \\033[00m\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ + "t_apicalls = True\n", "cuda_api_df = []\n", "if MULTIREPORT:\n", " for i, REPORT_FILE_I in enumerate(REPORTS_LIST):\n", @@ -734,7 +756,66 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "After concat: (21299, 23)\n" + ] + } + ], + "source": [ + "from functools import reduce\n", + "# MARK: MERGING AND ALIGNING\n", + "if MULTIREPORT:\n", + " # Find delta between earliest trace start and the others\n", + " session_time = []\n", + " for REPORT_FILE_I in REPORTS_LIST:\n", + " session_time.append(pd.read_sql_table(\"TARGET_INFO_SESSION_START_TIME\", f\"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite\"))\n", + " \n", + " session_time = [x.iloc[0,0] for x in session_time] # Get the utcEpochNs\n", + " earliest_time = reduce(lambda x, y: min(x, y), session_time, float('inf'))\n", + " deltas = [start - earliest_time for start in session_time]\n", + " for i, df in enumerate(kernels_df):\n", + " df['Start 
(ns)'] += deltas[i]\n", + " kernels_df = pd.concat(kernels_df)\n", + " print(f\"After concat: {kernels_df.shape}\")\n", + "\n", + " if t_apicalls:\n", + " for i, df in enumerate(cuda_api_df):\n", + " df['Start (ns)'] += deltas[i]\n", + "\n", + " cuda_api_df = pd.concat(cuda_api_df)\n", + "\n", + " # if t_nvtx:\n", + " # for i, df in enumerate(nvtx_df):\n", + " # df['Start (ns)'] += deltas[i]\n", + " # df['End (ns)'] += deltas[i]\n", + " # nvtx_df = pd.concat(nvtx_df)\n", + "\n", + " # if t_nvtx_startend:\n", + " # for i, df in enumerate(nvtx_startend_df):\n", + " # df['Start (ns)'] += deltas[i]\n", + " # df['End (ns)'] += deltas[i]\n", + " # nvtx_startend_df = pd.concat(nvtx_startend_df)\n", + "\n", + " # if t_mpi:\n", + " # for i, df in enumerate(mpi_df):\n", + " # df['Start:ts_ns'] += deltas[i]\n", + " # df['End:ts_ns'] += deltas[i]\n", + " # mpi_df = pd.concat(mpi_df)\n", + " \n", + " #if t_metrics:\n", + "\n", + " #if t_openacc:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -758,227 +839,628 @@ " \n", " \n", " \n", - " Start (ns)\n", - " Duration:dur_ns\n", - " CorrID\n", - " GrdX\n", - " GrdY\n", - " GrdZ\n", - " BlkX\n", - " BlkY\n", - " BlkZ\n", - " Reg/Trd\n", - " ...\n", - " Throughput:thru_B\n", - " SrcMemKd\n", - " DstMemKd\n", - " Device\n", - " deviceid\n", " Pid\n", - " Ctx\n", - " GreenCtx\n", - " Strm\n", - " Name\n", + " Tid\n", + " thread\n", + " device\n", " \n", - " \n", - " \n", " \n", - " 0\n", - " 1271556323\n", - " 960\n", - " 688\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " 33333312.0\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (2)\n", - " 2\n", - " 2308062\n", - " 1\n", - " None\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", + " task\n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", " 1\n", - " 1626687401\n", - " 896\n", - " 1285\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 
NaN\n", - " NaN\n", - " ...\n", - " 142857088.0\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (2)\n", - " 2\n", - " 2308062\n", + " 2308061\n", + " {2308061}\n", " 1\n", - " None\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", + " 0\n", " \n", " \n", " 2\n", - " 1626718377\n", - " 736\n", - " 1291\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " 173912960.0\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (2)\n", - " 2\n", " 2308062\n", + " {2308062}\n", " 1\n", - " None\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", + " 2\n", " \n", " \n", " 3\n", - " 1626741640\n", - " 704\n", - " 1297\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " 181818112.0\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (2)\n", - " 2\n", - " 2308062\n", + " 2308065\n", + " {2308065}\n", " 1\n", - " None\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", - " \n", - " \n", - " 4\n", - " 1626968327\n", - " 736\n", - " 1327\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " 38043460.0\n", - " Pageable\n", - " Device\n", - " NVIDIA H100 (2)\n", - " 2\n", - " 2308062\n", " 1\n", - " None\n", - " 16\n", - " [CUDA memcpy Host-to-Device]\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " Pid Tid thread device\n", + "task \n", + "1 2308061 {2308061} 1 0\n", + "2 2308062 {2308062} 1 2\n", + "3 2308065 {2308065} 1 1" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "compute_threads_with = []\n", + "compute_threads_with.append(cuda_api_df[['Pid', 'Tid']])\n", + "# if t_nvtx: compute_threads_with.append(nvtx_df[[\"Pid\", \"Tid\"]])\n", + "# if t_nvtx_startend: compute_threads_with.append(nvtx_startend_df[[\"Pid\", \"Tid\"]])\n", + "# if t_mpi: compute_threads_with.append(mpi_df[[\"Pid\", \"Tid\"]])\n", + "# if t_openacc: 
compute_threads_with.append(openacc_other_df[[\"Pid\", \"Tid\"]])\n", + "\n", + "t_mpi = False\n", + "threads = pd.concat(compute_threads_with).drop_duplicates()\n", + "if t_mpi:\n", + " threads[\"Rank\"] = threads[\"Pid\"].map(rank_info.set_index(\"Pid\")[\"rank\"])\n", + " threads.sort_values([\"Rank\"], inplace=True)\n", + "else:\n", + " threads.sort_values([\"Pid\"], inplace=True)\n", + "threads[\"thread\"] = threads.groupby([\"Pid\"]).cumcount() + 1\n", + "threads[\"task\"] = threads.groupby([\"Pid\"]).ngroup() + 1\n", + "threads[\"device\"] = threads[\"Pid\"].map(context_info[context_info[\"contextId\"] == 1].set_index(\"processId\")[\"deviceId\"])\n", + "#threads.sort_values([\"task\", \"thread\"], inplace=True)\n", + "threads.reset_index()\n", + "\n", + "tasks_set = threads.groupby([\"task\"]).agg({'Pid': 'first',\n", + " 'Tid': lambda x: set(x),\n", + " 'thread': 'count',\n", + " 'device': 'first' })\n", + "tasks_set" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DeviceStrmthreadtaskPid
..................................................................Piddeviceid
73171178893918413751937840665535.01.01.032.01.01.036.0...NaNNoneNoneNVIDIA H100 (2)2230806223080610NVIDIA H100 (0){16, 17, 18, 19}41None16mod_time_ops_adapt_dt_cfl_32_gpu2308061
73181179031552999840784073.01.01.0256.01.01.018.0...NaNNoneNone23080622NVIDIA H100 (2){16, 17, 18, 19}4223080621
23080651NVIDIA H100 (1){16, 17, 18}332308065
\n", + "
" + ], + "text/plain": [ + " Device Strm thread task Pid\n", + "Pid deviceid \n", + "2308061 0 NVIDIA H100 (0) {16, 17, 18, 19} 4 1 2308061\n", + "2308062 2 NVIDIA H100 (2) {16, 17, 18, 19} 4 2 2308062\n", + "2308065 1 NVIDIA H100 (1) {16, 17, 18} 3 3 2308065" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "streams = kernels_df[['Device', 'Strm', 'deviceid', 'Pid']].drop_duplicates()\n", + "streams[\"thread\"] = streams.groupby([\"Pid\", \"Device\"]).cumcount() + 1\n", + "#streams[\"deviceid\"] = streams.sort_values(\"Device\").groupby([\"Device\"]).ngroup()\n", + "#streams[\"Pid\"] = streams[\"deviceid\"].map(tasks_set.set_index(\"device\")[\"Pid\"])\n", + "streams[\"task\"] = streams[\"Pid\"].map(tasks_set.reset_index().set_index(\"Pid\")[\"task\"])\n", + "\n", + "streams['row_name'] = 'CUDA-D'+streams['deviceid'].astype(str) + '.S' + streams['Strm'].astype(str)\n", + "num_streams = streams.count().iloc[0]\n", + "streams.sort_values([\"Pid\", \"thread\"], inplace=True)\n", + "streams.reset_index(inplace=True)\n", + "\n", + "devices_set = streams.groupby([\"Pid\", \"deviceid\"]).agg({'Device': 'first',\n", + " 'Strm': lambda x: set(x),\n", + " 'thread': 'count',\n", + " 'task': 'first',\n", + " 'Pid': 'last'})\n", + "devices_set" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexDeviceStrmdeviceidPidthreadtaskrow_name
00NVIDIA H100 (0)160230806121CUDA-D0.S16
1190NVIDIA H100 (0)170230806131CUDA-D0.S17
2191NVIDIA H100 (0)180230806141CUDA-D0.S18
3204NVIDIA H100 (0)190230806151CUDA-D0.S19
40NVIDIA H100 (2)162230806222CUDA-D2.S16
5190NVIDIA H100 (2)172230806232CUDA-D2.S17
6191NVIDIA H100 (2)182230806242CUDA-D2.S18
7203NVIDIA H100 (2)192230806252CUDA-D2.S19
80NVIDIA H100 (1)161230806523CUDA-D1.S16
9190NVIDIA H100 (1)171230806533CUDA-D1.S17
10191NVIDIA H100 (1)181230806543CUDA-D1.S18
\n", + "
" + ], + "text/plain": [ + " index Device Strm deviceid Pid thread task row_name\n", + "0 0 NVIDIA H100 (0) 16 0 2308061 2 1 CUDA-D0.S16\n", + "1 190 NVIDIA H100 (0) 17 0 2308061 3 1 CUDA-D0.S17\n", + "2 191 NVIDIA H100 (0) 18 0 2308061 4 1 CUDA-D0.S18\n", + "3 204 NVIDIA H100 (0) 19 0 2308061 5 1 CUDA-D0.S19\n", + "4 0 NVIDIA H100 (2) 16 2 2308062 2 2 CUDA-D2.S16\n", + "5 190 NVIDIA H100 (2) 17 2 2308062 3 2 CUDA-D2.S17\n", + "6 191 NVIDIA H100 (2) 18 2 2308062 4 2 CUDA-D2.S18\n", + "7 203 NVIDIA H100 (2) 19 2 2308062 5 2 CUDA-D2.S19\n", + "8 0 NVIDIA H100 (1) 16 1 2308065 2 3 CUDA-D1.S16\n", + "9 190 NVIDIA H100 (1) 17 1 2308065 3 3 CUDA-D1.S17\n", + "10 191 NVIDIA H100 (1) 18 1 2308065 4 3 CUDA-D1.S18" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_normal_threads = tasks_set['thread']\n", + "num_normal_threads_repeated = num_normal_threads.repeat(devices_set[\"thread\"]).reset_index()[[\"thread\"]]\n", + "streams['thread'] = streams['thread'] + num_normal_threads_repeated[\"thread\"]\n", + "streams" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of original kernels_df: (21299, 25). 
Shape of result_df: (21299, 27).\n" + ] + }, + { + "ename": "KeyError", + "evalue": "'thread'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/Documents/BePPP/heka/tooling/env/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'thread'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[19], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShape of original kernels_df: 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkernels_df\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Shape of result_df: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult_df\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Copy the results back to kernels_df\u001b[39;00m\n\u001b[0;32m----> 7\u001b[0m kernels_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mthread\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mresult_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mthread\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mto_numpy()\n\u001b[1;32m 8\u001b[0m kernels_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtask\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m result_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtask\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mto_numpy()\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShape of new kernels_df: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkernels_df\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Documents/BePPP/heka/tooling/env/lib/python3.12/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m~/Documents/BePPP/heka/tooling/env/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'thread'" + ] + } + ], + "source": [ + "filtered_streams = streams.groupby([\"Pid\", \"Strm\"]).agg({'thread':'first', 'task':'first'}).reset_index()\n", + "# Now, merge the filtered streams DataFrame with kernels_df\n", + "result_df = kernels_df.merge(filtered_streams, how='left', on=[\"Pid\", 'Strm'])\n", + "print(f\"Shape of original kernels_df: {kernels_df.shape}. Shape of result_df: {result_df.shape}.\")\n", + "\n", + "# Copy the results back to kernels_df\n", + "kernels_df['thread'] = result_df['thread'].to_numpy()\n", + "kernels_df['task'] = result_df['task'].to_numpy()\n", + "print(f\"Shape of new kernels_df: {kernels_df.shape}.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -987,8 +1469,6 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -997,12 +1477,14 @@ " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1011,22 +1493,70 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1035,8 +1565,6 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -1045,145 +1573,329 @@ " \n", " \n", " \n", + " \n", + " \n", " \n", - " \n", - "
Start (ns)Duration:dur_nsCorrIDGrdXGrdYGrdZBlkXBlkYBlkZReg/Trd...DstMemKdDevicedeviceidPidCtxGreenCtxStrmNamethreadtask
202310256971039227223576NaNNaNNaNNaNNaNNaNNaN...PageableNVIDIA H100 (2)223080621None16[CUDA memcpy Device-to-Host]22
20241025704355111056902359081902.01.01.0128.01.01.032.0...NoneNVIDIA H100 (2)223080621None16mod_solver_conjgrad_imex_245_gpu22
202510258164089511682359581902.01.01.0128.01.01.026.0...NoneNVIDIA H100 (2)223080621None16mod_bc_routines_bc_fix_dirichlet_residual_293_gpu22
20261025822716176823598NaNNaNNaNNaNNaNNaNNaN...NoneNVIDIA H100 (2)223080621None16[CUDA memset]22
2027102582362818975632360165535.01.01.0128.01.01.056.0...NoneNVIDIA H100 (2)223080621None16mod_solver_conjgrad_imex_271_gpu22
20281025913502897632236021.01.01.0256.01.01.016.0...NoneNVIDIA H100 (2)223080621None16mod_time_ops_adapt_dt_cfl_32_gpu__redmod_solver_conjgrad_imex_271_gpu__red22
731911790426664227278409202910259241107227323604NaNNaNNaNNaNNaN...1760560.0DevicePageableNVIDIA H100 (2)2None16[CUDA memcpy Device-to-Host]22
73201179043773723047841020301025928593976823616NaNNaNNaNNaNNaN...1736108.0DevicePageableNoneNVIDIA H100 (2)223080621None16[CUDA memcpy Device-to-Host][CUDA memset]22
2031102592948364127012361965535.01.01.0128.01.01.040.0...NoneNVIDIA H100 (2)223080621None16mod_solver_conjgrad_imex_297_gpu22
20321025970846596064236201.01.01.0256.01.01.016.0...NoneNVIDIA H100 (2)223080621None16mod_solver_conjgrad_imex_297_gpu__red22
732111790448584224078411203310259813009227223622NaNNaNNaNNaNNaN...1785712.0DevicePageableNVIDIA H100 (2)2None16[CUDA memcpy Device-to-Host]22
\n", - "

7322 rows × 23 columns

\n", - "
" - ], - "text/plain": [ - " Start (ns) Duration:dur_ns CorrID GrdX GrdY GrdZ BlkX BlkY \\\n", - "0 1271556323 960 688 NaN NaN NaN NaN NaN \n", - "1 1626687401 896 1285 NaN NaN NaN NaN NaN \n", - "2 1626718377 736 1291 NaN NaN NaN NaN NaN \n", - "3 1626741640 704 1297 NaN NaN NaN NaN NaN \n", - "4 1626968327 736 1327 NaN NaN NaN NaN NaN \n", - "... ... ... ... ... ... ... ... ... \n", - "7317 11788939184 1375193 78406 65535.0 1.0 1.0 32.0 1.0 \n", - "7318 11790315529 99840 78407 3.0 1.0 1.0 256.0 1.0 \n", - "7319 11790426664 2272 78409 NaN NaN NaN NaN NaN \n", - "7320 11790437737 2304 78410 NaN NaN NaN NaN NaN \n", - "7321 11790448584 2240 78411 NaN NaN NaN NaN NaN \n", - "\n", - " BlkZ Reg/Trd ... Throughput:thru_B SrcMemKd DstMemKd \\\n", - "0 NaN NaN ... 33333312.0 Pageable Device \n", - "1 NaN NaN ... 142857088.0 Pageable Device \n", - "2 NaN NaN ... 173912960.0 Pageable Device \n", - "3 NaN NaN ... 181818112.0 Pageable Device \n", - "4 NaN NaN ... 38043460.0 Pageable Device \n", - "... ... ... ... ... ... ... \n", - "7317 1.0 36.0 ... NaN None None \n", - "7318 1.0 18.0 ... NaN None None \n", - "7319 NaN NaN ... 1760560.0 Device Pageable \n", - "7320 NaN NaN ... 1736108.0 Device Pageable \n", - "7321 NaN NaN ... 1785712.0 Device Pageable \n", - "\n", - " Device deviceid Pid Ctx GreenCtx Strm \\\n", - "0 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "1 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "2 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "3 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "4 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "... ... ... ... .. ... ... 
\n", - "7317 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "7318 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "7319 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "7320 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "7321 NVIDIA H100 (2) 2 2308062 1 None 16 \n", - "\n", - " Name \n", - "0 [CUDA memcpy Host-to-Device] \n", - "1 [CUDA memcpy Host-to-Device] \n", - "2 [CUDA memcpy Host-to-Device] \n", - "3 [CUDA memcpy Host-to-Device] \n", - "4 [CUDA memcpy Host-to-Device] \n", - "... ... \n", - "7317 mod_time_ops_adapt_dt_cfl_32_gpu \n", - "7318 mod_time_ops_adapt_dt_cfl_32_gpu__red \n", - "7319 [CUDA memcpy Device-to-Host] \n", - "7320 [CUDA memcpy Device-to-Host] \n", - "7321 [CUDA memcpy Device-to-Host] \n", - "\n", - "[7322 rows x 23 columns]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "kernels_df[2]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
nullStreamIdhwIdvmIdprocessIddeviceIdcontextIdparentContextIdisGreenContext
2034102598408817514842363481902.01.01.0128.01.01.045.0...NoneNVIDIA H100 (2)223080621None16mod_solver_conjgrad_imex_307_gpu22
07020351026060919714927923640245704.01.01.0128.01.01.036.0...NoneNVIDIA H100 (2)2230806212308065None16elem_diffu_full_diffusion_ijk_51_gpu22
203610260759628555842364281902.01.01.0128.01.01.016.0...NoneNVIDIA H100 (2)223080621None16elem_diffu_full_diffusion_ijk_52_gpu22
2037102608296437861560236461246149.01.01.096.01.01.093.0...NoneNVIDIA H100 (2)22308062100None16elem_diffu_full_diffusion_ijk_60_gpu22
2038102687079398192236513329.01.01.0128.01.01.016.0...NoneNVIDIA H100 (2)223080621None16mod_comms_fill_sendbuffer_real_239_gpu22
2039102687273313392236553329.01.01.0128.01.01.016.0...NoneNVIDIA H100 (2)223080621None16mod_comms_fill_sendbuffer_real_246_gpu22
204010268754723467223663NaNNaNNaNNaNNaNNaNNaN...DeviceNVIDIA H100 (2)223080621None17[CUDA memcpy Peer-to-Peer]32
204110268759875332823669NaNNaNNaNNaNNaNNaNNaN...DeviceNVIDIA H100 (2)223080621None19[CUDA memcpy Peer-to-Peer]52
204210268769987556823688NaNNaNNaNNaNNaNNaNNaN...DeviceNVIDIA H100 (2)223080621None18[CUDA memcpy Peer-to-Peer]42
\n", + "

20 rows × 25 columns

\n", "
" ], "text/plain": [ - " nullStreamId hwId vmId processId deviceId contextId parentContextId \\\n", - "0 7 0 1 2308065 1 1 0 \n", + " Start (ns) Duration:dur_ns CorrID GrdX GrdY GrdZ BlkX \\\n", + "2023 10256971039 2272 23576 NaN NaN NaN NaN \n", + "2024 10257043551 1105690 23590 81902.0 1.0 1.0 128.0 \n", + "2025 10258164089 51168 23595 81902.0 1.0 1.0 128.0 \n", + "2026 10258227161 768 23598 NaN NaN NaN NaN \n", + "2027 10258236281 897563 23601 65535.0 1.0 1.0 128.0 \n", + "2028 10259135028 97632 23602 1.0 1.0 1.0 256.0 \n", + "2029 10259241107 2273 23604 NaN NaN NaN NaN \n", + "2030 10259285939 768 23616 NaN NaN NaN NaN \n", + "2031 10259294836 412701 23619 65535.0 1.0 1.0 128.0 \n", + "2032 10259708465 96064 23620 1.0 1.0 1.0 256.0 \n", + "2033 10259813009 2272 23622 NaN NaN NaN NaN \n", + "2034 10259840881 751484 23634 81902.0 1.0 1.0 128.0 \n", + "2035 10260609197 149279 23640 245704.0 1.0 1.0 128.0 \n", + "2036 10260759628 55584 23642 81902.0 1.0 1.0 128.0 \n", + "2037 10260829643 7861560 23646 1246149.0 1.0 1.0 96.0 \n", + "2038 10268707939 8192 23651 3329.0 1.0 1.0 128.0 \n", + "2039 10268727331 3392 23655 3329.0 1.0 1.0 128.0 \n", + "2040 10268754723 4672 23663 NaN NaN NaN NaN \n", + "2041 10268759875 3328 23669 NaN NaN NaN NaN \n", + "2042 10268769987 5568 23688 NaN NaN NaN NaN \n", + "\n", + " BlkY BlkZ Reg/Trd ... DstMemKd Device deviceid Pid \\\n", + "2023 NaN NaN NaN ... Pageable NVIDIA H100 (2) 2 2308062 \n", + "2024 1.0 1.0 32.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2025 1.0 1.0 26.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2026 NaN NaN NaN ... None NVIDIA H100 (2) 2 2308062 \n", + "2027 1.0 1.0 56.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2028 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2029 NaN NaN NaN ... Pageable NVIDIA H100 (2) 2 2308062 \n", + "2030 NaN NaN NaN ... None NVIDIA H100 (2) 2 2308062 \n", + "2031 1.0 1.0 40.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2032 1.0 1.0 16.0 ... 
None NVIDIA H100 (2) 2 2308062 \n", + "2033 NaN NaN NaN ... Pageable NVIDIA H100 (2) 2 2308062 \n", + "2034 1.0 1.0 45.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2035 1.0 1.0 36.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2036 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2037 1.0 1.0 93.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2038 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2039 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2040 NaN NaN NaN ... Device NVIDIA H100 (2) 2 2308062 \n", + "2041 NaN NaN NaN ... Device NVIDIA H100 (2) 2 2308062 \n", + "2042 NaN NaN NaN ... Device NVIDIA H100 (2) 2 2308062 \n", + "\n", + " Ctx GreenCtx Strm Name \\\n", + "2023 1 None 16 [CUDA memcpy Device-to-Host] \n", + "2024 1 None 16 mod_solver_conjgrad_imex_245_gpu \n", + "2025 1 None 16 mod_bc_routines_bc_fix_dirichlet_residual_293_gpu \n", + "2026 1 None 16 [CUDA memset] \n", + "2027 1 None 16 mod_solver_conjgrad_imex_271_gpu \n", + "2028 1 None 16 mod_solver_conjgrad_imex_271_gpu__red \n", + "2029 1 None 16 [CUDA memcpy Device-to-Host] \n", + "2030 1 None 16 [CUDA memset] \n", + "2031 1 None 16 mod_solver_conjgrad_imex_297_gpu \n", + "2032 1 None 16 mod_solver_conjgrad_imex_297_gpu__red \n", + "2033 1 None 16 [CUDA memcpy Device-to-Host] \n", + "2034 1 None 16 mod_solver_conjgrad_imex_307_gpu \n", + "2035 1 None 16 elem_diffu_full_diffusion_ijk_51_gpu \n", + "2036 1 None 16 elem_diffu_full_diffusion_ijk_52_gpu \n", + "2037 1 None 16 elem_diffu_full_diffusion_ijk_60_gpu \n", + "2038 1 None 16 mod_comms_fill_sendbuffer_real_239_gpu \n", + "2039 1 None 16 mod_comms_fill_sendbuffer_real_246_gpu \n", + "2040 1 None 17 [CUDA memcpy Peer-to-Peer] \n", + "2041 1 None 19 [CUDA memcpy Peer-to-Peer] \n", + "2042 1 None 18 [CUDA memcpy Peer-to-Peer] \n", + "\n", + " thread task \n", + "2023 2 2 \n", + "2024 2 2 \n", + "2025 2 2 \n", + "2026 2 2 \n", + "2027 2 2 \n", + "2028 2 2 \n", + "2029 2 2 \n", + "2030 2 2 \n", + "2031 2 2 \n", + "2032 2 2 \n", + 
"2033 2 2 \n", + "2034 2 2 \n", + "2035 2 2 \n", + "2036 2 2 \n", + "2037 2 2 \n", + "2038 2 2 \n", + "2039 2 2 \n", + "2040 3 2 \n", + "2041 5 2 \n", + "2042 4 2 \n", "\n", - " isGreenContext \n", - "0 0 " + "[20 rows x 25 columns]" ] }, - "execution_count": 7, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "context_info = pd.read_sql_table(\"TARGET_INFO_CUDA_CONTEXT_INFO\", f\"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite\")\n", - "context_info" + "kernels_df.iloc[16000:16020]" ] }, { -- GitLab From ac26f72a3ef001bc6e538b45e490b23a467292f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Mon, 7 Oct 2024 19:20:18 +0200 Subject: [PATCH 6/9] Changes for dev prerelease --- nsys2prv/parse_nsys_stats.py | 1 + pyproject.toml | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 80f763b..3fc1386 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -285,6 +285,7 @@ def main(): gpu_metrics_agg = [] if t_metrics: if MULTIREPORT: + raise NotImplementedError("Translating GPU metrics information for multi-report is not yet supported!") for REPORT_FILE_I in REPORTS_LIST: ksi = GPUMetricsSemantic(REPORT_FILE_I) ksi.Setup() diff --git a/pyproject.toml b/pyproject.toml index 5b3f160..d0827c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,13 @@ [tool.poetry] name = "nsys2prv" -version = "0.3.1" +version = "0.4.0-dev20241007" description = "Translate a NVIDIA Nsight System trace to a Paraver trace" authors = ["Marc Clascà "] readme = "README.md" license = "GPL-3.0-only" include = [ - "cfgs" + "cfgs", + "docs" ] repository = "https://pm.bsc.es/gitlab/beppp/nsys2prv" homepage = "https://pm.bsc.es/gitlab/beppp/nsys2prv" -- GitLab From 8f9423ed71871497ad63e9130367b9ba0bf5175d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Mon, 14 Oct 2024 19:13:45 +0200 Subject: [PATCH 7/9] 
Adds trycatch for MPI and solves error for single report --- nsys2prv/parse_nsys_stats.py | 109 ++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 3fc1386..54c95c8 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -57,7 +57,7 @@ def main(): REPORT_DIRS_LIST = [os.path.dirname(x) for x in REPORTS_LIST] REPORT_FILE = REPORTS_LIST[0] # For fast checks, it's best to have a reference report else: - REPORT_FILE = os.path.abspath(args.source_rep.first) + REPORT_FILE = os.path.abspath(args.source_rep[0]) REPORT_DIR = os.path.dirname(REPORT_FILE) trace_name = args.output @@ -228,57 +228,62 @@ def main(): if t_mpi: mpi_df = [] - if MULTIREPORT: - for REPORT_FILE_I in REPORTS_LIST: - kp2pi = MPIP2PSemantic(REPORT_FILE_I) - kp2pi.Setup() - kp2pi.load_data() - - kcolli = MPICollSemantic(REPORT_FILE_I) - kcolli.Setup() - kcolli.load_data() - - kotheri = MPIOtherSemantic(REPORT_FILE_I) - kotheri.Setup() - kotheri.load_data() - - krmai = MPIRMASemantic(REPORT_FILE_I) - krmai.Setup() - krmai.load_data() - - kioi = MPIIOPSemantic(REPORT_FILE_I) - kioi.Setup() - kioi.load_data() - - mpi_df.append(pd.concat([kp2pi.get_df(), kcolli.get_df(), kotheri.get_df(), kotheri.get_df(), krmai.get_df(), kioi.get_df()], ignore_index=True)) - del kp2pi, kcolli, kotheri, krmai, kioi - else: - kmpi = MPIP2PSemantic(REPORT_FILE) - kmpi.Setup() - kmpi.load_data() - mpi_p2p_df = kmpi.get_df() - - kmpi = MPICollSemantic(REPORT_FILE) - kmpi.Setup() - kmpi.load_data() - mpi_coll_df = kmpi.get_df() - - kmpi = MPIOtherSemantic(REPORT_FILE) - kmpi.Setup() - kmpi.load_data() - mpi_other_df = kmpi.get_df() - - kmpi = MPIRMASemantic(REPORT_FILE) - kmpi.Setup() - kmpi.load_data() - mpi_rma_df = kmpi.get_df() - - kmpi = MPIIOPSemantic(REPORT_FILE) - kmpi.Setup() - kmpi.load_data() - mpi_io_df = kmpi.get_df() - mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, 
mpi_rma_df, mpi_io_df], ignore_index=True) - del kmpi, mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df + try: + if MULTIREPORT: + for REPORT_FILE_I in REPORTS_LIST: + kp2pi = MPIP2PSemantic(REPORT_FILE_I) + kp2pi.Setup() + kp2pi.load_data() + + kcolli = MPICollSemantic(REPORT_FILE_I) + kcolli.Setup() + kcolli.load_data() + + kotheri = MPIOtherSemantic(REPORT_FILE_I) + kotheri.Setup() + kotheri.load_data() + + krmai = MPIRMASemantic(REPORT_FILE_I) + krmai.Setup() + krmai.load_data() + + kioi = MPIIOPSemantic(REPORT_FILE_I) + kioi.Setup() + kioi.load_data() + + mpi_df.append(pd.concat([kp2pi.get_df(), kcolli.get_df(), kotheri.get_df(), kotheri.get_df(), krmai.get_df(), kioi.get_df()], ignore_index=True)) + del kp2pi, kcolli, kotheri, krmai, kioi + else: + kmpi = MPIP2PSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_p2p_df = kmpi.get_df() + + kmpi = MPICollSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_coll_df = kmpi.get_df() + + kmpi = MPIOtherSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_other_df = kmpi.get_df() + + kmpi = MPIRMASemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_rma_df = kmpi.get_df() + + kmpi = MPIIOPSemantic(REPORT_FILE) + kmpi.Setup() + kmpi.load_data() + mpi_io_df = kmpi.get_df() + mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df], ignore_index=True) + del kmpi, mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df + except OperationalError as oe: + print("There has been a problem fetching MPI information. MPI data will be skipped.") + print(f"[ERROR]: {oe.args[0]}") + t_mpi = False else: mpi_df = pd.DataFrame() -- GitLab From 18a46502c8d6fcd2aff8a139c6ebeecaa77fb796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Wed, 6 Nov 2024 17:22:26 +0100 Subject: [PATCH 8/9] Adds support for GPU metrics multi report merging when each report contains metrics for random GPU ids not corresponding to self process model. 
This still not works for multi-node multi-report traces, where multiple devices with same id might appear. --- .vscode/launch.json | 32 ++++++++++++++++++ nsys2prv/parse_nsys_stats.py | 38 +++++++++++++--------- nsys2prv/semantics/gpu_metrics_semantic.py | 13 +++++--- 3 files changed, 64 insertions(+), 19 deletions(-) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..95c3117 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,32 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File with Arguments", + "type": "debugpy", + "request": "launch", + "program": "nsys2prv/parse_nsys_stats.py", + "console": "integratedTerminal", + "args": [ + "-t", + "cuda_api_trace,mpi_event_trace,gpu_metrics", + "-m", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_0.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_1.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_2.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_3.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_4.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_5.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_6.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_7.nsys-rep", + "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_ricardo_metrics_4nodes_more" + ], + "env": { + "NSYS_HOME": "/home/mclasca/Apps/nsight-system/2024.5.1" + } + } + ] +} \ No newline at end of file diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index 54c95c8..d96b3ae 100755 --- a/nsys2prv/parse_nsys_stats.py +++ 
b/nsys2prv/parse_nsys_stats.py @@ -11,10 +11,10 @@ import locale from functools import reduce from sqlalchemy import create_engine, text, dialects from sqlalchemy.exc import OperationalError -from .EventWriter import event_writer as ewr -from .NSYSInterface import NSYSInterface -from .semantics.mpi_event_encoding import * -from .semantics import * +from EventWriter import event_writer as ewr +from NSYSInterface import NSYSInterface +from semantics.mpi_event_encoding import * +from semantics import * def main(): locale.setlocale(locale.LC_ALL, '') @@ -288,20 +288,23 @@ def main(): mpi_df = pd.DataFrame() gpu_metrics_agg = [] + metrics_event_names = [] if t_metrics: if MULTIREPORT: - raise NotImplementedError("Translating GPU metrics information for multi-report is not yet supported!") + #raise NotImplementedError("Translating GPU metrics information for multi-report is not yet supported!") for REPORT_FILE_I in REPORTS_LIST: ksi = GPUMetricsSemantic(REPORT_FILE_I) ksi.Setup() ksi.load_data() gpu_metrics_agg.append(ksi.get_df()) + metrics_event_names.append(ksi.get_names()) del ksi else: ks = GPUMetricsSemantic(REPORT_FILE) ks.Setup() ks.load_data() gpu_metrics_agg = ks.get_df() + metrics_event_names = ks.get_names() del ks if t_openacc: @@ -352,7 +355,7 @@ def main(): context_info.sort_values(["processId"], inplace=True) # CONTEXT INFO CHECK FOR MULTIREPORT - if context_info["deviceId"].unique().size == 0: + if context_info["deviceId"].unique().size == 1: print(f"\033[93m Warning! Only one unique device ID can be detected in resource identification. If this is not intended, some features will not be available. Please, make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. 
\033[00m") if t_mpi: @@ -379,7 +382,6 @@ def main(): for i, df in enumerate(kernels_df): df['Start (ns)'] += deltas[i] kernels_df = pd.concat(kernels_df, ignore_index=True) - print(f"After concat: {kernels_df.shape}") if t_apicalls: for i, df in enumerate(cuda_api_df): @@ -419,9 +421,16 @@ def main(): openacc_launch_df = pd.concat(openacc_launch_df, ignore_index=True) openacc_data_df = pd.concat(openacc_data_df, ignore_index=True) - #if t_metrics: - - + if t_metrics: + for i, df in enumerate(gpu_metrics_agg): + if not df.empty: + df['timestamp'] += deltas[i] + # Complement with processId info + #df['Pid'] = list_contexts[i].iloc[0]['processId'] # Assuming one report per GPU device. This needs to be improved. + + df['Pid'] = df['deviceId'].map(context_info.set_index("deviceId")["processId"]) + gpu_metrics_agg = pd.concat(gpu_metrics_agg, ignore_index=True) + metrics_event_names = pd.concat(metrics_event_names, ignore_index=True).drop_duplicates() # MARK: PROCESS MODEL @@ -530,8 +539,7 @@ def main(): devices_set = streams.groupby(["Pid", "deviceid"]).agg({'Device': 'first', 'Strm': lambda x: set(x), 'thread': 'count', - 'task': 'first', - 'Pid': 'last'}) + 'task': 'first'}) print(devices_set) # Here we finally update the threadId we are going to put in the event record of kernel executions to respect the normal threads before CUDA streams @@ -557,12 +565,12 @@ def main(): # Add auxiliary stream to streams dataframe if t_metrics: - aux_streams = devices_set.reset_index()[["deviceid", "Device", "thread", "task"]] + aux_streams = devices_set.reset_index()[["deviceid", "Device", "thread", "task", "Pid"]] aux_streams["Strm"] = 99 aux_streams["row_name"] = "Metrics GPU"+aux_streams["deviceid"].astype(str) - aux_streams["Pid"] = aux_streams["deviceid"].map(tasks_set.set_index('device')["Pid"]) + #aux_streams["Pid"] = aux_streams["deviceid"].map(tasks_set.set_index('device')["Pid"]) aux_streams["thread"] = aux_streams["thread"] + 
aux_streams["deviceid"].map(tasks_set.set_index('device')['thread']) + 1 - gpu_metrics_agg["task"] = gpu_metrics_agg["deviceId"].map(devices_set["task"]) + gpu_metrics_agg["task"] = gpu_metrics_agg["Pid"].map(devices_set.reset_index().set_index("Pid")["task"]) gpu_metrics_agg["thread"] = gpu_metrics_agg["task"].map(aux_streams.set_index('task')["thread"]) streams = pd.concat([streams, aux_streams]).sort_values(['task', 'thread']) diff --git a/nsys2prv/semantics/gpu_metrics_semantic.py b/nsys2prv/semantics/gpu_metrics_semantic.py index 097682a..6776bb4 100644 --- a/nsys2prv/semantics/gpu_metrics_semantic.py +++ b/nsys2prv/semantics/gpu_metrics_semantic.py @@ -1,5 +1,5 @@ from .nsys_event import NsysEvent -from pandas import read_sql_table +from pandas import read_sql_table, DataFrame from sqlalchemy import text event_type_metrics_base = 9400 @@ -7,22 +7,27 @@ event_type_metrics_base = 9400 class GPUMetricsSemantic(NsysEvent): def __init__(self, report) -> None: + self.metrics_event_names = DataFrame() super().__init__(report) def Setup(self): if self.check_table("GPU_METRICS"): self.query = text("SELECT * FROM GPU_METRICS") else: + self._empty = True def _preprocess(self): metrics_description = read_sql_table("TARGET_INFO_GPU_METRICS", self._dbcon) self._df.drop(self._df[self._df["timestamp"] < 0].index, inplace=True) # drop negative time - metrics_event_names = metrics_description.groupby(["metricId"]).agg({'metricName': 'first'}).reset_index() - metrics_event_names["metricId"] = metrics_event_names["metricId"] + event_type_metrics_base + self.metrics_event_names = metrics_description.groupby(["metricId"]).agg({'metricName': 'first'}).reset_index() + self.metrics_event_names["metricId"] = self.metrics_event_names["metricId"] + event_type_metrics_base self._df["deviceId"] = self._df["typeId"].apply(lambda x: x & 0xFF) self._df = self._df.groupby(["timestamp", "typeId"]).agg({'metricId': lambda x: list(x+event_type_metrics_base), 'value': lambda x: list(x), 
'deviceId': 'first'}) self._df.reset_index(inplace=True) - return super()._preprocess() \ No newline at end of file + return super()._preprocess() + + def get_names(self): + return self.metrics_event_names \ No newline at end of file -- GitLab From 99a4c89b4e21193111744053ae9417a8d329fdf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Clasc=C3=A0?= Date: Wed, 6 Nov 2024 21:32:19 +0100 Subject: [PATCH 9/9] Solves GPU metrics merging for multireport and multinode systems. Solves #2 --- nsys2prv/parse_nsys_stats.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py index d96b3ae..48858fe 100755 --- a/nsys2prv/parse_nsys_stats.py +++ b/nsys2prv/parse_nsys_stats.py @@ -11,10 +11,10 @@ import locale from functools import reduce from sqlalchemy import create_engine, text, dialects from sqlalchemy.exc import OperationalError -from EventWriter import event_writer as ewr -from NSYSInterface import NSYSInterface -from semantics.mpi_event_encoding import * -from semantics import * +from .EventWriter import event_writer as ewr +from .NSYSInterface import NSYSInterface +from .semantics.mpi_event_encoding import * +from .semantics import * def main(): locale.setlocale(locale.LC_ALL, '') @@ -159,12 +159,6 @@ def main(): # MARK: IMPORT DATASETS print("Importing datasets...") - # kernels_df = pd.read_csv(build_nsys_stats_name("cuda_gpu_trace")) - # kernels_df.rename(columns={"CorrId": "CorrID"}, inplace=True) - # with engine.connect() as conn, conn.begin(): - # with open(os.path.join(os.path.dirname(__file__), 'scripts/kernels.sql'), 'r') as query: - # kernels_df = pd.read_sql_query(text(query.read()), conn) - kernels_df = [] if MULTIREPORT: sum = 0 @@ -291,7 +285,6 @@ def main(): metrics_event_names = [] if t_metrics: if MULTIREPORT: - #raise NotImplementedError("Translating GPU metrics information for multi-report is not yet supported!") for REPORT_FILE_I in REPORTS_LIST: 
ksi = GPUMetricsSemantic(REPORT_FILE_I) ksi.Setup() @@ -345,9 +338,14 @@ def main(): # MARK: CONTEXT INFO list_contexts = [] + list_hostnames = [] if MULTIREPORT: for REPORT_FILE_I in REPORTS_LIST: context_info_i = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite") + target_system_env_i = pd.read_sql_table("TARGET_INFO_SYSTEM_ENV", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite") + hostname = target_system_env_i.loc[target_system_env_i["name"] == "Hostname"]["value"].iloc[0] + context_info_i["hostname"] = hostname + list_hostnames.append(hostname) list_contexts.append(context_info_i) context_info = pd.concat(list_contexts) else: @@ -355,6 +353,7 @@ def main(): context_info.sort_values(["processId"], inplace=True) # CONTEXT INFO CHECK FOR MULTIREPORT + #if context_info.groupby(["hostname"]).agg({'deviceId': 'count'}) if context_info["deviceId"].unique().size == 1: print(f"\033[93m Warning! Only one unique device ID can be detected in resource identification. If this is not intended, some features will not be available. Please, make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. \033[00m") @@ -425,10 +424,8 @@ def main(): for i, df in enumerate(gpu_metrics_agg): if not df.empty: df['timestamp'] += deltas[i] - # Complement with processId info - #df['Pid'] = list_contexts[i].iloc[0]['processId'] # Assuming one report per GPU device. This needs to be improved. 
- - df['Pid'] = df['deviceId'].map(context_info.set_index("deviceId")["processId"]) + # Complement with processId and node info + df['Pid'] = df['deviceId'].map(context_info[context_info["hostname"] == list_hostnames[i]].set_index("deviceId")["processId"]) gpu_metrics_agg = pd.concat(gpu_metrics_agg, ignore_index=True) metrics_event_names = pd.concat(metrics_event_names, ignore_index=True).drop_duplicates() @@ -569,7 +566,7 @@ def main(): aux_streams["Strm"] = 99 aux_streams["row_name"] = "Metrics GPU"+aux_streams["deviceid"].astype(str) #aux_streams["Pid"] = aux_streams["deviceid"].map(tasks_set.set_index('device')["Pid"]) - aux_streams["thread"] = aux_streams["thread"] + aux_streams["deviceid"].map(tasks_set.set_index('device')['thread']) + 1 + aux_streams["thread"] = aux_streams["thread"] + aux_streams["Pid"].map(tasks_set.set_index('Pid')['thread']) + 1 gpu_metrics_agg["task"] = gpu_metrics_agg["Pid"].map(devices_set.reset_index().set_index("Pid")["task"]) gpu_metrics_agg["thread"] = gpu_metrics_agg["task"].map(aux_streams.set_index('task')["thread"]) streams = pd.concat([streams, aux_streams]).sort_values(['task', 'thread']) -- GitLab