diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000000000000000000000000000000000000..95c3117c7c10c8bf7719b6404e5b50edf5fe09fb
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,32 @@
+{
+ // Use IntelliSense to learn about possible attributes.
+ // Hover to view descriptions of existing attributes.
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+ "version": "0.2.0",
+ "configurations": [
+ {
+ "name": "Python Debugger: Current File with Arguments",
+ "type": "debugpy",
+ "request": "launch",
+ "program": "nsys2prv/parse_nsys_stats.py",
+ "console": "integratedTerminal",
+ "args": [
+ "-t",
+ "cuda_api_trace,mpi_event_trace,gpu_metrics",
+ "-m",
+ "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_0.nsys-rep",
+ "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_1.nsys-rep",
+ "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_2.nsys-rep",
+ "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_3.nsys-rep",
+ "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_4.nsys-rep",
+ "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_5.nsys-rep",
+ "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_6.nsys-rep",
+ "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_7.nsys-rep",
+ "../proves/multi_4nodes_gpumetrics_11finalistep_1maxiter/sod2d_ricardo_metrics_4nodes_more"
+ ],
+ "env": {
+ "NSYS_HOME": "/home/mclasca/Apps/nsight-system/2024.5.1"
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/nsys2prv/NSYSInterface.py b/nsys2prv/NSYSInterface.py
new file mode 100644
index 0000000000000000000000000000000000000000..9034cf7debdc4f0566a65a8f004add68eb9c6425
--- /dev/null
+++ b/nsys2prv/NSYSInterface.py
@@ -0,0 +1,67 @@
+import subprocess
+import os
+
+class NSYSInterface():
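+ """Thin wrapper around the Nsight Systems CLI: it locates the nsys binary (in PATH or under NSYS_HOME), exports .nsys-rep reports to SQLite, and runs the stats recipes whose CSV output the translator reads."""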
+
+ def __init__(self, types, filter_nvtx, range_nvtx, force_sqlite):
+ self.use_path = True
+ self.nsys_binary = ("nsys",)
+
+ if 'NSYS_HOME' in os.environ:
+ self.NSYS_HOME = os.path.abspath(os.getenv('NSYS_HOME'))
+ self.use_path = False
+ self.nsys_binary = (os.path.join(self.NSYS_HOME, "bin/nsys"),)
+
+ self.types = types
+ self.filter = filter_nvtx
+ self.range_nvtx = range_nvtx
+ self.force = force_sqlite
+
+ def check_export_report(self, rf):
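+ # Export the report to SQLite only if the database does not exist yet, or unconditionally when --force-sqlite was requested.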
+ if not os.path.exists(f"{os.path.splitext(os.path.basename(rf))[0]}.sqlite") or self.force:
+ #Try exporting first
+ export_call = self.nsys_binary + ("export", "-t", "sqlite", rf)
+ try:
+ with subprocess.Popen(export_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
+ for line in p.stdout:
+ print(line.decode(), end='')
+
+ if p.returncode != 0:
+ raise ChildProcessError(p.returncode, p.args)
+ except FileNotFoundError:
+ print("You don't have an Nsight Systems installation in your PATH. Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.")
+ exit(1)
+ except ChildProcessError:
+ print("Could not export SQLite database. Exiting.")
+ exit(1)
+
+ def call_stats(self, report):
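+ # Run the selected "nsys stats" recipes for one report, producing one CSV file per recipe with nanosecond timestamps.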
+ nsys_call = self.nsys_binary + ("stats", "-r", ",".join(self.types),
+ "--timeunit", "nsec", "-f", "csv",
+ "--force-overwrite", "true", "-o", ".")
+ if self.filter:
+ nsys_call += ("--filter-nvtx="+self.range_nvtx,)
+
+ nsys_call += (report,)
+
+ try:
+ with subprocess.Popen(nsys_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
+ for line in p.stdout:
+ print(line.decode(), end='')
+
+ if p.returncode != 0:
+ raise ChildProcessError(p.returncode, p.args)
+ except FileNotFoundError:
+ print("You don't have an Nsight Systems installation in your PATH. Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.")
+ exit(1)
+
+ def build_nsys_stats_name(self, rf, rd, report_name):
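+ # Rebuild the CSV file name that "nsys stats" produces for a given report and recipe (adding the nvtx range suffix when filtering is active).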
+ base_name = os.path.splitext(os.path.basename(rf))[0]
+ if self.filter:
+ return os.path.join(rd, base_name+"_{}_nvtx={}.csv".format(report_name, self.range_nvtx))
+ else:
+ return os.path.join(rd, base_name+"_{}.csv".format(report_name))
\ No newline at end of file
diff --git a/nsys2prv/parse_nsys_stats.py b/nsys2prv/parse_nsys_stats.py
index 8a2566008c06d800d145b6de8ee76b67188eabc0..48858feb37289aee14406ae9e14917c704774e04 100755
--- a/nsys2prv/parse_nsys_stats.py
+++ b/nsys2prv/parse_nsys_stats.py
@@ -8,11 +8,13 @@ import time
import subprocess
import os
import locale
+from functools import reduce
from sqlalchemy import create_engine, text, dialects
from sqlalchemy.exc import OperationalError
from .EventWriter import event_writer as ewr
+from .NSYSInterface import NSYSInterface
from .semantics.mpi_event_encoding import *
-
+from .semantics import *
+
def main():
locale.setlocale(locale.LC_ALL, '')
@@ -31,6 +33,7 @@ def main():
parser.add_argument("-v", "--version", nargs=0, help="Show version and exit.", action=ShowVersion)
parser.add_argument("-f", "--filter-nvtx", help="Filter by this NVTX range")
parser.add_argument("-t", "--trace", help="Comma separated names of events to translate: [mpi_event_trace, nvtx_pushpop_trace, nvtx_startend_trace, cuda_api_trace, gpu_metrics, openacc]")
+ parser.add_argument("-m", "--multi-report", action="store_true", help="Translate multiple reports of the same execution into one trace.")
parser.add_argument("--force-sqlite", action="store_true", help="Force Nsight System to export SQLite database")
@@ -39,23 +42,24 @@ def main():
#parser.add_argument("-n", "--nvtx-stack-range", nargs=2, type=int)
- parser.add_argument("source_rep", help="Nsight source report file")
+ parser.add_argument("source_rep", nargs="+", help="Nsight source report file")
parser.add_argument("output", help="Paraver output trace name")
args = parser.parse_args()
# # Trace configuration and setup
-
- use_path = True
-
- if 'NSYS_HOME' in os.environ:
- NSYS_HOME = os.path.abspath(os.getenv('NSYS_HOME'))
- use_path = False
PARAVER_HOME = os.getenv('PARAVER_HOME')
- REPORT_FILE = os.path.abspath(args.source_rep)
- REPORT_DIR = os.path.dirname(REPORT_FILE)
+ MULTIREPORT = args.multi_report
+ if MULTIREPORT:
+ REPORTS_LIST = [os.path.abspath(x) for x in args.source_rep]
+ REPORT_DIRS_LIST = [os.path.dirname(x) for x in REPORTS_LIST]
+ REPORT_FILE = REPORTS_LIST[0] # For fast checks, it's best to have a reference report
+ else:
+ REPORT_FILE = os.path.abspath(args.source_rep[0])
+ REPORT_DIR = os.path.dirname(REPORT_FILE)
+
trace_name = args.output
NVTX_FILTER = args.filter_nvtx != None
@@ -126,37 +130,18 @@ def main():
nvtx_stack_top = 1
nvtx_stack_bottom = 4
+ nsi = NSYSInterface(reports, NVTX_FILTER, NVTX_RANGE, args.force_sqlite)
- def build_nsys_stats_name(report_name):
- base_name = os.path.splitext(os.path.basename(REPORT_FILE))[0]
- if NVTX_FILTER:
- return os.path.join(REPORT_DIR, base_name+"_{}_nvtx={}.csv".format(report_name, NVTX_RANGE))
- else:
- return os.path.join(REPORT_DIR, base_name+"_{}.csv".format(report_name))
-
-
+ if MULTIREPORT:
+ print(f"Multiple reports provided: {REPORTS_LIST}")
print("Extracting reports for: {}".format(reports_og))
- if use_path:
- nsys_binary = ("nsys",)
- else:
- nsys_binary = (os.path.join(NSYS_HOME, "bin/nsys"),)
- if not os.path.exists(f"{os.path.splitext(os.path.basename(REPORT_FILE))[0]}.sqlite"):
- #Try exporting first
- export_call = nsys_binary + ("export", "-t", "sqlite", REPORT_FILE)
- try:
- with subprocess.Popen(export_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
- for line in p.stdout:
- print(line.decode(), end='')
-
- if p.returncode != 0:
- raise ChildProcessError(p.returncode, p.args)
- except FileNotFoundError:
- print("You don't have an Nsight Systems installation in your PATH. Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.")
- exit(1)
- except ChildProcessError:
- print("Could not export SQLite database. Exiting.")
- exit(1)
+ if MULTIREPORT:
+ for REPORT_FILE_I in REPORTS_LIST:
+ print(f"Exporting SQLite databse for {os.path.basename(REPORT_FILE_I)}")
+ nsi.check_export_report(REPORT_FILE_I)
+ else:
+ nsi.check_export_report(REPORT_FILE)
engine = create_engine(f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
metadata = pd.read_sql_table("META_DATA_EXPORT", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
@@ -164,149 +149,297 @@ def main():
if int(minor_version["value"].iloc[0]) > 11:
print(f"\033[93m Warning! The SQLite schema version {int(minor_version["value"].iloc[0])} is greater than the one supported (11). If unexpected behaviour occurs, please report it. \033[00m")
- nsys_call = nsys_binary + ("stats", "-r", ",".join(reports),
- "--timeunit", "nsec", "-f", "csv",
- "--force-overwrite", "true", "-o", ".")
-
- if NVTX_FILTER:
- nsys_call += ("--filter-nvtx="+NVTX_RANGE,)
-
- if args.force_sqlite:
- nsys_call += ("--force-export", "true")
-
- nsys_call += (REPORT_FILE,)
-
- try:
- with subprocess.Popen(nsys_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
- for line in p.stdout:
- print(line.decode(), end='')
-
- if p.returncode != 0:
- raise ChildProcessError(p.returncode, p.args)
- except FileNotFoundError:
- print("You don't have an Nsight Systems installation in your PATH. Please install, Nsight Systems, or locate your installation using PATH or setting NSYS_HOME environment variable.")
- exit(1)
+ if MULTIREPORT:
+ for REPORT_FILE_I in REPORTS_LIST:
+ print(f"Processing stats for {os.path.basename(REPORT_FILE_I)}")
+ nsi.call_stats(REPORT_FILE_I)
+ else:
+ nsi.call_stats(REPORT_FILE)
+ # MARK: IMPORT DATASETS
print("Importing datasets...")
- # kernels_df = pd.read_csv(build_nsys_stats_name("cuda_gpu_trace"))
- # kernels_df.rename(columns={"CorrId": "CorrID"}, inplace=True)
- with engine.connect() as conn, conn.begin():
- with open(os.path.join(os.path.dirname(__file__), 'scripts/kernels.sql'), 'r') as query:
- kernels_df = pd.read_sql_query(text(query.read()), conn)
+ kernels_df = []
+ if MULTIREPORT:
+ sum = 0
+ for REPORT_FILE_I in REPORTS_LIST:
+ ksi = KernelsSemantic(REPORT_FILE_I)
+ ksi.Setup()
+ ksi.load_data()
+ kernels_df.append(ksi.get_df())
+ sum += ksi.get_df().shape[0]
+ del ksi
+ else:
+ ks = KernelsSemantic(REPORT_FILE)
+ ks.Setup()
+ ks.load_data()
+ kernels_df = ks.get_df()
if t_apicalls:
- cuda_api_df = pd.read_csv(build_nsys_stats_name("cuda_api_trace"))
+ cuda_api_df = []
+ if MULTIREPORT:
+ for i, REPORT_FILE_I in enumerate(REPORTS_LIST):
+ cuda_api_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], "cuda_api_trace")))
+ else:
+ cuda_api_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, "cuda_api_trace"))
else:
cuda_api_df = pd.DataFrame()
if t_nvtx:
- nvtx_df = pd.read_csv(build_nsys_stats_name("nvtx_pushpop_trace"))
- nvtx_df["domain"] = nvtx_df["Name"].str.split(":").str[0]
+ nvtx_df = []
+ if MULTIREPORT:
+ for i, REPORT_FILE_I in enumerate(REPORTS_LIST):
+ nvtx_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], "nvtx_pushpop_trace")))
+ nvtx_df[i]["domain"] = nvtx_df[i]["Name"].str.split(":").str[0]
+ nvtx_df[i].rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True)
+
+ else:
+ nvtx_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, "nvtx_pushpop_trace"))
+ nvtx_df["domain"] = nvtx_df["Name"].str.split(":").str[0]
+ nvtx_df.rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True)
+
else:
nvtx_df = pd.DataFrame()
if t_nvtx_startend:
- with engine.connect() as conn, conn.begin():
- with open(os.path.join(os.path.dirname(__file__), 'scripts/nvtx_startend_trace.sql'), 'r') as query:
- nvtx_startend_df = pd.read_sql_query(text(query.read()), conn)
+ nvtx_startend_df = []
+ if MULTIREPORT:
+ for REPORT_FILE_I in REPORTS_LIST:
+ ksi = NVTXStartEndSemantic(REPORT_FILE_I)
+ ksi.Setup()
+ ksi.load_data()
+ nvtx_startend_df.append(ksi.get_df())
+ del ksi
+ else:
+ ks = NVTXStartEndSemantic(REPORT_FILE)
+ ks.Setup()
+ ks.load_data()
+ nvtx_startend_df = ks.get_df()
+ del ks
else:
nvtx_startend_df = pd.DataFrame()
if t_mpi:
- with engine.connect() as conn, conn.begin():
- try:
- with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_p2p.sql'), 'r') as query:
- if conn.dialect.has_table(connection=conn, table_name='MPI_P2P_EVENTS') and conn.dialect.has_table(connection=conn, table_name='MPI_START_WAIT_EVENTS'):
- mpi_p2p_df = pd.read_sql_query(text(query.read()), conn)
- mpi_p2p_df["event_type"] = MPITYPE_PTOP
- else: mpi_p2p_df = pd.DataFrame()
- with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_coll.sql'), 'r') as query:
- if conn.dialect.has_table(connection=conn, table_name='MPI_COLLECTIVES_EVENTS'):
- mpi_coll_df = pd.read_sql_query(text(query.read()), conn)
- mpi_coll_df = mpi_coll_df.drop(mpi_coll_df[mpi_coll_df["Event"].str.contains("File") ].index)
- mpi_coll_df["event_type"] = MPITYPE_COLLECTIVE
- else: mpi_coll_df = pd.DataFrame()
- with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_other.sql'), 'r') as query:
- if conn.dialect.has_table(connection=conn, table_name='MPI_OTHER_EVENTS'):
- mpi_other_df = pd.read_sql_query(text(query.read()), conn)
- mpi_other_df = mpi_other_df.drop(mpi_other_df[mpi_other_df["Event"].str.contains("File") ].index)
- mpi_other_df = mpi_other_df.drop(mpi_other_df[mpi_other_df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate") ].index)
- mpi_other_df["event_type"] = MPITYPE_OTHER
- else: mpi_other_df = pd.DataFrame()
- with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_other.sql'), 'r') as query:
- if conn.dialect.has_table(connection=conn, table_name='MPI_OTHER_EVENTS'):
- mpi_rma_df = pd.read_sql_query(text(query.read()), conn)
- mpi_rma_df = mpi_rma_df[mpi_rma_df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate")]
- mpi_rma_df["event_type"] = MPITYPE_RMA
- else: mpi_rma_df = pd.DataFrame()
- with open(os.path.join(os.path.dirname(__file__), 'scripts/mpi_io.sql'), 'r') as query:
- if conn.dialect.has_table(connection=conn, table_name='MPI_OTHER_EVENTS') and conn.dialect.has_table(connection=conn, table_name='MPI_COLLECTIVES_EVENTS'):
- mpi_io_df = pd.read_sql_query(text(query.read()), conn)
- mpi_io_df = mpi_io_df[mpi_io_df["Event"].str.contains("File")]
- mpi_io_df["event_type"] = MPITYPE_IO
- else: mpi_io_df = pd.DataFrame()
- mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df])
- except OperationalError as oe:
- print("There has been a problem fetching MPI information. MPI data will be skipped.")
- print(f"[ERROR]: {oe.detail}")
- t_mpi = False
- #mpi_df = pd.read_csv(build_nsys_stats_name("mpi_event_trace"))
+ mpi_df = []
+ try:
+ if MULTIREPORT:
+ for REPORT_FILE_I in REPORTS_LIST:
+ kp2pi = MPIP2PSemantic(REPORT_FILE_I)
+ kp2pi.Setup()
+ kp2pi.load_data()
+
+ kcolli = MPICollSemantic(REPORT_FILE_I)
+ kcolli.Setup()
+ kcolli.load_data()
+
+ kotheri = MPIOtherSemantic(REPORT_FILE_I)
+ kotheri.Setup()
+ kotheri.load_data()
+
+ krmai = MPIRMASemantic(REPORT_FILE_I)
+ krmai.Setup()
+ krmai.load_data()
+
+ kioi = MPIIOPSemantic(REPORT_FILE_I)
+ kioi.Setup()
+ kioi.load_data()
+
+ mpi_df.append(pd.concat([kp2pi.get_df(), kcolli.get_df(), kotheri.get_df(), krmai.get_df(), kioi.get_df()], ignore_index=True))
+ del kp2pi, kcolli, kotheri, krmai, kioi
+ else:
+ kmpi = MPIP2PSemantic(REPORT_FILE)
+ kmpi.Setup()
+ kmpi.load_data()
+ mpi_p2p_df = kmpi.get_df()
+
+ kmpi = MPICollSemantic(REPORT_FILE)
+ kmpi.Setup()
+ kmpi.load_data()
+ mpi_coll_df = kmpi.get_df()
+
+ kmpi = MPIOtherSemantic(REPORT_FILE)
+ kmpi.Setup()
+ kmpi.load_data()
+ mpi_other_df = kmpi.get_df()
+
+ kmpi = MPIRMASemantic(REPORT_FILE)
+ kmpi.Setup()
+ kmpi.load_data()
+ mpi_rma_df = kmpi.get_df()
+
+ kmpi = MPIIOPSemantic(REPORT_FILE)
+ kmpi.Setup()
+ kmpi.load_data()
+ mpi_io_df = kmpi.get_df()
+ mpi_df = pd.concat([mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df], ignore_index=True)
+ del kmpi, mpi_p2p_df, mpi_coll_df, mpi_other_df, mpi_rma_df, mpi_io_df
+ except OperationalError as oe:
+ print("There has been a problem fetching MPI information. MPI data will be skipped.")
+ print(f"[ERROR]: {oe.args[0]}")
+ t_mpi = False
else:
- #mpi_df = pd.DataFrame()
- mpi_p2p_df = pd.DataFrame()
- mpi_coll_df = pd.DataFrame()
- mpi_other_df = pd.DataFrame()
- mpi_rma_df = pd.DataFrame()
- mpi_io_df = pd.DataFrame()
-
- # Obtain context Info
- context_info = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
+ mpi_df = pd.DataFrame()
+
+ gpu_metrics_agg = []
+ metrics_event_names = []
+ if t_metrics:
+ if MULTIREPORT:
+ for REPORT_FILE_I in REPORTS_LIST:
+ ksi = GPUMetricsSemantic(REPORT_FILE_I)
+ ksi.Setup()
+ ksi.load_data()
+ gpu_metrics_agg.append(ksi.get_df())
+ metrics_event_names.append(ksi.get_names())
+ del ksi
+ else:
+ ks = GPUMetricsSemantic(REPORT_FILE)
+ ks.Setup()
+ ks.load_data()
+ gpu_metrics_agg = ks.get_df()
+ metrics_event_names = ks.get_names()
+ del ks
+
+ if t_openacc:
+ if MULTIREPORT:
+ openacc_other_df = []
+ openacc_launch_df = []
+ openacc_data_df = []
+ for REPORT_FILE_I in REPORTS_LIST:
+ ksio = OpenACCOtherSemantic(REPORT_FILE_I)
+ ksio.Setup()
+ ksio.load_data()
+ openacc_other_df.append(ksio.get_df())
+ ksil = OpenACCLaunchSemantic(REPORT_FILE_I)
+ ksil.Setup()
+ ksil.load_data()
+ openacc_launch_df.append(ksil.get_df())
+ ksid = OpenACCDataSemantic(REPORT_FILE_I)
+ ksid.Setup()
+ ksid.load_data()
+ openacc_data_df.append(ksid.get_df())
+ del ksio, ksil, ksid
+ else:
+ kso = OpenACCOtherSemantic(REPORT_FILE)
+ kso.Setup()
+ kso.load_data()
+ openacc_other_df = kso.get_df()
+ ksl = OpenACCLaunchSemantic(REPORT_FILE)
+ ksl.Setup()
+ ksl.load_data()
+ openacc_launch_df = ksl.get_df()
+ ksd = OpenACCDataSemantic(REPORT_FILE)
+ ksd.Setup()
+ ksd.load_data()
+ openacc_data_df = ksd.get_df()
+ del kso, ksl, ksd
+ openacc_event_kind = pd.read_sql_table("ENUM_OPENACC_EVENT_KIND", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
+
+
+ # MARK: CONTEXT INFO
+ list_contexts = []
+ list_hostnames = []
+ if MULTIREPORT:
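+ # Each report was captured on a different node: tag its CUDA contexts with that node's hostname (from TARGET_INFO_SYSTEM_ENV) so device ids can later be matched to the right processes.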
+ for REPORT_FILE_I in REPORTS_LIST:
+ context_info_i = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite")
+ target_system_env_i = pd.read_sql_table("TARGET_INFO_SYSTEM_ENV", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite")
+ hostname = target_system_env_i.loc[target_system_env_i["name"] == "Hostname"]["value"].iloc[0]
+ context_info_i["hostname"] = hostname
+ list_hostnames.append(hostname)
+ list_contexts.append(context_info_i)
+ context_info = pd.concat(list_contexts)
+ else:
+ context_info = pd.read_sql_table("TARGET_INFO_CUDA_CONTEXT_INFO", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
+ context_info.sort_values(["processId"], inplace=True)
+
+ # CONTEXT INFO CHECK FOR MULTIREPORT
+ #if context_info.groupby(["hostname"]).agg({'deviceId': 'count'})
+ if context_info["deviceId"].unique().size == 1:
+ print(f"\033[93m Warning! Only one unique device ID can be detected in resource identification. If this is not intended, some features will not be available. Please, make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. \033[00m")
+
if t_mpi:
- mpi_query = "SELECT globalTid / 0x1000000 % 0x1000000 AS Pid, globalTid % 0x1000000 AS Tid, rank FROM MPI_RANKS;"
- with engine.connect() as conn, conn.begin():
- rank_info = pd.read_sql_query(mpi_query, conn)
+ if MULTIREPORT:
+ list_ranks = []
+ for REPORT_FILE_I in REPORTS_LIST:
+ mpi_query = "SELECT globalTid / 0x1000000 % 0x1000000 AS Pid, globalTid % 0x1000000 AS Tid, rank FROM MPI_RANKS;"
+ engine = create_engine(f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite")
+ with engine.connect() as conn, conn.begin():
+ list_ranks.append(pd.read_sql_query(mpi_query, conn))
+ rank_info = pd.concat(list_ranks)
- context_info.sort_values(["processId"], inplace=True)
- if t_metrics:
- gpu_metrics = pd.read_sql_table("GPU_METRICS", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
- metrics_description = pd.read_sql_table("TARGET_INFO_GPU_METRICS", f"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite")
- gpu_metrics.drop(gpu_metrics[gpu_metrics["timestamp"] < 0].index, inplace=True) # drop negative time
- metrics_event_names = metrics_description.groupby(["metricId"]).agg({'metricName': 'first'}).reset_index()
- metrics_event_names["metricId"] = metrics_event_names["metricId"] + event_type_metrics_base
- #gpu_metrics["task"] = gpu_metrics.groupby(["typeId"]).ngroup() + 1
- gpu_metrics["deviceId"] = gpu_metrics["typeId"].apply(lambda x: x & 0xFF)
- gpu_metrics_agg = gpu_metrics.groupby(["timestamp", "typeId"]).agg({'metricId': lambda x: list(x+event_type_metrics_base),
- 'value': lambda x: list(x),
- 'deviceId': 'first'})
- gpu_metrics_agg.reset_index(inplace=True)
+ # MARK: MERGING AND ALIGNING
+ if MULTIREPORT:
+ # Find delta between earliest trace start and the others
+ session_time = []
+ for REPORT_FILE_I in REPORTS_LIST:
+ session_time.append(pd.read_sql_table("TARGET_INFO_SESSION_START_TIME", f"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite"))
+
+ session_time = [x.iloc[0,0] for x in session_time] # Get the utcEpochNs
+ earliest_time = reduce(lambda x, y: min(x, y), session_time, float('inf'))
+ deltas = [start - earliest_time for start in session_time]
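+ # Timestamps in each report are relative to its own session start, so adding (session start - earliest start) expresses every event on a common timeline anchored at the earliest report.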
+ for i, df in enumerate(kernels_df):
+ df['Start (ns)'] += deltas[i]
+ kernels_df = pd.concat(kernels_df, ignore_index=True)
+
+ if t_apicalls:
+ for i, df in enumerate(cuda_api_df):
+ df['Start (ns)'] += deltas[i]
+ cuda_api_df = pd.concat(cuda_api_df, ignore_index=True)
- if t_openacc:
- with engine.connect() as conn, conn.begin():
- with open(os.path.join(os.path.dirname(__file__), 'scripts/openacc_other.sql'), 'r') as query:
- openacc_other_df = pd.read_sql_query(text(query.read()), conn)
- with open(os.path.join(os.path.dirname(__file__), 'scripts/openacc_launch.sql'), 'r') as query:
- openacc_launch_df = pd.read_sql_query(text(query.read()), conn)
- with open(os.path.join(os.path.dirname(__file__), 'scripts/openacc_data.sql'), 'r') as query:
- openacc_data_df = pd.read_sql_query(text(query.read()), conn)
- openacc_event_kind = pd.read_sql_table("ENUM_OPENACC_EVENT_KIND", conn)
+ if t_nvtx:
+ for i, df in enumerate(nvtx_df):
+ df['Start (ns)'] += deltas[i]
+ df['End (ns)'] += deltas[i]
+ nvtx_df = pd.concat(nvtx_df, ignore_index=True)
+
+ if t_nvtx_startend:
+ for i, df in enumerate(nvtx_startend_df):
+ df['Start (ns)'] += deltas[i]
+ df['End (ns)'] += deltas[i]
+ nvtx_startend_df = pd.concat(nvtx_startend_df, ignore_index=True)
+ if t_mpi:
+ for i, df in enumerate(mpi_df):
+ df['Start:ts_ns'] += deltas[i]
+ df['End:ts_ns'] += deltas[i]
+ mpi_df = pd.concat(mpi_df, ignore_index=True)
+
+ if t_openacc:
+ for i, df in enumerate(openacc_other_df):
+ df['start'] += deltas[i]
+ df['end'] += deltas[i]
+ for i, df in enumerate(openacc_launch_df):
+ df['start'] += deltas[i]
+ df['end'] += deltas[i]
+ for i, df in enumerate(openacc_data_df):
+ df['start'] += deltas[i]
+ df['end'] += deltas[i]
+ openacc_other_df = pd.concat(openacc_other_df, ignore_index=True)
+ openacc_launch_df = pd.concat(openacc_launch_df, ignore_index=True)
+ openacc_data_df = pd.concat(openacc_data_df, ignore_index=True)
+
+ if t_metrics:
+ for i, df in enumerate(gpu_metrics_agg):
+ if not df.empty:
+ df['timestamp'] += deltas[i]
+ # Complement with processId and node info
+ df['Pid'] = df['deviceId'].map(context_info[context_info["hostname"] == list_hostnames[i]].set_index("deviceId")["processId"])
+ gpu_metrics_agg = pd.concat(gpu_metrics_agg, ignore_index=True)
+ metrics_event_names = pd.concat(metrics_event_names, ignore_index=True).drop_duplicates()
- # # Building object model
+ # MARK: PROCESS MODEL
# ## Tasks and threads
# Now, find unique appearences of ThreadID and ProcessID
if t_apicalls: print("CUDA calls unique processes: {}, and unique threads: {}".format(cuda_api_df["Pid"].unique(), cuda_api_df["Tid"].unique()))
- if t_nvtx: print("NVTX ranges unique processes: {}, and unique threads: {}".format(nvtx_df["PID"].unique(), nvtx_df["TID"].unique()))
+ if t_nvtx: print("NVTX ranges unique processes: {}, and unique threads: {}".format(nvtx_df["Pid"].unique(), nvtx_df["Tid"].unique()))
if t_nvtx_startend: print("NVTX startend unique processes: {}, and unique threads: {}".format(nvtx_startend_df["Pid"].unique(), nvtx_startend_df["Tid"].unique()))
if t_mpi: print("MPI calls unique processes: {}, and unique threads: {}".format(mpi_df["Pid"].unique(), mpi_df["Tid"].unique()))
if t_openacc: print("OpenACC calls unique processes: {}, and unique threads: {}".format(openacc_other_df["Pid"].unique(), openacc_other_df["Tid"].unique()))
- if t_nvtx: nvtx_df.rename(columns={"PID":"Pid", "TID":"Tid"}, inplace=True)
compute_threads_with = []
if t_apicalls: compute_threads_with.append(cuda_api_df[['Pid', 'Tid']])
@@ -332,6 +465,7 @@ def main():
'Tid': lambda x: set(x),
'thread': 'count',
'device': 'first' })
+ print(tasks_set)
cuda_api_df["thread"] = 0
cuda_api_df["task"] = 0
@@ -389,50 +523,51 @@ def main():
streams = kernels_df[['Device', 'Strm', 'deviceid', 'Pid']].drop_duplicates()
- streams["thread"] = streams.groupby(["Device"]).cumcount() + 1
+ streams["thread"] = streams.groupby(["Pid", "Device"]).cumcount() + 1
#streams["deviceid"] = streams.sort_values("Device").groupby(["Device"]).ngroup()
#streams["Pid"] = streams["deviceid"].map(tasks_set.set_index("device")["Pid"])
- streams["task"] = streams["deviceid"].map(tasks_set.reset_index().set_index("device")["task"])
+ streams["task"] = streams["Pid"].map(tasks_set.reset_index().set_index("Pid")["task"])
streams['row_name'] = 'CUDA-D'+streams['deviceid'].astype(str) + '.S' + streams['Strm'].astype(str)
num_streams = streams.count().iloc[0]
streams.sort_values(["Pid", "thread"], inplace=True)
streams.reset_index(inplace=True)
- devices_set = streams.groupby(["deviceid"]).agg({'Device': 'first',
+ devices_set = streams.groupby(["Pid", "deviceid"]).agg({'Device': 'first',
'Strm': lambda x: set(x),
'thread': 'count',
- 'task': 'first',
- 'Pid': 'last'})
+ 'task': 'first'})
+ print(devices_set)
# Here we finally update the threadId we are going to put in the event record of kernel executions to respect the normal threads before CUDA streams
num_normal_threads = tasks_set['thread']
num_normal_threads_repeated = num_normal_threads.repeat(devices_set["thread"]).reset_index()[["thread"]]
-
streams['thread'] = streams['thread'] + num_normal_threads_repeated["thread"]
+
# for index,row in kernels_df.iterrows():
# kernels_df.at[index, "thread"] = streams.at[(streams["Strm"] == row["Strm"]).idxmax(), "thread"]
# kernels_df.at[index, "deviceid"] = streams.at[(streams["Device"] == row["Device"]).idxmax(), "deviceid"]
# More efficient way by chatgpt
# First, let's filter streams DataFrame based on conditions
- filtered_streams = streams.groupby(["Device", "Strm"]).agg({'thread':'first', 'task':'first'}).reset_index()
+ filtered_streams = streams.groupby(["Pid", "Strm"]).agg({'thread':'first', 'task':'first'}).reset_index()
# Now, merge the filtered streams DataFrame with kernels_df
- result_df = kernels_df.merge(filtered_streams, how='left', on=['Device', 'Strm'])
+ result_df = kernels_df.merge(filtered_streams, how='left', on=["Pid", 'Strm'])
+
# Copy the results back to kernels_df
- kernels_df['thread'] = result_df['thread']
- kernels_df['task'] = result_df['task']
+ kernels_df['thread'] = result_df['thread'].to_numpy()
+ kernels_df['task'] = result_df['task'].to_numpy()
# Add auxiliary stream to streams dataframe
if t_metrics:
- aux_streams = devices_set.reset_index()[["deviceid", "Device", "thread", "task"]]
+ aux_streams = devices_set.reset_index()[["deviceid", "Device", "thread", "task", "Pid"]]
aux_streams["Strm"] = 99
aux_streams["row_name"] = "Metrics GPU"+aux_streams["deviceid"].astype(str)
- aux_streams["Pid"] = aux_streams["deviceid"].map(tasks_set.set_index('device')["Pid"])
- aux_streams["thread"] = aux_streams["thread"] + aux_streams["deviceid"].map(tasks_set.set_index('device')['thread']) + 1
- gpu_metrics_agg["task"] = gpu_metrics_agg["deviceId"].map(devices_set["task"])
+ #aux_streams["Pid"] = aux_streams["deviceid"].map(tasks_set.set_index('device')["Pid"])
+ aux_streams["thread"] = aux_streams["thread"] + aux_streams["Pid"].map(tasks_set.set_index('Pid')['thread']) + 1
+ gpu_metrics_agg["task"] = gpu_metrics_agg["Pid"].map(devices_set.reset_index().set_index("Pid")["task"])
gpu_metrics_agg["thread"] = gpu_metrics_agg["task"].map(aux_streams.set_index('task')["thread"])
streams = pd.concat([streams, aux_streams]).sort_values(['task', 'thread'])
@@ -441,9 +576,6 @@ def main():
# ## Writing ROW file
# Now we can write the _row_ file with this information
- print(tasks_set)
- print(devices_set)
-
print(" -Writing resource model to row file...")
row_df = pd.concat([threads[["thread", "task", "row_name"]], streams[["thread", "task", "row_name"]]])
@@ -460,7 +592,7 @@ def main():
row_file.write("\n")
- # # Collecting event values
+ # MARK: EVENT NAMES
# Second step is collect all different event values for CUDA API calls, kernel names, and NVTX ranges. Each of these define a different event type, and will need unique identifiers to be used as a event values. Finally these needs to be dumped to the PCF file.
print("Collecting event names and information...")
@@ -490,7 +622,7 @@ def main():
kernel_names["Name"] = kernel_names["Name"].apply(lambda x: x.replace("[", "").replace("]", ""))
if t_nvtx:
- nvtx_df_subset = nvtx_df
+ nvtx_df_subset = nvtx_df.reset_index()
lower_level = max(nvtx_df["Lvl"])
if nvtx_select_frames:
@@ -791,9 +923,9 @@ GRADIENT_NAMES
pcf_file.write("{} {}\n".format(row["func_value"], row["func"]))
pcf_file.write("\n")
+ # MARK: MEMORY
# # Split of kernel execution between compute and memory
-
memops_names = ["[CUDA memcpy Device-to-Device]", "[CUDA memcpy Device-to-Host]", "[CUDA memcpy Host-to-Device]", "[CUDA memset]", "[CUDA memcpy Peer-to-Peer]"]
memops_df = kernels_df.loc[kernels_df["Name"].isin(memops_names)]
mask = ~kernels_df.index.isin(memops_df.index)
@@ -805,6 +937,7 @@ GRADIENT_NAMES
comm_memory_df = cuda_api_df.merge(memops_df, how="inner", left_on=["CorrID", "task"], right_on=["CorrID", "task"], suffixes=("_call", "_mem"), validate="one_to_one")
+ # MARK: TIMELINE RECONS
# # Timeline reconstruction
print("Reconstructing timeline...")
@@ -924,6 +1057,8 @@ GRADIENT_NAMES
print(f"Congratulations! Trace {trace_name}.prv correctly translated.")
+
+ # MARK: POSTPROCESSING
# ## Postprocessing
# - Reorder trace
# - GZip trace
diff --git a/nsys2prv/semantics/__init__.py b/nsys2prv/semantics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..19762807643a1502861a372168ee81c2c5a4bb3d
--- /dev/null
+++ b/nsys2prv/semantics/__init__.py
@@ -0,0 +1,5 @@
+from .kernels_semantic import KernelsSemantic
+from .mpi_semantic import *
+from .nvtx_startend_semantic import NVTXStartEndSemantic
+from .gpu_metrics_semantic import GPUMetricsSemantic
+from .openacc_semantic import *
\ No newline at end of file
diff --git a/nsys2prv/semantics/gpu_metrics_semantic.py b/nsys2prv/semantics/gpu_metrics_semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..6776bb446e1d3947ad0538c7c65e5e43c96c40a0
--- /dev/null
+++ b/nsys2prv/semantics/gpu_metrics_semantic.py
@@ -0,0 +1,33 @@
+from .nsys_event import NsysEvent
+from pandas import read_sql_table, DataFrame
+from sqlalchemy import text
+
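+# Offset added to Nsight metricId values to build the event types used for GPU metrics in the output trace.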
+event_type_metrics_base = 9400
+
+
+class GPUMetricsSemantic(NsysEvent):
+ def __init__(self, report) -> None:
+ self.metrics_event_names = DataFrame()
+ super().__init__(report)
+
+ def Setup(self):
+ if self.check_table("GPU_METRICS"):
+ self.query = text("SELECT * FROM GPU_METRICS")
+ else:
+ self._empty = True
+
+ def _preprocess(self):
+ metrics_description = read_sql_table("TARGET_INFO_GPU_METRICS", self._dbcon)
+ self._df.drop(self._df[self._df["timestamp"] < 0].index, inplace=True) # drop negative time
+ self.metrics_event_names = metrics_description.groupby(["metricId"]).agg({'metricName': 'first'}).reset_index()
+ self.metrics_event_names["metricId"] = self.metrics_event_names["metricId"] + event_type_metrics_base
+ self._df["deviceId"] = self._df["typeId"].apply(lambda x: x & 0xFF)
+ self._df = self._df.groupby(["timestamp", "typeId"]).agg({'metricId': lambda x: list(x+event_type_metrics_base),
+ 'value': lambda x: list(x),
+ 'deviceId': 'first'})
+ self._df.reset_index(inplace=True)
+ return super()._preprocess()
+
+ def get_names(self):
+ return self.metrics_event_names
\ No newline at end of file
diff --git a/nsys2prv/semantics/kernels_semantic.py b/nsys2prv/semantics/kernels_semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..804128e614b22fb92dc08a23edab57aa0005d505
--- /dev/null
+++ b/nsys2prv/semantics/kernels_semantic.py
@@ -0,0 +1,12 @@
+from .nsys_event import NsysEvent
+import os.path
+from sqlalchemy import text
+
+class KernelsSemantic(NsysEvent):
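+ # Loads the CUDA kernel and memory-operation records using the bundled scripts/kernels.sql query.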
+ def __init__(self, report) -> None:
+ super().__init__(report)
+
+ def Setup(self):
+ with open(os.path.join(os.path.dirname(__file__), '../scripts/kernels.sql'), 'r') as query:
+ self.query = text(query.read())
+
\ No newline at end of file
diff --git a/nsys2prv/semantics/mpi_semantic.py b/nsys2prv/semantics/mpi_semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4763766e1772da3443b5d4ce32441d0bcc38a33
--- /dev/null
+++ b/nsys2prv/semantics/mpi_semantic.py
@@ -0,0 +1,78 @@
+from .nsys_event import NsysEvent
+import os.path
+from .mpi_event_encoding import *
+from sqlalchemy import text
+
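+# Each class below loads one family of MPI events (point-to-point, collectives, other, RMA, I/O) from the exported SQLite database and tags the rows with the matching MPITYPE_* event type.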
+class MPIP2PSemantic(NsysEvent):
+ def __init__(self, report) -> None:
+ super().__init__(report)
+
+ def Setup(self):
+ if self.check_table('MPI_P2P_EVENTS') and self.check_table('MPI_START_WAIT_EVENTS'):
+ with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_p2p.sql'), 'r') as query:
+ self.query = text(query.read())
+ else:
+ self._empty = True
+
+ def _preprocess(self):
+ self._df["event_type"] = MPITYPE_PTOP
+ return super()._preprocess()
+
+class MPICollSemantic(NsysEvent):
+ def __init__(self, report) -> None:
+ super().__init__(report)
+
+ def Setup(self):
+ if self.check_table("MPI_COLLECTIVES_EVENTS"):
+ with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_coll.sql'), 'r') as query:
+ self.query = text(query.read())
+ else:
+ self._empty = True
+
+ def _preprocess(self):
+ self._df = self._df.drop(self._df[self._df["Event"].str.contains("File") ].index)
+ self._df["event_type"] = MPITYPE_COLLECTIVE
+
+class MPIOtherSemantic(NsysEvent):
+ def __init__(self, report) -> None:
+ super().__init__(report)
+
+ def Setup(self):
+ if self.check_table("MPI_OTHER_EVENTS"):
+ with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_other.sql'), 'r') as query:
+ self.query = text(query.read())
+ else:
+ self._empty = True
+
+ def _preprocess(self):
+ self._df = self._df.drop(self._df[self._df["Event"].str.contains("File") ].index)
+ self._df = self._df.drop(self._df[self._df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate") ].index)
+ self._df["event_type"] = MPITYPE_OTHER
+
+class MPIRMASemantic(NsysEvent):
+ def __init__(self, report) -> None:
+ super().__init__(report)
+
+ def Setup(self):
+ if self.check_table("MPI_OTHER_EVENTS"):
+ with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_other.sql'), 'r') as query:
+ self.query = text(query.read())
+ else:
+ self._empty = True
+
+ def _preprocess(self):
+ self._df = self._df[self._df["Event"].str.contains("Win|MPI_Get|MPI_Put|Accumulate")]
+ self._df["event_type"] = MPITYPE_RMA
+
+class MPIIOPSemantic(NsysEvent):
+ def __init__(self, report) -> None:
+ super().__init__(report)
+
+ def Setup(self):
+ if self.check_table("MPI_OTHER_EVENTS") and self.check_table("MPI_COLLECTIVES_EVENTS"):
+ with open(os.path.join(os.path.dirname(__file__), '../scripts/mpi_io.sql'), 'r') as query:
+ self.query = text(query.read())
+ else:
+ self._empty = True
+
+ def _preprocess(self):
+ self._df = self._df[self._df["Event"].str.contains("File")]
+ self._df["event_type"] = MPITYPE_IO
\ No newline at end of file
diff --git a/nsys2prv/semantics/nsys_event.py b/nsys2prv/semantics/nsys_event.py
new file mode 100644
index 0000000000000000000000000000000000000000..9813a14ef3f7785712ad84013128e02481cf88b9
--- /dev/null
+++ b/nsys2prv/semantics/nsys_event.py
@@ -0,0 +1,68 @@
+from sqlalchemy import create_engine, exc, inspect
+import pandas as pd
+import os.path
+
+class NsysEvent:
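+ """Base class for one dataset read from an exported Nsight Systems SQLite database: subclasses set self.query in Setup() and may reshape the loaded DataFrame in _preprocess()."""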
+
+ class MissingDatabaseFile(Exception):
+ def __init__(self, filename):
+ super().__init__(f'Database file {filename} does not exist.')
+
+ class InvalidDatabaseFile(Exception):
+ def __init__(self, filename):
+ super().__init__(f'Database file {filename} could not be opened and appears to be invalid.')
+
+ class InvalidSQL(Exception):
+ def __init__(self, sql):
+ super().__init__(f'Bad SQL statement: {sql}')
+
+ query = "SELECT 1 AS 'ONE'"
+
+ def __init__(self, report) -> None:
+ self._dbcon = None
+ self._dbfile = f"{os.path.splitext(report)[0]}.sqlite"
+ self._df = pd.DataFrame()
+ self._empty = False
+
+ if not os.path.exists(self._dbfile):
+ raise self.MissingDatabaseFile(self._dbfile)
+
+ try:
+ self._dbcon = create_engine(f"sqlite:///{self._dbfile}")
+ except exc.SQLAlchemyError:
+ self._dbcon = None
+ raise self.InvalidDatabaseFile(self._dbfile)
+
+ def check_table(self, table_name):
+ insp = inspect(self._dbcon)
+ return insp.has_table(table_name)
+
+ def Setup(self):
+ pass
+
+ def _preprocess(self):
+ pass
+
+ def postprocess(self):
+ pass
+
+ def load_data(self):
+ if not self._empty:
+ try:
+ self._df = pd.read_sql_query(self.query, self._dbcon)
+ except pd.errors.DatabaseError:
+ raise self.InvalidSQL(self.query)
+ self._preprocess()
+
+ def apply_process_model(self, threads=pd.DataFrame, streams=pd.DataFrame):
+ self.df["thread"] = self.df["Tid"].map(threads.set_index('Tid')["thread"])
+ self.df["task"] = self.df["Tid"].map(threads.set_index('Tid')["task"])
+ if 'Rank' in threads.columns:
+ self.df["Rank"] = self.df["Tid"].map(threads.set_index('Tid')["Rank"])
+ pass
+
+ def get_threads(self):
+ return self._df[['Pid', 'Tid']].drop_duplicates()
+
+ def get_df(self):
+ return self._df.copy()
\ No newline at end of file
diff --git a/nsys2prv/semantics/nvtx_startend_semantic.py b/nsys2prv/semantics/nvtx_startend_semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..88241b72f2e762462e9117123a3af103ce33bab2
--- /dev/null
+++ b/nsys2prv/semantics/nvtx_startend_semantic.py
@@ -0,0 +1,11 @@
+from .nsys_event import NsysEvent
+import os.path
+from sqlalchemy import text
+
+class NVTXStartEndSemantic(NsysEvent):
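+ # Loads NVTX start/end range events using the bundled scripts/nvtx_startend_trace.sql query.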
+ def __init__(self, report) -> None:
+ super().__init__(report)
+
+ def Setup(self):
+ with open(os.path.join(os.path.dirname(__file__), '../scripts/nvtx_startend_trace.sql'), 'r') as query:
+ self.query = text(query.read())
\ No newline at end of file
diff --git a/nsys2prv/semantics/openacc_semantic.py b/nsys2prv/semantics/openacc_semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..105971928ecc9b0f15511d48909b433692ad8989
--- /dev/null
+++ b/nsys2prv/semantics/openacc_semantic.py
@@ -0,0 +1,27 @@
+from .nsys_event import NsysEvent
+import os.path
+from sqlalchemy import text
+
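+# Each class loads one OpenACC activity table (other, launch, and data events) through its bundled SQL script.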
+class OpenACCOtherSemantic(NsysEvent):
+ def __init__(self, report) -> None:
+ super().__init__(report)
+
+ def Setup(self):
+ with open(os.path.join(os.path.dirname(__file__), '../scripts/openacc_other.sql'), 'r') as query:
+ self.query = text(query.read())
+
+class OpenACCLaunchSemantic(NsysEvent):
+ def __init__(self, report) -> None:
+ super().__init__(report)
+
+ def Setup(self):
+ with open(os.path.join(os.path.dirname(__file__), '../scripts/openacc_launch.sql'), 'r') as query:
+ self.query = text(query.read())
+
+class OpenACCDataSemantic(NsysEvent):
+ def __init__(self, report) -> None:
+ super().__init__(report)
+
+ def Setup(self):
+ with open(os.path.join(os.path.dirname(__file__), '../scripts/openacc_data.sql'), 'r') as query:
+ self.query = text(query.read())
\ No newline at end of file
diff --git a/parser-playground.ipynb b/parser-playground.ipynb
index 10a24122f26de9d7757f71c6283f6d662f5ba30c..c4b2ad5e8356d7be3d919434d5587b3f233925d1 100644
--- a/parser-playground.ipynb
+++ b/parser-playground.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -14,6 +14,10 @@
"import locale\n",
"import sqlite3\n",
"from sqlalchemy import create_engine, text\n",
+ "from nsys2prv.EventWriter import event_writer as ewr\n",
+ "from nsys2prv.NSYSInterface import NSYSInterface\n",
+ "from nsys2prv.semantics.mpi_event_encoding import *\n",
+ "from nsys2prv.semantics import *\n",
"\n",
"NSYS_HOME = os.path.abspath(\"/home/mclasca/Apps/nsight-system/2024.5.1/\")\n",
"#NSIGHT_HOME = os.getenv('NSIGHT_HOME')\n",
@@ -27,6 +31,12 @@
"#REPORT_NAME=\"heka-step53+accum1-profile-2023.4-5721957\"\n",
"#REPORT_NAME=\"heka-axolotl-Mistral7B0.1-profile-2110598\"\n",
"\n",
+ "MULTIREPORT = True\n",
+ "if MULTIREPORT:\n",
+ " REPORTS_LIST = [os.path.abspath(x) for x in [\"/home/mclasca/Documents/BePPP/heka/proves/multi_2nodes/sod2d_0.nsys-rep\", \"/home/mclasca/Documents/BePPP/heka/proves/multi_2nodes/sod2d_1.nsys-rep\", \"/home/mclasca/Documents/BePPP/heka/proves/multi_2nodes/sod2d_2.nsys-rep\"]]\n",
+ " REPORT_DIRS_LIST = [os.path.dirname(x) for x in REPORTS_LIST]\n",
+ " REPORT_FILE = REPORTS_LIST[0] # For fast checks, it's best to have a reference report\n",
+ "\n",
"locale.setlocale(locale.LC_ALL, '')\n",
"\n",
"trace_name = \"test-heka\"\n",
@@ -69,6 +79,7 @@
"nvtx_stack_bottom = 4\n",
"\n",
"reports = [\"cuda_api_trace\", \"cuda_gpu_trace\"]\n",
+ "nsi = NSYSInterface(reports, False, NVTX_RANGE, False)\n",
"\n",
"def build_nsys_stats_name(report_name):\n",
" base_name = os.path.splitext(os.path.basename(REPORT_FILE))[0]\n",
@@ -686,7 +697,125 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list_contexts = []\n",
+ "if MULTIREPORT:\n",
+ " for REPORT_FILE_I in REPORTS_LIST:\n",
+ " context_info_i = pd.read_sql_table(\"TARGET_INFO_CUDA_CONTEXT_INFO\", f\"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite\")\n",
+ " list_contexts.append(context_info_i)\n",
+ " context_info = pd.concat(list_contexts)\n",
+ "else:\n",
+ " context_info = pd.read_sql_table(\"TARGET_INFO_CUDA_CONTEXT_INFO\", f\"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite\")\n",
+ "context_info.sort_values([\"processId\"], inplace=True)\n",
+ "\n",
+ "# CONTEXT INFO CHECK FOR MULTIREPORT\n",
+ "if context_info[\"deviceId\"].unique().size == 0:\n",
+ " print(f\"\\033[93m Warning! Only one unique device ID can be detected in resource identification. If this is not intended, some features will not be available. Please, make sure that the GPU bindings are correctly done and that every process identifies its own GPU with a unique device [0 .. N-1]. \\033[00m\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t_apicalls = True\n",
+ "cuda_api_df = []\n",
+ "if MULTIREPORT:\n",
+ " for i, REPORT_FILE_I in enumerate(REPORTS_LIST):\n",
+ " cuda_api_df.append(pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE_I, REPORT_DIRS_LIST[i], \"cuda_api_trace\")))\n",
+ "else:\n",
+ " cuda_api_df = pd.read_csv(nsi.build_nsys_stats_name(REPORT_FILE, REPORT_DIR, \"cuda_api_trace\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "kernels_df = []\n",
+ "if MULTIREPORT:\n",
+ " sum = 0\n",
+ " for REPORT_FILE_I in REPORTS_LIST:\n",
+ " ksi = KernelsSemantic(REPORT_FILE_I)\n",
+ " ksi.Setup()\n",
+ " ksi.load_data()\n",
+ " kernels_df.append(ksi.get_df())\n",
+ " sum += ksi.get_df().shape[0]\n",
+ " del ksi\n",
+ "else:\n",
+ " ks = KernelsSemantic(REPORT_FILE)\n",
+ " ks.Setup()\n",
+ " ks.load_data()\n",
+ " kernels_df = ks.get_df()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "After concat: (21299, 23)\n"
+ ]
+ }
+ ],
+ "source": [
+ "from functools import reduce\n",
+ "# MARK: MERGING AND ALIGNING\n",
+ "if MULTIREPORT:\n",
+ " # Find delta between earliest trace start and the others\n",
+ " session_time = []\n",
+ " for REPORT_FILE_I in REPORTS_LIST:\n",
+ " session_time.append(pd.read_sql_table(\"TARGET_INFO_SESSION_START_TIME\", f\"sqlite:///{os.path.splitext(REPORT_FILE_I)[0]}.sqlite\"))\n",
+ " \n",
+ " session_time = [x.iloc[0,0] for x in session_time] # Get the utcEpochNs\n",
+ " earliest_time = reduce(lambda x, y: min(x, y), session_time, float('inf'))\n",
+ " deltas = [start - earliest_time for start in session_time]\n",
+ " for i, df in enumerate(kernels_df):\n",
+ " df['Start (ns)'] += deltas[i]\n",
+ " kernels_df = pd.concat(kernels_df)\n",
+ " print(f\"After concat: {kernels_df.shape}\")\n",
+ "\n",
+ " if t_apicalls:\n",
+ " for i, df in enumerate(cuda_api_df):\n",
+ " df['Start (ns)'] += deltas[i]\n",
+ "\n",
+ " cuda_api_df = pd.concat(cuda_api_df)\n",
+ "\n",
+ " # if t_nvtx:\n",
+ " # for i, df in enumerate(nvtx_df):\n",
+ " # df['Start (ns)'] += deltas[i]\n",
+ " # df['End (ns)'] += deltas[i]\n",
+ " # nvtx_df = pd.concat(nvtx_df)\n",
+ "\n",
+ " # if t_nvtx_startend:\n",
+ " # for i, df in enumerate(nvtx_startend_df):\n",
+ " # df['Start (ns)'] += deltas[i]\n",
+ " # df['End (ns)'] += deltas[i]\n",
+ " # nvtx_startend_df = pd.concat(nvtx_startend_df)\n",
+ "\n",
+ " # if t_mpi:\n",
+ " # for i, df in enumerate(mpi_df):\n",
+ " # df['Start:ts_ns'] += deltas[i]\n",
+ " # df['End:ts_ns'] += deltas[i]\n",
+ " # mpi_df = pd.concat(mpi_df)\n",
+ " \n",
+ " #if t_metrics:\n",
+ "\n",
+ " #if t_openacc:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
@@ -710,369 +839,432 @@
" \n",
" \n",
" \n",
- " \n",
- " \n",
- " Start (ns) \n",
- " Duration (ns) \n",
- " CorrID \n",
- " GrdX \n",
- " GrdY \n",
- " GrdZ \n",
- " BlkX \n",
- " BlkY \n",
- " BlkZ \n",
- " Reg/Trd \n",
- " ... \n",
- " DymSMem (MB) \n",
- " Bytes (MB) \n",
- " Throughput (MB/s) \n",
- " SrcMemKd \n",
- " DstMemKd \n",
- " Device \n",
- " Ctx \n",
- " GreenCtx \n",
- " Strm \n",
- " Name \n",
+ " Pid \n",
+ " Tid \n",
+ " thread \n",
+ " device \n",
"
\n", + " | \n", + " | Device | \n", + "Strm | \n", + "thread | \n", + "task | \n", + "Pid | \n", "||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", + "Pid | \n", + "deviceid | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", " |
6645 | \n", - "11824673327 | \n", - "1378516 | \n", - "68613 | \n", - "65535.0 | \n", - "1.0 | \n", - "1.0 | \n", - "32.0 | \n", - "1.0 | \n", - "1.0 | \n", - "36.0 | \n", - "... | \n", - "0,001 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NVIDIA H100 (1) | \n", + "2308061 | \n", + "0 | \n", + "NVIDIA H100 (0) | \n", + "{16, 17, 18, 19} | \n", + "4 | \n", "1 | \n", - "NaN | \n", - "16 | \n", - "mod_time_ops_adapt_dt_cfl_32_gpu | \n", + "2308061 | \n", "|
6646 | \n", - "11826052707 | \n", - "99903 | \n", - "68614 | \n", - "3.0 | \n", - "1.0 | \n", - "1.0 | \n", - "256.0 | \n", - "1.0 | \n", - "1.0 | \n", - "18.0 | \n", - "... | \n", - "0,001 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", + "2308062 | \n", + "2 | \n", + "NVIDIA H100 (2) | \n", + "{16, 17, 18, 19} | \n", + "4 | \n", + "2 | \n", + "2308062 | \n", + "|||||
2308065 | \n", + "1 | \n", "NVIDIA H100 (1) | \n", + "{16, 17, 18} | \n", + "3 | \n", + "3 | \n", + "2308065 | \n", + "
\n", + " | index | \n", + "Device | \n", + "Strm | \n", + "deviceid | \n", + "Pid | \n", + "thread | \n", + "task | \n", + "row_name | \n", + "||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0 | \n", + "NVIDIA H100 (0) | \n", + "16 | \n", + "0 | \n", + "2308061 | \n", + "2 | \n", "1 | \n", - "NaN | \n", + "CUDA-D0.S16 | \n", + "|||||||||||||||||||
1 | \n", + "190 | \n", + "NVIDIA H100 (0) | \n", + "17 | \n", + "0 | \n", + "2308061 | \n", + "3 | \n", + "1 | \n", + "CUDA-D0.S17 | \n", + "||||||||||||||||||||
2 | \n", + "191 | \n", + "NVIDIA H100 (0) | \n", + "18 | \n", + "0 | \n", + "2308061 | \n", + "4 | \n", + "1 | \n", + "CUDA-D0.S18 | \n", + "||||||||||||||||||||
3 | \n", + "204 | \n", + "NVIDIA H100 (0) | \n", + "19 | \n", + "0 | \n", + "2308061 | \n", + "5 | \n", + "1 | \n", + "CUDA-D0.S19 | \n", + "||||||||||||||||||||
4 | \n", + "0 | \n", + "NVIDIA H100 (2) | \n", "16 | \n", - "mod_time_ops_adapt_dt_cfl_32_gpu__red | \n", + "2 | \n", + "2308062 | \n", + "2 | \n", + "2 | \n", + "CUDA-D2.S16 | \n", "|||||||||||||||||||
6647 | \n", - "11826167106 | \n", - "2176 | \n", - "68616 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "... | \n", - "NaN | \n", - "0,000 | \n", - "1,838 | \n", - "Device | \n", - "Pageable | \n", + "5 | \n", + "190 | \n", + "NVIDIA H100 (2) | \n", + "17 | \n", + "2 | \n", + "2308062 | \n", + "3 | \n", + "2 | \n", + "CUDA-D2.S17 | \n", + "|||
6 | \n", + "191 | \n", + "NVIDIA H100 (2) | \n", + "18 | \n", + "2 | \n", + "2308062 | \n", + "4 | \n", + "2 | \n", + "CUDA-D2.S18 | \n", + "||||||||||||||||||||
7 | \n", + "203 | \n", + "NVIDIA H100 (2) | \n", + "19 | \n", + "2 | \n", + "2308062 | \n", + "5 | \n", + "2 | \n", + "CUDA-D2.S19 | \n", + "||||||||||||||||||||
8 | \n", + "0 | \n", "NVIDIA H100 (1) | \n", - "1 | \n", - "NaN | \n", "16 | \n", - "[CUDA memcpy Device-to-Host] | \n", + "1 | \n", + "2308065 | \n", + "2 | \n", + "3 | \n", + "CUDA-D1.S16 | \n", "|||||||||||||||||
6648 | \n", - "11826178754 | \n", - "2176 | \n", - "68617 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "... | \n", - "NaN | \n", - "0,000 | \n", - "1,838 | \n", - "Device | \n", - "Pageable | \n", + "9 | \n", + "190 | \n", "NVIDIA H100 (1) | \n", + "17 | \n", "1 | \n", - "NaN | \n", - "16 | \n", - "[CUDA memcpy Device-to-Host] | \n", + "2308065 | \n", + "3 | \n", + "3 | \n", + "CUDA-D1.S17 | \n", "
6649 | \n", - "11826190114 | \n", - "2176 | \n", - "68618 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "... | \n", - "NaN | \n", - "0,000 | \n", - "1,838 | \n", - "Device | \n", - "Pageable | \n", + "10 | \n", + "191 | \n", "NVIDIA H100 (1) | \n", + "18 | \n", "1 | \n", - "NaN | \n", - "16 | \n", - "[CUDA memcpy Device-to-Host] | \n", + "2308065 | \n", + "4 | \n", + "3 | \n", + "CUDA-D1.S18 | \n", "
6650 rows × 21 columns
\n", "20 rows × 25 columns
\n", + "" + ], + "text/plain": [ + " Start (ns) Duration:dur_ns CorrID GrdX GrdY GrdZ BlkX \\\n", + "2023 10256971039 2272 23576 NaN NaN NaN NaN \n", + "2024 10257043551 1105690 23590 81902.0 1.0 1.0 128.0 \n", + "2025 10258164089 51168 23595 81902.0 1.0 1.0 128.0 \n", + "2026 10258227161 768 23598 NaN NaN NaN NaN \n", + "2027 10258236281 897563 23601 65535.0 1.0 1.0 128.0 \n", + "2028 10259135028 97632 23602 1.0 1.0 1.0 256.0 \n", + "2029 10259241107 2273 23604 NaN NaN NaN NaN \n", + "2030 10259285939 768 23616 NaN NaN NaN NaN \n", + "2031 10259294836 412701 23619 65535.0 1.0 1.0 128.0 \n", + "2032 10259708465 96064 23620 1.0 1.0 1.0 256.0 \n", + "2033 10259813009 2272 23622 NaN NaN NaN NaN \n", + "2034 10259840881 751484 23634 81902.0 1.0 1.0 128.0 \n", + "2035 10260609197 149279 23640 245704.0 1.0 1.0 128.0 \n", + "2036 10260759628 55584 23642 81902.0 1.0 1.0 128.0 \n", + "2037 10260829643 7861560 23646 1246149.0 1.0 1.0 96.0 \n", + "2038 10268707939 8192 23651 3329.0 1.0 1.0 128.0 \n", + "2039 10268727331 3392 23655 3329.0 1.0 1.0 128.0 \n", + "2040 10268754723 4672 23663 NaN NaN NaN NaN \n", + "2041 10268759875 3328 23669 NaN NaN NaN NaN \n", + "2042 10268769987 5568 23688 NaN NaN NaN NaN \n", + "\n", + " BlkY BlkZ Reg/Trd ... DstMemKd Device deviceid Pid \\\n", + "2023 NaN NaN NaN ... Pageable NVIDIA H100 (2) 2 2308062 \n", + "2024 1.0 1.0 32.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2025 1.0 1.0 26.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2026 NaN NaN NaN ... None NVIDIA H100 (2) 2 2308062 \n", + "2027 1.0 1.0 56.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2028 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2029 NaN NaN NaN ... Pageable NVIDIA H100 (2) 2 2308062 \n", + "2030 NaN NaN NaN ... None NVIDIA H100 (2) 2 2308062 \n", + "2031 1.0 1.0 40.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2032 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2033 NaN NaN NaN ... Pageable NVIDIA H100 (2) 2 2308062 \n", + "2034 1.0 1.0 45.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2035 1.0 1.0 36.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2036 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2037 1.0 1.0 93.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2038 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2039 1.0 1.0 16.0 ... None NVIDIA H100 (2) 2 2308062 \n", + "2040 NaN NaN NaN ... Device NVIDIA H100 (2) 2 2308062 \n", + "2041 NaN NaN NaN ... Device NVIDIA H100 (2) 2 2308062 \n", + "2042 NaN NaN NaN ... 
Device NVIDIA H100 (2) 2 2308062 \n", + "\n", + " Ctx GreenCtx Strm Name \\\n", + "2023 1 None 16 [CUDA memcpy Device-to-Host] \n", + "2024 1 None 16 mod_solver_conjgrad_imex_245_gpu \n", + "2025 1 None 16 mod_bc_routines_bc_fix_dirichlet_residual_293_gpu \n", + "2026 1 None 16 [CUDA memset] \n", + "2027 1 None 16 mod_solver_conjgrad_imex_271_gpu \n", + "2028 1 None 16 mod_solver_conjgrad_imex_271_gpu__red \n", + "2029 1 None 16 [CUDA memcpy Device-to-Host] \n", + "2030 1 None 16 [CUDA memset] \n", + "2031 1 None 16 mod_solver_conjgrad_imex_297_gpu \n", + "2032 1 None 16 mod_solver_conjgrad_imex_297_gpu__red \n", + "2033 1 None 16 [CUDA memcpy Device-to-Host] \n", + "2034 1 None 16 mod_solver_conjgrad_imex_307_gpu \n", + "2035 1 None 16 elem_diffu_full_diffusion_ijk_51_gpu \n", + "2036 1 None 16 elem_diffu_full_diffusion_ijk_52_gpu \n", + "2037 1 None 16 elem_diffu_full_diffusion_ijk_60_gpu \n", + "2038 1 None 16 mod_comms_fill_sendbuffer_real_239_gpu \n", + "2039 1 None 16 mod_comms_fill_sendbuffer_real_246_gpu \n", + "2040 1 None 17 [CUDA memcpy Peer-to-Peer] \n", + "2041 1 None 19 [CUDA memcpy Peer-to-Peer] \n", + "2042 1 None 18 [CUDA memcpy Peer-to-Peer] \n", + "\n", + " thread task \n", + "2023 2 2 \n", + "2024 2 2 \n", + "2025 2 2 \n", + "2026 2 2 \n", + "2027 2 2 \n", + "2028 2 2 \n", + "2029 2 2 \n", + "2030 2 2 \n", + "2031 2 2 \n", + "2032 2 2 \n", + "2033 2 2 \n", + "2034 2 2 \n", + "2035 2 2 \n", + "2036 2 2 \n", + "2037 2 2 \n", + "2038 2 2 \n", + "2039 2 2 \n", + "2040 3 2 \n", + "2041 5 2 \n", + "2042 4 2 \n", "\n", - " isGreenContext \n", - "0 0 " + "[20 rows x 25 columns]" ] }, - "execution_count": 7, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "context_info = pd.read_sql_table(\"TARGET_INFO_CUDA_CONTEXT_INFO\", f\"sqlite:///{os.path.splitext(REPORT_FILE)[0]}.sqlite\")\n", - "context_info" + "kernels_df.iloc[16000:16020]" ] }, { diff --git a/pyproject.toml b/pyproject.toml index 5b3f16037a220e03d9a328eb0eb4e795df96cc38..d0827c318d5ac718dced6b568a287fbdc46152da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,13 @@ [tool.poetry] name = "nsys2prv" -version = "0.3.1" +version = "0.4.0-dev20241007" description = "Translate a NVIDIA Nsight System trace to a Paraver trace" authors = ["Marc Clascà