From c3da3e213e0a89b2704351a875a6cfb2fc205d75 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 17 Mar 2025 22:59:12 -0600 Subject: [PATCH 01/43] fmt: `ruff format dristi/includes/module.py` --- drishti/includes/module.py | 2003 +++++++++++++++++++++++------------- 1 file changed, 1314 insertions(+), 689 deletions(-) diff --git a/drishti/includes/module.py b/drishti/includes/module.py index 9c2df16..c0d91ef 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -8,130 +8,178 @@ from rich.syntax import Syntax from drishti.includes.config import * -''' +""" Before calling the functions below Make sure the variables passed are in the given structure: file_map: a dict of (id, path) pair modules: a set or a dict should be ok detected_files: A pandas dataframe -''' +""" # Basic usage check + def check_stdio(total_size, total_size_stdio): - ''' + """ Check whether the application has excessively utilized standard input/output operations Parameters: total_size: total I/O size total_size_stdio: total STDIO size - - ''' - - if total_size and total_size_stdio / total_size > thresholds['interface_stdio'][0]: - thresholds['interface_stdio'][1] = True - issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( - total_size_stdio / total_size * 100.0, - convert_bytes(total_size_stdio) + + """ + + if total_size and total_size_stdio / total_size > thresholds["interface_stdio"][0]: + thresholds["interface_stdio"][1] = True + issue = "Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})".format( + total_size_stdio / total_size * 100.0, convert_bytes(total_size_stdio) ) recommendation = [ { - 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' + "message": "Consider switching to a high-performance I/O interface such as MPI-IO" } ] insights_operation.append( - message(INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation + ) ) def check_mpiio(modules): - ''' + """ Check whether the application has used MPI-IO or not Parameter: modules: all different mudules been used in the application - ''' + """ - if 'MPI-IO' not in modules: - issue = 'Application is using low-performance interface' + if "MPI-IO" not in modules: + issue = "Application is using low-performance interface" recommendation = [ { - 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' + "message": "Consider switching to a high-performance I/O interface such as MPI-IO" } ] insights_operation.append( - message(INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + message( + INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation + ) ) - # POSIX level check def check_operation_intensive(total_operations, total_reads, total_writes): - ''' + """ Check whether the application is read or write intensive Parameters: total_operations: number of I/O operations been executed by the application total_reads: number of read operations been executed by the application total_writes: number of write operations been executed by the application - ''' - - if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > thresholds['imbalance_operations'][0]: - issue = 'Application is write operation intensive ({:.2f}% writes vs. 
{:.2f}% reads)'.format( - total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 + """ + + if ( + total_writes > total_reads + and total_operations + and abs(total_writes - total_reads) / total_operations + > thresholds["imbalance_operations"][0] + ): + issue = "Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)".format( + total_writes / total_operations * 100.0, + total_reads / total_operations * 100.0, ) insights_metadata.append( - message(INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message( + INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, + TARGET_DEVELOPER, + INFO, + issue, + None, + ) ) - if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > thresholds['imbalance_operations'][0]: - issue = 'Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( - total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 + if ( + total_reads > total_writes + and total_operations + and abs(total_writes - total_reads) / total_operations + > thresholds["imbalance_operations"][0] + ): + issue = "Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)".format( + total_writes / total_operations * 100.0, + total_reads / total_operations * 100.0, ) insights_metadata.append( - message(INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message( + INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None + ) ) def check_size_intensive(total_size, total_read_size, total_written_size): - ''' + """ Check whether the application is read size intensive or written size intensive Parameters: total_size: Total I/O size measured in byte total_read_size: Input I/O size measured in byte total_written_size: Output I/O size measured in byte - ''' - - if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > thresholds['imbalance_operations'][0]: - issue = 'Application is write size intensive ({:.2f}% write vs. {:.2f}% read)'.format( - total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 + """ + + if ( + total_written_size > total_read_size + and abs(total_written_size - total_read_size) / total_size + > thresholds["imbalance_operations"][0] + ): + issue = "Application is write size intensive ({:.2f}% write vs. {:.2f}% read)".format( + total_written_size / total_size * 100.0, + total_read_size / total_size * 100.0, ) insights_metadata.append( - message(INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message( + INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None + ) ) - if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > thresholds['imbalance_operations'][0]: - issue = 'Application is read size intensive ({:.2f}% write vs. {:.2f}% read)'.format( - total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 + if ( + total_read_size > total_written_size + and abs(total_written_size - total_read_size) / total_size + > thresholds["imbalance_operations"][0] + ): + issue = "Application is read size intensive ({:.2f}% write vs. 
{:.2f}% read)".format( + total_written_size / total_size * 100.0, + total_read_size / total_size * 100.0, ) insights_metadata.append( - message(INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message( + INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None + ) ) -def check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None): - ''' +def check_small_operation( + total_reads, + total_reads_small, + total_writes, + total_writes_small, + detected_files, + modules, + file_map, + dxt_posix=None, + dxt_posix_read_data=None, + dxt_posix_write_data=None, +): + """ Check whether application has performed an excessive number of small operations Parameters: @@ -139,17 +187,21 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr total_reads_small: number of read operations that has small size total_writes: number of write operations been executed by the application total_writes_small: number of write operations that has small size - detected_files: + detected_files: total_reads and total_writes in each file required columns: ['id', 'total_reads', 'total_writes'] modules: all different mudules been used in the application file_map: file id and file name pairing df_posix: all POSIX records - ''' - - if total_reads_small and total_reads_small / total_reads > thresholds['small_requests'][0] and total_reads_small > thresholds['small_requests_absolute'][0]: - thresholds['small_requests_absolute'][1] = True - issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format( + """ + + if ( + total_reads_small + and total_reads_small / total_reads > thresholds["small_requests"][0] + and total_reads_small > thresholds["small_requests_absolute"][0] + ): + thresholds["small_requests_absolute"][1] = True + issue = "Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests".format( total_reads_small, total_reads_small / total_reads * 100.0 ) @@ -159,63 +211,93 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr dxt_trigger_time = 0 for index, row in detected_files.iterrows(): - if row['total_reads'] > (total_reads * thresholds['small_requests'][0] / 2): + if row["total_reads"] > (total_reads * thresholds["small_requests"][0] / 2): detail.append( { - 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( - row['total_reads'], - row['total_reads'] / total_reads * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({:.2f}%) small read requests are to "{}"'.format( + row["total_reads"], + row["total_reads"] / total_reads * 100.0, + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == int(row['id'])] - - if not temp_df.empty: - temp_df = temp_df.loc[temp_df['length'] < thresholds['small_requests'][0]] - small_read_requests_ranks = temp_df['rank'].unique() - if len(small_read_requests_ranks) > 0: - if len(small_read_requests_ranks) > 1 and int(small_read_requests_ranks[0]) == 0: - rank_df = 
temp.loc[(temp['rank'] == int(small_read_requests_ranks[1]))] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df = dxt_posix_read_data.loc[ + dxt_posix_read_data["id"] == int(row["id"]) + ] + + if not temp_df.empty: + temp_df = temp_df.loc[ + temp_df["length"] < thresholds["small_requests"][0] + ] + small_read_requests_ranks = temp_df["rank"].unique() + if len(small_read_requests_ranks) > 0: + if ( + len(small_read_requests_ranks) > 1 + and int(small_read_requests_ranks[0]) == 0 + ): + rank_df = temp.loc[ + ( + temp["rank"] + == int(small_read_requests_ranks[1]) + ) + ] else: - rank_df = temp.loc[(temp['rank'] == int(small_read_requests_ranks[0]))] - - rank_df = rank_df['read_segments'].iloc[0] - rank_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + rank_df = temp.loc[ + ( + temp["rank"] + == int(small_read_requests_ranks[0]) + ) + ] + + rank_df = rank_df["read_segments"].iloc[0] + rank_addresses = rank_df["stack_memory_addresses"].iloc[ + 0 + ] + address = dxt_posix.iloc[0]["address_line_mapping"][ + "address" + ] res = set(list(address)) & set(rank_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] - + backtrace = dxt_posix.iloc[0][ + "address_line_mapping" + ].loc[ + dxt_posix.iloc[0]["address_line_mapping"][ + "address" + ].isin(res) + ] + if len(small_read_requests_ranks) > 0: detail.append( { - 'message': '{} rank(s) made small read requests in "{}". Below is the backtrace information:'.format( + "message": '{} rank(s) made small read requests in "{}". Below is the backtrace information:'.format( len(small_read_requests_ranks), - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) - + for index, row in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row['function_name'], - row['line_number'] - ) + "message": "{}: {}".format( + row["function_name"], row["line_number"] + ) } ) file_count += 1 else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) @@ -223,40 +305,57 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr time_taken = end - start dxt_trigger_time += time_taken - if dxt_trigger_time > 0: + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation.append( { - 'message': 'Consider buffering read operations into larger more contiguous ones' + "message": "Consider buffering read operations into larger more contiguous ones" } ) - if 'MPI-IO' in modules: + if "MPI-IO" in modules: recommendation.append( { - 'message': 'Since the appplication already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + "message": "Since the appplication already uses MPI-IO, consider using collective I/O calls (e.g. 
MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-read.c"), + line_numbers=True, + background_color="default", + ), } ) else: recommendation.append( { - 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations' + "message": "Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations" } ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) - if total_writes_small and total_writes_small / total_writes > thresholds['small_requests'][0] and total_writes_small > thresholds['small_requests_absolute'][0]: - thresholds['small_requests_absolute'][1] = True - issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests'.format( + if ( + total_writes_small + and total_writes_small / total_writes > thresholds["small_requests"][0] + and total_writes_small > thresholds["small_requests_absolute"][0] + ): + thresholds["small_requests_absolute"][1] = True + issue = "Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests".format( total_writes_small, total_writes_small / total_writes * 100.0 ) @@ -266,106 +365,162 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr recommendation = [] file_count = 0 for index, row in detected_files.iterrows(): - if row['total_writes'] > (total_writes * thresholds['small_requests'][0] / 2): + if row["total_writes"] > ( + total_writes * thresholds["small_requests"][0] / 2 + ): detail.append( { - 'message': '{} ({:.2f}%) small write requests are to "{}"'.format( - row['total_writes'], - row['total_writes'] / total_writes * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({:.2f}%) small write requests are to "{}"'.format( + row["total_writes"], + row["total_writes"] / total_writes * 100.0, + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == int(row['id'])] - - if not temp_df.empty: - temp_df = temp_df.loc[temp_df['length'] < thresholds['small_requests'][0]] - small_write_requests_ranks = temp_df['rank'].unique() + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df = dxt_posix_write_data.loc[ + dxt_posix_write_data["id"] == int(row["id"]) + ] + + if not temp_df.empty: + temp_df = temp_df.loc[ + temp_df["length"] < thresholds["small_requests"][0] + ] + small_write_requests_ranks = temp_df["rank"].unique() if len(small_write_requests_ranks) > 0: - if int(small_write_requests_ranks[0]) == 0 and len(small_write_requests_ranks) > 1: - rank_df = temp.loc[(temp['rank'] == int(small_write_requests_ranks[1]))] + if ( + int(small_write_requests_ranks[0]) == 0 + and len(small_write_requests_ranks) > 1 + ): + rank_df = temp.loc[ + ( + temp["rank"] + == 
int(small_write_requests_ranks[1]) + ) + ] else: - rank_df = temp.loc[(temp['rank'] == int(small_write_requests_ranks[0]))] - - rank_df = temp.loc[(temp['rank'] == int(small_write_requests_ranks[0]))] - rank_df = rank_df['write_segments'].iloc[0] - rank_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + rank_df = temp.loc[ + ( + temp["rank"] + == int(small_write_requests_ranks[0]) + ) + ] + + rank_df = temp.loc[ + (temp["rank"] == int(small_write_requests_ranks[0])) + ] + rank_df = rank_df["write_segments"].iloc[0] + rank_addresses = rank_df["stack_memory_addresses"].iloc[ + 0 + ] + address = dxt_posix.iloc[0]["address_line_mapping"][ + "address" + ] res = set(list(address)) & set(rank_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] - + backtrace = dxt_posix.iloc[0][ + "address_line_mapping" + ].loc[ + dxt_posix.iloc[0]["address_line_mapping"][ + "address" + ].isin(res) + ] + if len(small_write_requests_ranks) > 0: detail.append( { - 'message': '{} rank(s) made small write requests in "{}". Below is the backtrace information:'.format( + "message": '{} rank(s) made small write requests in "{}". Below is the backtrace information:'.format( len(small_write_requests_ranks), - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) - + for index, row in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row['function_name'], - row['line_number'] - ) + "message": "{}: {}".format( + row["function_name"], row["line_number"] + ) } ) - + file_count += 1 else: detail.append( { - 'message': 'The backtrace information for this file is similar to previous files' + "message": "The backtrace information for this file is similar to previous files" } ) end = time.time() time_taken = end - start dxt_trigger_time += time_taken - + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation.append( { - 'message': 'Consider buffering write operations into larger more contiguous ones' + "message": "Consider buffering write operations into larger more contiguous ones" } ) - if 'MPI-IO' in modules: + if "MPI-IO" in modules: recommendation.append( { - 'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + "message": "Since the application already uses MPI-IO, consider using collective I/O calls (e.g. 
MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-write.c"), + line_numbers=True, + background_color="default", + ), } ) else: recommendation.append( { - 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations' + "message": "Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations" } ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) -def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map=None, df_lustre=None, dxt_posix=None, dxt_posix_read_data=None): - ''' +def check_misaligned( + total_operations, + total_mem_not_aligned, + total_file_not_aligned, + modules, + file_map=None, + df_lustre=None, + dxt_posix=None, + dxt_posix_read_data=None, +): + """ Check whether application has excessive misaligned operations Parameters: @@ -373,62 +528,80 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali total_mem_not_aligned: number of memory requests not aligned total_file_not_aligned: number of file requests not aligned modules: all different mudules been used in the application - ''' - - if total_operations and total_mem_not_aligned / total_operations > thresholds['misaligned_requests'][0]: - thresholds['misaligned_requests'][1] = True - issue = 'Application has a high number ({:.2f}%) of misaligned memory requests'.format( + """ + + if ( + total_operations + and total_mem_not_aligned / total_operations + > thresholds["misaligned_requests"][0] + ): + thresholds["misaligned_requests"][1] = True + issue = "Application has a high number ({:.2f}%) of misaligned memory requests".format( total_mem_not_aligned / total_operations * 100.0 ) insights_metadata.append( - message(INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, TARGET_DEVELOPER, HIGH, issue, None) + message( + INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + None, + ) ) - if total_operations and total_file_not_aligned / total_operations > thresholds['misaligned_requests'][0]: - thresholds['misaligned_requests'][1] = True - issue = 'Application issues a high number ({:.2f}%) of misaligned file requests'.format( + if ( + total_operations + and total_file_not_aligned / total_operations + > thresholds["misaligned_requests"][0] + ): + thresholds["misaligned_requests"][1] = True + issue = "Application issues a high number ({:.2f}%) of misaligned file requests".format( total_file_not_aligned / total_operations * 100.0 ) recommendation = [ { - 'message': 'Consider aligning the requests to the file system block boundaries' + "message": "Consider aligning the requests to the file system block boundaries" } ] - if 'HF5' in modules: + if "HF5" in modules: recommendation.append( { - 'message': 'Since the appplication uses HDF5, consider using H5Pset_alignment() in a file access property list', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-alignment.c'), line_numbers=True, background_color='default') + "message": "Since the appplication uses HDF5, consider using H5Pset_alignment() in a file access property list", + "sample": Syntax.from_path( + os.path.join(ROOT, 
"snippets/hdf5-alignment.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'Any file object greater than or equal in size to threshold bytes will be aligned on an address which is a multiple of alignment' - } + "message": "Any file object greater than or equal in size to threshold bytes will be aligned on an address which is a multiple of alignment" + }, ) detail = [] - if 'LUSTRE' in modules: + if "LUSTRE" in modules: # DXT Analysis if args.backtrace: start = time.time() - - if not df_lustre['counters']['LUSTRE_STRIPE_SIZE'].empty: - stripe_size = df_lustre['counters']['LUSTRE_STRIPE_SIZE'].iloc[0] + + if not df_lustre["counters"]["LUSTRE_STRIPE_SIZE"].empty: + stripe_size = df_lustre["counters"]["LUSTRE_STRIPE_SIZE"].iloc[0] else: - stripe_size = df_lustre['counters']['POSIX_FILE_ALIGNMENT'].iloc[0] + stripe_size = df_lustre["counters"]["POSIX_FILE_ALIGNMENT"].iloc[0] file_count = 0 ids = dxt_posix.id.unique().tolist() for id in ids: - temp = dxt_posix.loc[dxt_posix['id'] == id] - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == id] + temp = dxt_posix.loc[dxt_posix["id"] == id] + temp_df = dxt_posix_read_data.loc[dxt_posix_read_data["id"] == id] misaligned_ranks = [] misaligned_ranks_opr = [] - + offsets = temp_df["offsets"].to_numpy().tolist() rank = temp_df["rank"].to_numpy().tolist() operation = temp_df["operation"].to_numpy().tolist() @@ -441,33 +614,46 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali if misaligned_ranks: misaligned_rank_ind = misaligned_ranks[0] misaligned_rank_opr = misaligned_ranks_opr[0] - misaligned_rank_df = temp.loc[(temp['rank'] == int(misaligned_rank_ind))] - if misaligned_rank_opr == 'read': - misaligned_rank_df = misaligned_rank_df['read_segments'].iloc[0] + misaligned_rank_df = temp.loc[ + (temp["rank"] == int(misaligned_rank_ind)) + ] + if misaligned_rank_opr == "read": + misaligned_rank_df = misaligned_rank_df[ + "read_segments" + ].iloc[0] else: - misaligned_rank_df = misaligned_rank_df['write_segments'].iloc[0] - misaligned_rank_stack_addresses = misaligned_rank_df['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + misaligned_rank_df = misaligned_rank_df[ + "write_segments" + ].iloc[0] + misaligned_rank_stack_addresses = misaligned_rank_df[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(misaligned_rank_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] detail.append( { - 'message': '{} rank(s) made misaligned requests in "{}". Below is the backtrace information:'.format( + "message": '{} rank(s) made misaligned requests in "{}". 
Below is the backtrace information:'.format( len(misaligned_ranks), - file_map[id] if args.full_path else os.path.basename(file_map[id]) - ) + file_map[id] + if args.full_path + else os.path.basename(file_map[id]), + ) } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) file_count += 1 @@ -476,23 +662,43 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) recommendation.append( { - 'message': 'Consider using a Lustre alignment that matches the file system stripe configuration', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider using a Lustre alignment that matches the file system stripe configuration", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), } ) insights_metadata.append( - message(INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) -def check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None): - ''' +def check_traffic( + max_read_offset, + total_read_size, + max_write_offset, + total_written_size, + dxt_posix=None, + dxt_posix_read_data=None, + dxt_posix_write_data=None, +): + """ Check whether application has redundant read or write traffic Parameters: @@ -500,10 +706,10 @@ def check_traffic(max_read_offset, total_read_size, max_write_offset, total_writ total_read_size: total size application has been read max_write_offset: max offset application is writing to total_written_size: total size application has been written - ''' + """ if max_read_offset > total_read_size: - issue = 'Application might have redundant read traffic (more data read than the highest offset)' + issue = "Application might have redundant read traffic (more data read than the highest offset)" detail = [] file_count = 0 @@ -513,67 +719,79 @@ def check_traffic(max_read_offset, total_read_size, max_write_offset, total_writ start = time.time() ids = dxt_posix.id.unique().tolist() for id in ids: - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == id] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == id] random_ranks_ind = -1 - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == id] + temp_df = dxt_posix_read_data.loc[dxt_posix_read_data["id"] == id] updated_offsets = (temp_df["offsets"].to_numpy()).tolist() for i in range(len(updated_offsets)): - if updated_offsets.count(updated_offsets[i]) > 1: + if updated_offsets.count(updated_offsets[i]) > 1: redundant_ranks_ind = i break if random_ranks_ind != -1: - random_rank = temp_df.iloc[redundant_ranks_ind]['rank'] - random_offsets = temp_df.iloc[redundant_ranks_ind]['offsets'] - random_start_time = temp_df.iloc[random_ranks_ind]['start_time'] - - temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))] - temp_random_rank = 
temp_random_rank['read_segments'].iloc[0] - random_stack_addresses = temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)] - random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + random_rank = temp_df.iloc[redundant_ranks_ind]["rank"] + random_offsets = temp_df.iloc[redundant_ranks_ind]["offsets"] + random_start_time = temp_df.iloc[random_ranks_ind]["start_time"] + + temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))] + temp_random_rank = temp_random_rank["read_segments"].iloc[0] + random_stack_addresses = temp_random_rank.loc[ + (temp_random_rank["offset"] == random_offsets) + & (temp_random_rank["start_time"] == random_start_time) + ] + random_stack_addresses = random_stack_addresses[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(random_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] - + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] + detail.append( { - 'message': 'The backtrace information for these redundant read call(s) is given below:' + "message": "The backtrace information for these redundant read call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) file_count += 1 else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) end = time.time() time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) insights_metadata.append( - message(INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None) + message( + INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None + ) ) if max_write_offset > total_written_size: - issue = 'Application might have redundant write traffic (more data written than the highest offset)' + issue = "Application might have redundant write traffic (more data written than the highest offset)" detail = [] file_count = 0 @@ -583,70 +801,105 @@ def check_traffic(max_read_offset, total_read_size, max_write_offset, total_writ start = time.time() ids = dxt_posix.id.unique().tolist() for id in ids: - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == id] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == id] random_ranks_ind = -1 - temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == id] + temp_df = dxt_posix_write_data.loc[dxt_posix_write_data["id"] == id] updated_offsets = (temp_df["offsets"].to_numpy()).tolist() for i in range(len(updated_offsets)): - if updated_offsets.count(updated_offsets[i]) > 1: + if updated_offsets.count(updated_offsets[i]) > 1: redundant_ranks_ind = i break if random_ranks_ind != -1: - random_rank = temp_df.iloc[redundant_ranks_ind]['rank'] - random_offsets = 
temp_df.iloc[redundant_ranks_ind]['offsets'] - random_start_time = temp_df.iloc[random_ranks_ind]['start_time'] - - temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))] - temp_random_rank = temp_random_rank['write_segments'].iloc[0] - random_stack_addresses = temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)] - random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + random_rank = temp_df.iloc[redundant_ranks_ind]["rank"] + random_offsets = temp_df.iloc[redundant_ranks_ind]["offsets"] + random_start_time = temp_df.iloc[random_ranks_ind]["start_time"] + + temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))] + temp_random_rank = temp_random_rank["write_segments"].iloc[0] + random_stack_addresses = temp_random_rank.loc[ + (temp_random_rank["offset"] == random_offsets) + & (temp_random_rank["start_time"] == random_start_time) + ] + random_stack_addresses = random_stack_addresses[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(random_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] - + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] + detail.append( { - 'message': 'The backtrace information for these redundant write call(s) is given below:' + "message": "The backtrace information for these redundant write call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) file_count += 1 else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) end = time.time() time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) insights_metadata.append( - message(INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None, detail) + message( + INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, + TARGET_DEVELOPER, + WARN, + issue, + None, + detail, + ) ) insights_metadata.append( - message(INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None) + message( + INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, + TARGET_DEVELOPER, + WARN, + issue, + None, + ) ) -def check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None): - ''' +def check_random_operation( + read_consecutive, + read_sequential, + read_random, + total_reads, + write_consecutive, + write_sequential, + write_random, + total_writes, + dxt_posix=None, + dxt_posix_read_data=None, + dxt_posix_write_data=None, +): + """ Check whether application has performed excessive random operations Parameters: @@ -658,19 +911,23 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total write_sequential: number of sequential write operations 
write_random: number of random write operations total_write: number of write operations been executed by the application - ''' + """ if total_reads: - if read_random and read_random / total_reads > thresholds['random_operations'][0] and read_random > thresholds['random_operations_absolute'][0]: - thresholds['random_operations'][1] = True - thresholds['random_operations_absolute'][1] = True - issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format( + if ( + read_random + and read_random / total_reads > thresholds["random_operations"][0] + and read_random > thresholds["random_operations_absolute"][0] + ): + thresholds["random_operations"][1] = True + thresholds["random_operations_absolute"][1] = True + issue = "Application is issuing a high number ({}) of random read operations ({:.2f}%)".format( read_random, read_random / total_reads * 100.0 ) recommendation = [ { - 'message': 'Consider changing your data model to have consecutive or sequential reads' + "message": "Consider changing your data model to have consecutive or sequential reads" } ] @@ -679,11 +936,11 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total start = time.time() ids = dxt_posix.id.unique().tolist() for id in ids: - temp = dxt_posix.loc[dxt_posix['id'] == id] - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == id] - temp_df = temp_df.sort_values('start_time', ascending=True) + temp = dxt_posix.loc[dxt_posix["id"] == id] + temp_df = dxt_posix_read_data.loc[dxt_posix_read_data["id"] == id] + temp_df = temp_df.sort_values("start_time", ascending=True) random_ranks_ind = -1 - + if not temp_df["offsets"].is_monotonic_increasing: updated_offsets = (temp_df["offsets"].to_numpy()).tolist() cur = 0 @@ -694,64 +951,90 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total cur = updated_offsets[i] if random_ranks_ind != -1: - random_rank = temp_df.iloc[random_ranks_ind]['rank'] - random_offsets = temp_df.iloc[random_ranks_ind]['offsets'] - random_start_time = temp_df.iloc[random_ranks_ind]['start_time'] - temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))] - temp_random_rank = temp_random_rank['read_segments'].iloc[0] - random_stack_addresses = temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)] - random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + random_rank = temp_df.iloc[random_ranks_ind]["rank"] + random_offsets = temp_df.iloc[random_ranks_ind]["offsets"] + random_start_time = temp_df.iloc[random_ranks_ind]["start_time"] + temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))] + temp_random_rank = temp_random_rank["read_segments"].iloc[0] + random_stack_addresses = temp_random_rank.loc[ + (temp_random_rank["offset"] == random_offsets) + & (temp_random_rank["start_time"] == random_start_time) + ] + random_stack_addresses = random_stack_addresses[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(random_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] detail = [] detail.append( { - 'message': 'The backtrace information 
for these random read call(s) is given below:' + "message": "The backtrace information for these random read call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) end = time.time() time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) else: - issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests'.format( + issue = "Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests".format( read_consecutive / total_reads * 100.0, - read_sequential / total_reads * 100.0 + read_sequential / total_reads * 100.0, ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, TARGET_DEVELOPER, OK, issue, None) + message( + INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, + TARGET_DEVELOPER, + OK, + issue, + None, + ) ) if total_writes: - if write_random and write_random / total_writes > thresholds['random_operations'][0] and write_random > thresholds['random_operations_absolute'][0]: - thresholds['random_operations'][1] = True - thresholds['random_operations_absolute'][1] = True - issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)'.format( + if ( + write_random + and write_random / total_writes > thresholds["random_operations"][0] + and write_random > thresholds["random_operations_absolute"][0] + ): + thresholds["random_operations"][1] = True + thresholds["random_operations_absolute"][1] = True + issue = "Application is issuing a high number ({}) of random write operations ({:.2f}%)".format( write_random, write_random / total_writes * 100.0 ) recommendation = [ { - 'message': 'Consider changing your data model to have consecutive or sequential writes' + "message": "Consider changing your data model to have consecutive or sequential writes" } ] @@ -760,10 +1043,10 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total start = time.time() ids = dxt_posix.id.unique().tolist() for id in ids: - temp = dxt_posix.loc[dxt_posix['id'] == id] + temp = dxt_posix.loc[dxt_posix["id"] == id] - temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == id] - temp_df.sort_values('start_time', ascending=True, inplace=True) + temp_df = dxt_posix_write_data.loc[dxt_posix_write_data["id"] == id] + temp_df.sort_values("start_time", ascending=True, inplace=True) random_ranks_ind = -1 if not temp_df["offsets"].is_monotonic_increasing: updated_offsets = (temp_df["offsets"].to_numpy()).tolist() @@ -775,58 +1058,87 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total cur = updated_offsets[i] if random_ranks_ind != -1: - random_rank = temp_df.iloc[random_ranks_ind]['rank'] - random_offsets = temp_df.iloc[random_ranks_ind]['offsets'] - random_start_time = temp_df.iloc[random_ranks_ind]['start_time'] - - temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))] - temp_random_rank = temp_random_rank['write_segments'].iloc[0] - random_stack_addresses = 
temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)] - random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + random_rank = temp_df.iloc[random_ranks_ind]["rank"] + random_offsets = temp_df.iloc[random_ranks_ind]["offsets"] + random_start_time = temp_df.iloc[random_ranks_ind]["start_time"] + + temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))] + temp_random_rank = temp_random_rank["write_segments"].iloc[0] + random_stack_addresses = temp_random_rank.loc[ + (temp_random_rank["offset"] == random_offsets) + & (temp_random_rank["start_time"] == random_start_time) + ] + random_stack_addresses = random_stack_addresses[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(random_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] detail = [] detail.append( { - 'message': 'The backtrace information for these random write call(s) is given below:' + "message": "The backtrace information for these random write call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) - + end = time.time() time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) else: - issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests'.format( + issue = "Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests".format( write_consecutive / total_writes * 100.0, - write_sequential / total_writes * 100.0 + write_sequential / total_writes * 100.0, ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, TARGET_DEVELOPER, OK, issue, None) + message( + INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, + TARGET_DEVELOPER, + OK, + issue, + None, + ) ) -def check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map): - ''' +def check_shared_small_operation( + total_shared_reads, + total_shared_reads_small, + total_shared_writes, + total_shared_writes_small, + shared_files, + file_map, +): + """ Check whether there are excessive small requests in shared files Parameters: @@ -838,113 +1150,182 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t small reads an small writes in each shared file required columns: ['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES'] file_map: file id and file name pairing - ''' - - if total_shared_reads and total_shared_reads_small / total_shared_reads > thresholds['small_requests'][0] and total_shared_reads_small > 
thresholds['small_requests_absolute'][0]: - thresholds['small_requests'][1] = True - thresholds['small_requests_absolute'][1] = True - issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format( - total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0 + """ + + if ( + total_shared_reads + and total_shared_reads_small / total_shared_reads + > thresholds["small_requests"][0] + and total_shared_reads_small > thresholds["small_requests_absolute"][0] + ): + thresholds["small_requests"][1] = True + thresholds["small_requests_absolute"][1] = True + issue = "Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests".format( + total_shared_reads_small, + total_shared_reads_small / total_shared_reads * 100.0, ) detail = [] for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_READS'] > (total_shared_reads * thresholds['small_requests'][0] / 2): + if row["INSIGHTS_POSIX_SMALL_READS"] > ( + total_shared_reads * thresholds["small_requests"][0] / 2 + ): detail.append( { - 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( - row['INSIGHTS_POSIX_SMALL_READS'], - row['INSIGHTS_POSIX_SMALL_READS'] / total_shared_reads * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({:.2f}%) small read requests are to "{}"'.format( + row["INSIGHTS_POSIX_SMALL_READS"], + row["INSIGHTS_POSIX_SMALL_READS"] + / total_shared_reads + * 100.0, + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) recommendation = [ { - 'message': 'Consider coalesceing read requests into larger more contiguous ones using MPI-IO collective operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + "message": "Consider coalesceing read requests into larger more contiguous ones using MPI-IO collective operations", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-read.c"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) - if total_shared_writes and total_shared_writes_small / total_shared_writes > thresholds['small_requests'][0] and total_shared_writes_small > thresholds['small_requests_absolute'][0]: - thresholds['small_requests'][1] = True - thresholds['small_requests_absolute'][1] = True - issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format( - total_shared_writes_small, total_shared_writes_small / total_shared_writes * 100.0 + if ( + total_shared_writes + and total_shared_writes_small / total_shared_writes + > thresholds["small_requests"][0] + and total_shared_writes_small > thresholds["small_requests_absolute"][0] + ): + thresholds["small_requests"][1] = True + thresholds["small_requests_absolute"][1] = True + issue = "Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of 
all shared file write requests".format( + total_shared_writes_small, + total_shared_writes_small / total_shared_writes * 100.0, ) detail = [] for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_WRITES'] > (total_shared_writes * thresholds['small_requests'][0] / 2): + if row["INSIGHTS_POSIX_SMALL_WRITES"] > ( + total_shared_writes * thresholds["small_requests"][0] / 2 + ): detail.append( { - 'message': '{} ({:.2f}%) small writes requests are to "{}"'.format( - row['INSIGHTS_POSIX_SMALL_WRITES'], - row['INSIGHTS_POSIX_SMALL_WRITES'] / total_shared_writes * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({:.2f}%) small writes requests are to "{}"'.format( + row["INSIGHTS_POSIX_SMALL_WRITES"], + row["INSIGHTS_POSIX_SMALL_WRITES"] + / total_shared_writes + * 100.0, + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) recommendation = [ { - 'message': 'Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + "message": "Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-write.c"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) def check_long_metadata(count_long_metadata, modules): - ''' + """ Check how many ranks have metadata operations taking too long Parameters: count_long_metadata: number of ranks that have metadata operations taking too long modules: all different mudules been used in the application - ''' + """ if count_long_metadata > 0: - thresholds['metadata_time_rank'][1] = True - issue = 'There are {} ranks where metadata operations take over {} seconds'.format( - count_long_metadata, thresholds['metadata_time_rank'][0] + thresholds["metadata_time_rank"][1] = True + issue = ( + "There are {} ranks where metadata operations take over {} seconds".format( + count_long_metadata, thresholds["metadata_time_rank"][0] + ) ) recommendation = [ { - 'message': 'Attempt to combine files, reduce, or cache metadata operations' + "message": "Attempt to combine files, reduce, or cache metadata operations" } ] - if 'HF5' in modules: + if "HF5" in modules: recommendation.append( { - 'message': 'Since your appplication uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-collective-metadata.c'), line_numbers=True, background_color='default') + "message": "Since your appplication uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/hdf5-collective-metadata.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'Since your appplication uses HDF5, try using metadata cache to defer metadata operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 
'snippets/hdf5-cache.c'), line_numbers=True, background_color='default') - } + "message": "Since your appplication uses HDF5, try using metadata cache to defer metadata operations", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/hdf5-cache.c"), + line_numbers=True, + background_color="default", + ), + }, ) insights_metadata.append( - message(INSIGHTS_POSIX_HIGH_METADATA_TIME, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_HIGH_METADATA_TIME, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) -def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None): - ''' +def check_shared_data_imblance( + stragglers_count, + detected_files, + file_map, + dxt_posix=None, + dxt_posix_read_data=None, + dxt_posix_write_data=None, +): + """ Check how many shared files containing data transfer imbalance Parameters: @@ -953,11 +1334,11 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_p data imbalance per file required columns: ['id', 'data_imbalance'] file_map: file id and file name pairing - ''' + """ if stragglers_count: - thresholds['imbalance_stragglers'][1] = True - issue = 'Detected data transfer imbalance caused by stragglers when accessing {} shared file.'.format( + thresholds["imbalance_stragglers"][1] = True + issue = "Detected data transfer imbalance caused by stragglers when accessing {} shared file.".format( stragglers_count ) @@ -968,52 +1349,73 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_p for index, row in detected_files.iterrows(): detail.append( { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - row['data_imbalance'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row["data_imbalance"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df_1 = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == int(row['id'])] - temp_df_2 = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == int(row['id'])] - - df_merged = pd.concat([temp_df_1, temp_df_2], ignore_index=True, sort=False) - df_merged['duration'] = df_merged['end_time'] - df_merged['start_time'] - df_merged.sort_values('duration', ascending=True, inplace=True) + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df_1 = dxt_posix_write_data.loc[ + dxt_posix_write_data["id"] == int(row["id"]) + ] + temp_df_2 = dxt_posix_read_data.loc[ + dxt_posix_read_data["id"] == int(row["id"]) + ] + + df_merged = pd.concat( + [temp_df_1, temp_df_2], ignore_index=True, sort=False + ) + df_merged["duration"] = ( + df_merged["end_time"] - df_merged["start_time"] + ) + df_merged.sort_values("duration", ascending=True, inplace=True) df_merged = df_merged.iloc[0] - rank_df = temp.loc[(temp['rank'] == int(df_merged['rank']))] - - if df_merged['operation'] == 'write': - rank_df = rank_df['write_segments'].iloc[0] - stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + rank_df = temp.loc[(temp["rank"] == int(df_merged["rank"]))] + + if 
df_merged["operation"] == "write": + rank_df = rank_df["write_segments"].iloc[0] + stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[ + 0 + ] + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] else: - rank_df = rank_df['read_segments'].iloc[0] - stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + rank_df = rank_df["read_segments"].iloc[0] + stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[ + 0 + ] + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] detail.append( { - 'message': 'The backtrace information for these imbalanced call(s) is given below:' + "message": "The backtrace information for these imbalanced call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) @@ -1021,69 +1423,94 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_p else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) - + end = time.time() time_taken = end - start dxt_trigger_time += time_taken - - if dxt_trigger_time > 0: + + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } + "message": "Consider tuning how your data is distributed in the file system by changing the stripe size and count", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), + }, ] insights_operation.append( - message(INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_SIZE_IMBALANCE, + TARGET_USER, + HIGH, + issue, + recommendation, + detail, + ) ) -def check_shared_data_imblance_split(slowest_rank_bytes, fastest_rank_bytes, total_transfer_size): - ''' +def check_shared_data_imblance_split( + slowest_rank_bytes, fastest_rank_bytes, total_transfer_size +): + """ Check whether the specific shared file contains data imbalance Parameters: slowest_rank_bytes: the 
total request size of the rank that takes the longest data operation time fastest_rank_bytes: the total request size of the rank that takes the shortest data operation time total_transfer_size: total request size of that specific shared file - ''' - - if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > thresholds['imbalance_stragglers'][0]: - thresholds['imbalance_stragglers'][1] = True - issue = 'Load imbalance of {:.2f}% detected'.format( + """ + + if ( + total_transfer_size + and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size + > thresholds["imbalance_stragglers"][0] + ): + thresholds["imbalance_stragglers"][1] = True + issue = "Load imbalance of {:.2f}% detected".format( abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } + "message": "Consider tuning how your data is distributed in the file system by changing the stripe size and count", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), + }, ] insights_operation.append( - message(INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation + ) ) def check_shared_time_imbalance(stragglers_count, detected_files, file_map): - ''' + """ Check how many shared files containing time transfer imbalance Parameters: @@ -1092,74 +1519,101 @@ def check_shared_time_imbalance(stragglers_count, detected_files, file_map): data imbalance per file required columns: ['id', 'time_imbalance'] file_map: file id and file name pairing - ''' + """ if stragglers_count: - thresholds['imbalance_stragglers'][1] = True - issue = 'Detected time imbalance caused by stragglers when accessing {} shared file.'.format( + thresholds["imbalance_stragglers"][1] = True + issue = "Detected time imbalance caused by stragglers when accessing {} shared file.".format( stragglers_count ) detail = [] - + for index, row in detected_files.iterrows(): detail.append( { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - row['time_imbalance'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row["time_imbalance"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) recommendation = [ { - 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give + "message": "Consider better distributing the data in the parallel file system" # needs to review what suggestion to give }, { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } + "message": "Consider tuning how your data is distributed in the file system by 
changing the stripe size and count", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), + }, ] insights_operation.append( - message(INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_TIME_IMBALANCE, + TARGET_USER, + HIGH, + issue, + recommendation, + detail, + ) ) -def check_shared_time_imbalance_split(slowest_rank_time, fastest_rank_time, total_transfer_time): - ''' +def check_shared_time_imbalance_split( + slowest_rank_time, fastest_rank_time, total_transfer_time +): + """ Check whether the specific shared file contains time imbalance Parameters: slowest_rank_bytes: the total request time of the rank that takes the longest data operation time fastest_rank_bytes: the total request time of the rank that takes the shortest data operation time total_transfer_size: total request time of that specific shared file - ''' - - if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > thresholds['imbalance_stragglers'][0]: - thresholds['imbalance_stragglers'][1] = True - issue = 'Load imbalance of {:.2f}% detected'.format( + """ + + if ( + total_transfer_time + and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time + > thresholds["imbalance_stragglers"][0] + ): + thresholds["imbalance_stragglers"][1] = True + issue = "Load imbalance of {:.2f}% detected".format( abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 ) recommendation = [ { - 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give + "message": "Consider better distributing the data in the parallel file system" # needs to review what suggestion to give }, { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } + "message": "Consider tuning how your data is distributed in the file system by changing the stripe size and count", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), + }, ] insights_operation.append( - message(INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation + ) ) -def check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_write_data=None): - ''' +def check_individual_write_imbalance( + imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_write_data=None +): + """ Check how many write imbalance when accessing individual files Parameters: @@ -1167,57 +1621,62 @@ def check_individual_write_imbalance(imbalance_count, detected_files, file_map, detected_files: write imbalance per file required columns: ['id', 'write_imbalance'] - ''' + """ if imbalance_count: - thresholds['imbalance_size'][1] = True - issue = 'Detected write imbalance when accessing {} individual files'.format( + thresholds["imbalance_size"][1] = True + issue = "Detected write imbalance when accessing {} individual files".format( imbalance_count ) detail = [] file_count = 0 dxt_trigger_time = 0 - + for index, row in detected_files.iterrows(): detail.append( { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - 
row['write_imbalance'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row["write_imbalance"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == int(row['id'])] - - maxClm = temp_df['length'].max() - temp_df = temp_df.loc[(temp_df['length'] == maxClm)] - rank_df = temp.loc[(temp['rank'] == int(temp_df['rank'].iloc[0]))] - - rank_df = rank_df['write_segments'].iloc[0] - stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df = dxt_posix_write_data.loc[ + dxt_posix_write_data["id"] == int(row["id"]) + ] + + maxClm = temp_df["length"].max() + temp_df = temp_df.loc[(temp_df["length"] == maxClm)] + rank_df = temp.loc[(temp["rank"] == int(temp_df["rank"].iloc[0]))] + + rank_df = rank_df["write_segments"].iloc[0] + stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[0] + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(res) + ] detail.append( { - 'message': 'The backtrace information for these imbalanced write call(s) is given below:' + "message": "The backtrace information for these imbalanced write call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) @@ -1225,82 +1684,119 @@ def check_individual_write_imbalance(imbalance_count, detected_files, file_map, else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } - ) - + ) + end = time.time() time_taken = end - start - dxt_trigger_time += time_taken - - if dxt_trigger_time > 0: + dxt_trigger_time += time_taken + + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider tuning the stripe size and count to better distribute the data", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If 
the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + "message": "If the application uses netCDF and HDF5 double-check the need to set NO_FILL values", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } + "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives" + }, ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) def check_individual_write_imbalance_split(max_bytes_written, min_bytes_written): - ''' + """ Check whether there is write imbalance in the specific individual file Parameters: max_bytes_written: max byte written in the file min_bytes_written: minimum byte written in the file - ''' - - if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > thresholds['imbalance_size'][0]: - thresholds['imbalance_size'][1] = True - issue = 'Load imbalance of {:.2f}% detected'.format( - abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 + """ + + if ( + max_bytes_written + and abs(max_bytes_written - min_bytes_written) / max_bytes_written + > thresholds["imbalance_size"][0] + ): + thresholds["imbalance_size"][1] = True + issue = "Load imbalance of {:.2f}% detected".format( + abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider tuning the stripe size and count to better distribute the data", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + "message": "If the application uses netCDF and HDF5 double-check the need to set NO_FILL values", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } + "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives" + }, ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) -def check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_read_data=None): - ''' +def 
check_individual_read_imbalance( + imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_read_data=None +): + """ Check how many read imbalance when accessing individual files Parameters: @@ -1308,57 +1804,62 @@ def check_individual_read_imbalance(imbalance_count, detected_files, file_map, d detected_files: read imbalance per file required columns: ['id', 'read_imbalance'] - ''' + """ if imbalance_count: - thresholds['imbalance_size'][1] = True - issue = 'Detected read imbalance when accessing {} individual files.'.format( + thresholds["imbalance_size"][1] = True + issue = "Detected read imbalance when accessing {} individual files.".format( imbalance_count ) detail = [] file_count = 0 dxt_trigger_time = 0 - + for index, row in detected_files.iterrows(): detail.append( { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - row['read_imbalance'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row["read_imbalance"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == int(row['id'])] - - maxClm = temp_df['length'].max() - temp_df = temp_df.loc[(temp_df['length'] == maxClm)] - rank_df = temp.loc[(temp['rank'] == int(temp_df['rank'].iloc[0]))] - - rank_df = rank_df['read_segments'].iloc[0] - stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df = dxt_posix_read_data.loc[ + dxt_posix_read_data["id"] == int(row["id"]) + ] + + maxClm = temp_df["length"].max() + temp_df = temp_df.loc[(temp_df["length"] == maxClm)] + rank_df = temp.loc[(temp["rank"] == int(temp_df["rank"].iloc[0]))] + + rank_df = rank_df["read_segments"].iloc[0] + stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[0] + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(res) + ] detail.append( { - 'message': 'The backtrace information for these imbalanced read call(s) is given below:' + "message": "The backtrace information for these imbalanced read call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) @@ -1366,84 +1867,126 @@ def check_individual_read_imbalance(imbalance_count, detected_files, file_map, d else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) end = time.time() time_taken = end - start dxt_trigger_time += time_taken - if dxt_trigger_time > 0: + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this 
trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider tuning the stripe size and count to better distribute the data", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + "message": "If the application uses netCDF and HDF5 double-check the need to set NO_FILL values", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } + "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives" + }, ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) def check_individual_read_imbalance_split(max_bytes_read, min_bytes_read): - ''' + """ Check whether there is read imbalance in the specific individual file Parameters: max_bytes_written: max byte read in the file min_bytes_written: minimum byte read in the file - ''' - - if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > thresholds['imbalance_size'][0]: - thresholds['imbalance_size'][1] = True - issue = 'Load imbalance of {:.2f}% detected'.format( - abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 + """ + + if ( + max_bytes_read + and abs(max_bytes_read - min_bytes_read) / max_bytes_read + > thresholds["imbalance_size"][0] + ): + thresholds["imbalance_size"][1] = True + issue = "Load imbalance of {:.2f}% detected".format( + abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider tuning the stripe size and count to better distribute the data", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + "message": "If the application uses netCDF and HDF5 double-check the need to set 
NO_FILL values", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } + "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives" + }, ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) # MPIIO level check -def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio=None): - ''' +def check_mpi_collective_read_operation( + mpiio_coll_reads, + mpiio_indep_reads, + total_mpiio_read_operations, + detected_files, + file_map, + dxt_mpiio=None, +): + """ Check whether application uses collective mpi read calls Parameters: @@ -1454,14 +1997,17 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot independent read operations and percentage per file required columns: ['id', 'absolute_indep_reads', 'percent_indep_reads'] file_map: file id and file name pairing - ''' + """ if mpiio_coll_reads == 0: - if total_mpiio_read_operations and total_mpiio_read_operations > thresholds['collective_operations_absolute'][0]: - thresholds['collective_operations_absolute'][1] = True - issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( - mpiio_indep_reads, - mpiio_indep_reads / total_mpiio_read_operations * 100 + if ( + total_mpiio_read_operations + and total_mpiio_read_operations + > thresholds["collective_operations_absolute"][0] + ): + thresholds["collective_operations_absolute"][1] = True + issue = "Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls".format( + mpiio_indep_reads, mpiio_indep_reads / total_mpiio_read_operations * 100 ) detail = [] @@ -1471,63 +2017,80 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot for index, row in detected_files.iterrows(): detail.append( { - 'message': '{} ({}%) of independent reads to "{}"'.format( - row['absolute_indep_reads'], - row['percent_indep_reads'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({}%) of independent reads to "{}"'.format( + row["absolute_indep_reads"], + row["percent_indep_reads"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - temp = dxt_mpiio.loc[(dxt_mpiio['id'] == int(row['id'])) & (dxt_mpiio['rank'] == 1)] - temp = temp['read_segments'].iloc[0] - stack_memory_addresses = temp['stack_memory_addresses'].iloc[0] - address = dxt_mpiio.iloc[0]['address_line_mapping']['address'] + temp = dxt_mpiio.loc[ + (dxt_mpiio["id"] == int(row["id"])) & (dxt_mpiio["rank"] == 1) + ] + temp = temp["read_segments"].iloc[0] + stack_memory_addresses = temp["stack_memory_addresses"].iloc[0] + address = dxt_mpiio.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_mpiio.iloc[0]['address_line_mapping'].loc[dxt_mpiio.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = 
dxt_mpiio.iloc[0]["address_line_mapping"].loc[ + dxt_mpiio.iloc[0]["address_line_mapping"]["address"].isin(res) + ] detail.append( { - 'message': 'The backtrace information for these read call(s) is given below:' + "message": "The backtrace information for these read call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) - + end = time.time() time_taken = end - start dxt_trigger_time += time_taken - - if dxt_trigger_time > 0: + + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + "message": "Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-read.c"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) else: - issue = 'Application uses MPI-IO and read data using {} ({:.2f}%) collective operations'.format( - mpiio_coll_reads, - mpiio_coll_reads / total_mpiio_read_operations * 100 + issue = "Application uses MPI-IO and read data using {} ({:.2f}%) collective operations".format( + mpiio_coll_reads, mpiio_coll_reads / total_mpiio_read_operations * 100 ) insights_operation.append( @@ -1535,8 +2098,15 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot ) -def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio=None): - ''' +def check_mpi_collective_write_operation( + mpiio_coll_writes, + mpiio_indep_writes, + total_mpiio_write_operations, + detected_files, + file_map, + dxt_mpiio=None, +): + """ Check whether application uses collective mpi write calls Parameters: @@ -1547,14 +2117,18 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, independent write operations and percentage per file required columns: ['id', 'absolute_indep_writes', 'percent_indep_writes'] file_map: file id and file name pairing - ''' + """ if mpiio_coll_writes == 0: - if total_mpiio_write_operations and total_mpiio_write_operations > thresholds['collective_operations_absolute'][0]: - thresholds['collective_operations_absolute'][1] = True - issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( + if ( + total_mpiio_write_operations + and total_mpiio_write_operations + > thresholds["collective_operations_absolute"][0] + ): + thresholds["collective_operations_absolute"][1] = True + issue = "Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write 
calls".format( mpiio_indep_writes, - mpiio_indep_writes / total_mpiio_write_operations * 100 + mpiio_indep_writes / total_mpiio_write_operations * 100, ) detail = [] @@ -1564,62 +2138,79 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, for index, row in detected_files.iterrows(): detail.append( { - 'message': '{} ({}%) independent writes to "{}"'.format( - row['absolute_indep_writes'], - row['percent_indep_writes'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({}%) independent writes to "{}"'.format( + row["absolute_indep_writes"], + row["percent_indep_writes"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - temp = dxt_mpiio.loc[(dxt_mpiio['id'] == int(row['id'])) & (dxt_mpiio['rank'] == 1)] - temp = temp['write_segments'].iloc[0] - stack_memory_addresses = temp['stack_memory_addresses'].iloc[0] - address = dxt_mpiio.iloc[0]['address_line_mapping']['address'] + temp = dxt_mpiio.loc[ + (dxt_mpiio["id"] == int(row["id"])) & (dxt_mpiio["rank"] == 1) + ] + temp = temp["write_segments"].iloc[0] + stack_memory_addresses = temp["stack_memory_addresses"].iloc[0] + address = dxt_mpiio.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_mpiio.iloc[0]['address_line_mapping'].loc[dxt_mpiio.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_mpiio.iloc[0]["address_line_mapping"].loc[ + dxt_mpiio.iloc[0]["address_line_mapping"]["address"].isin(res) + ] detail.append( { - 'message': 'The backtrace information for these write call(s) is given below:' + "message": "The backtrace information for these write call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) end = time.time() time_taken = end - start dxt_trigger_time += time_taken - + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Use collective write operations (e.g. MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + "message": "Use collective write operations (e.g. 
MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-write.c"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) else: - issue = 'Application uses MPI-IO and write data using {} ({:.2f}%) collective operations'.format( - mpiio_coll_writes, - mpiio_coll_writes / total_mpiio_write_operations * 100 + issue = "Application uses MPI-IO and write data using {} ({:.2f}%) collective operations".format( + mpiio_coll_writes, mpiio_coll_writes / total_mpiio_write_operations * 100 ) insights_operation.append( @@ -1627,8 +2218,10 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, ) -def check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules): - ''' +def check_mpi_none_block_operation( + mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules +): + """ Check whether application can benefit from non-blocking requests Parameters: @@ -1636,93 +2229,131 @@ def check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_ext mpiio_nb_writes: number of non-blocking mpi write operations has_hdf5_extension: boolean value of whether the file in in hdf5 extension modules: all different mudules been used in the application - ''' + """ if mpiio_nb_reads == 0: - issue = 'Application could benefit from non-blocking (asynchronous) reads' + issue = "Application could benefit from non-blocking (asynchronous) reads" recommendation = [] - if 'H5F' in modules or has_hdf5_extension: + if "H5F" in modules or has_hdf5_extension: recommendation.append( { - 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-read.c'), line_numbers=True, background_color='default') + "message": "Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/hdf5-vol-async-read.c"), + line_numbers=True, + background_color="default", + ), } ) - if 'MPI-IO' in modules: + if "MPI-IO" in modules: recommendation.append( { - 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iread.c'), line_numbers=True, background_color='default') + "message": "Since you use MPI-IO, consider non-blocking/asynchronous I/O operations", # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())', + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-iread.c"), + line_numbers=True, + background_color="default", + ), } ) insights_operation.append( - message(INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + message( + INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, + TARGET_DEVELOPER, + WARN, + issue, + recommendation, + ) ) if mpiio_nb_writes == 0: - issue = 'Application could benefit from non-blocking (asynchronous) writes' + issue = "Application could benefit from non-blocking (asynchronous) writes" 
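        # The recommendations below point at the HDF5 ASYNC I/O VOL and MPI-IO
        # non-blocking snippets; as a rough sketch (an assumption, not taken from the
        # shipped snippet), a blocking MPI_File_write_all() call would become
        # MPI_File_iwrite_all() followed later by MPI_Wait() on the returned request,
        # overlapping the pending write with computation.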
recommendation = [] - if 'H5F' in modules or has_hdf5_extension: + if "H5F" in modules or has_hdf5_extension: recommendation.append( { - 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-write.c'), line_numbers=True, background_color='default') + "message": "Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/hdf5-vol-async-write.c"), + line_numbers=True, + background_color="default", + ), } ) - if 'MPI-IO' in modules: + if "MPI-IO" in modules: recommendation.append( { - 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iwrite.c'), line_numbers=True, background_color='default') + "message": "Since you use MPI-IO, consider non-blocking/asynchronous I/O operations", # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())', + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-iwrite.c"), + line_numbers=True, + background_color="default", + ), } ) insights_operation.append( - message(INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + message( + INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, + TARGET_DEVELOPER, + WARN, + issue, + recommendation, + ) ) def check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES): - ''' + """ Check whether application has used inter-node aggregators Parameters: - cb_nodes: + cb_nodes: NUMBER_OF_COMPUTE_NODES: - ''' + """ if cb_nodes > NUMBER_OF_COMPUTE_NODES: - issue = 'Application is using inter-node aggregators (which require network communication)' + issue = "Application is using inter-node aggregators (which require network communication)" recommendation = [ { - 'message': 'Set the MPI hints for the number of aggregators as one per compute node (e.g., cb_nodes={})'.format( + "message": "Set the MPI hints for the number of aggregators as one per compute node (e.g., cb_nodes={})".format( NUMBER_OF_COMPUTE_NODES ), - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-hints.bash'), line_numbers=True, background_color='default') + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-hints.bash"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_MPI_IO_AGGREGATORS_INTER, TARGET_USER, HIGH, issue, recommendation) + message( + INSIGHTS_MPI_IO_AGGREGATORS_INTER, + TARGET_USER, + HIGH, + issue, + recommendation, + ) ) if cb_nodes < NUMBER_OF_COMPUTE_NODES: - issue = 'Application is using intra-node aggregators' + issue = "Application is using intra-node aggregators" insights_operation.append( message(INSIGHTS_MPI_IO_AGGREGATORS_INTRA, TARGET_USER, OK, issue) ) if cb_nodes == NUMBER_OF_COMPUTE_NODES: - issue = 'Application is using one aggregator per compute node' + issue = "Application is using one aggregator per compute node" insights_operation.append( message(INSIGHTS_MPI_IO_AGGREGATORS_OK, TARGET_USER, OK, issue) @@ -1731,65 +2362,75 @@ def check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES): # Layout and export + def display_content(console): if insights_metadata: console.print( Panel( - Padding( - Group( - *insights_metadata - ), - (1, 1) - ), - 
title='METADATA', - title_align='left' + Padding(Group(*insights_metadata), (1, 1)), + title="METADATA", + title_align="left", ) ) if insights_operation: console.print( Panel( - Padding( - Group( - *insights_operation - ), - (1, 1) - ), - title='OPERATIONS', - title_align='left' + Padding(Group(*insights_operation), (1, 1)), + title="OPERATIONS", + title_align="left", ) ) if insights_dxt: console.print( Panel( - Padding( - Group( - *insights_dxt - ), - (1, 1) - ), - title='DXT', - title_align='left' + Padding(Group(*insights_dxt), (1, 1)), title="DXT", title_align="left" ) ) def display_thresholds(console): tholdMessage = { - 'imbalance_operations': 'Minimum imbalance requests ratio: [white]{}%[/white]'.format(thresholds['imbalance_operations'][0] * 100), - 'small_bytes': 'Minimum size of a small request: [white]{} bytes[/white]'.format(thresholds['small_bytes'][0]), - 'small_requests': 'Maximum small requests ratio: [white]{}%[/white]'.format(thresholds['small_requests'][0] * 100), - 'small_requests_absolute': 'Maximum small requests: [white]{}[/white]'.format(thresholds['small_requests_absolute'][0]), - 'misaligned_requests': 'Maximum misaligned requests ratio: [white]{}%[/white]'.format(thresholds['misaligned_requests'][0] * 100), - 'random_operations': 'Maximum random request ratio: [white]{}%[/white]'.format(thresholds['random_operations'][0] * 100), - 'random_operations_absolute': 'Maximum random requests: [white]{}[/white]'.format(thresholds['random_operations_absolute'][0]), - 'metadata_time_rank': 'Maximum metadata process time per rank: [white]{} seconds[/white]'.format(thresholds['metadata_time_rank'][0]), - 'imbalance_size': 'Maximum read/write size difference ratio: [white]{}%[/white]'.format(thresholds['imbalance_size'][0] * 100), - 'imbalance_stragglers': 'Maximum ratio difference among ranks: [white]{}%[/white]'.format(thresholds['imbalance_stragglers'][0] * 100), - 'interface_stdio': 'Maximum STDIO usage ratio: [white]{}%[/white]'.format(thresholds['interface_stdio'][0] * 100), - 'collective_operations': 'Minimum MPI collective operation usage ratio: [white]{}%[/white]'.format(thresholds['collective_operations'][0] * 100), - 'collective_operations_absolute': 'Minimum MPI collective operations: [white]{}[/white]'.format(thresholds['collective_operations_absolute'][0]), + "imbalance_operations": "Minimum imbalance requests ratio: [white]{}%[/white]".format( + thresholds["imbalance_operations"][0] * 100 + ), + "small_bytes": "Minimum size of a small request: [white]{} bytes[/white]".format( + thresholds["small_bytes"][0] + ), + "small_requests": "Maximum small requests ratio: [white]{}%[/white]".format( + thresholds["small_requests"][0] * 100 + ), + "small_requests_absolute": "Maximum small requests: [white]{}[/white]".format( + thresholds["small_requests_absolute"][0] + ), + "misaligned_requests": "Maximum misaligned requests ratio: [white]{}%[/white]".format( + thresholds["misaligned_requests"][0] * 100 + ), + "random_operations": "Maximum random request ratio: [white]{}%[/white]".format( + thresholds["random_operations"][0] * 100 + ), + "random_operations_absolute": "Maximum random requests: [white]{}[/white]".format( + thresholds["random_operations_absolute"][0] + ), + "metadata_time_rank": "Maximum metadata process time per rank: [white]{} seconds[/white]".format( + thresholds["metadata_time_rank"][0] + ), + "imbalance_size": "Maximum read/write size difference ratio: [white]{}%[/white]".format( + thresholds["imbalance_size"][0] * 100 + ), + "imbalance_stragglers": 
"Maximum ratio difference among ranks: [white]{}%[/white]".format( + thresholds["imbalance_stragglers"][0] * 100 + ), + "interface_stdio": "Maximum STDIO usage ratio: [white]{}%[/white]".format( + thresholds["interface_stdio"][0] * 100 + ), + "collective_operations": "Minimum MPI collective operation usage ratio: [white]{}%[/white]".format( + thresholds["collective_operations"][0] * 100 + ), + "collective_operations_absolute": "Minimum MPI collective operations: [white]{}[/white]".format( + thresholds["collective_operations_absolute"][0] + ), } toBeAppend = [] @@ -1802,24 +2443,19 @@ def display_thresholds(console): toBeAppend.append(message) console.print( - Panel( - '\n'.join(toBeAppend), - title='THRESHOLDS', - title_align='left', - padding=1 - ) + Panel("\n".join(toBeAppend), title="THRESHOLDS", title_align="left", padding=1) ) def display_footer(console, insights_start_time, insights_end_time): console.print( Panel( - ' {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds'.format( + " {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds".format( datetime.datetime.now().year, datetime.datetime.now(), - insights_end_time - insights_start_time + insights_end_time - insights_start_time, ), - box=box.SIMPLE + box=box.SIMPLE, ) ) @@ -1828,37 +2464,28 @@ def export_html(console, export_dir, trace_name): if not args.export_html: return - os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists + os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists filepath = os.path.join(export_dir, f"{trace_name}.html") - console.save_html( - filepath, - theme=set_export_theme(), - clear=False - ) + console.save_html(filepath, theme=set_export_theme(), clear=False) def export_svg(console, export_dir, trace_name): if not args.export_svg: return - - os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists + + os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists filepath = os.path.join(export_dir, f"{trace_name}.svg") - console.save_svg( - filepath, - title='Drishti', - theme=set_export_theme(), - clear=False - ) + console.save_svg(filepath, title="Drishti", theme=set_export_theme(), clear=False) def export_csv(export_dir, trace_name, jobid=None): if not args.export_csv: return - + issues = [ - 'JOB', + "JOB", INSIGHTS_STDIO_HIGH_USAGE, INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, INSIGHTS_POSIX_READ_COUNT_INTENSIVE, @@ -1890,23 +2517,21 @@ def export_csv(export_dir, trace_name, jobid=None): INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, INSIGHTS_MPI_IO_AGGREGATORS_INTRA, INSIGHTS_MPI_IO_AGGREGATORS_INTER, - INSIGHTS_MPI_IO_AGGREGATORS_OK + INSIGHTS_MPI_IO_AGGREGATORS_OK, ] if codes: issues.extend(codes) detected_issues = dict.fromkeys(issues, False) - detected_issues['JOB'] = jobid + detected_issues["JOB"] = jobid for report in csv_report: detected_issues[report] = True - - os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists + os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists filepath = os.path.join(export_dir, f"{trace_name}.csv") - with open(filepath, 'w') as f: + with open(filepath, "w") as f: w = csv.writer(f) w.writerow(detected_issues.keys()) w.writerow(detected_issues.values()) - From 478612a3ed3a9ea0c4368f5d8bdb6190e07587a7 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 17 Mar 2025 23:22:20 -0600 Subject: [PATCH 02/43] chore: Explicitly define imports from `drishti.includes.config` --- drishti/includes/module.py | 27 
++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drishti/includes/module.py b/drishti/includes/module.py index c0d91ef..e7f70d6 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -1,12 +1,37 @@ #!/usr/bin/env python3 -import datetime import csv +import datetime +import os import time + import pandas as pd from rich import box +from rich.console import Group +from rich.padding import Padding +from rich.panel import Panel from rich.syntax import Syntax + from drishti.includes.config import * +from drishti.includes.config import ( + HIGH, + INFO, + OK, + ROOT, + TARGET_DEVELOPER, + TARGET_USER, + WARN, + codes, + convert_bytes, + csv_report, + insights_dxt, + insights_metadata, + insights_operation, + message, + set_export_theme, + thresholds, +) +from drishti.includes.parser import args """ Before calling the functions below From 23e64e20a796d28ecef27ea2b699aed14a8b5e57 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 17 Mar 2025 23:27:01 -0600 Subject: [PATCH 03/43] chore: Add PyCharm configuration files --- .idea/.gitignore | 8 ++++++++ .idea/drishti-io.iml | 14 ++++++++++++++ .idea/inspectionProfiles/profiles_settings.xml | 6 ++++++ .idea/misc.xml | 4 ++++ .idea/modules.xml | 8 ++++++++ .idea/vcs.xml | 6 ++++++ 6 files changed, 46 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/drishti-io.iml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/drishti-io.iml b/.idea/drishti-io.iml new file mode 100644 index 0000000..7b26d7f --- /dev/null +++ b/.idea/drishti-io.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..1d40550 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..c4fcf4c --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file From 4a83149f5006d5bae38f8b3821e0dd0a47db0d6c Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 24 Mar 2025 11:51:19 -0600 Subject: [PATCH 04/43] feat: Update log path argument to support multiple inputs and enhance log type checking --- drishti/includes/parser.py | 120 +++++++++++++++++-------------------- drishti/reporter.py | 62 +++++++++++++------ 2 files changed, 101 insertions(+), 81 deletions(-) diff --git a/drishti/includes/parser.py b/drishti/includes/parser.py index 28dcd63..ed58b1d 100644 --- a/drishti/includes/parser.py +++ b/drishti/includes/parser.py @@ -1,128 +1,120 @@ import argparse 
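# A single positional argument collects one or more inputs (Darshan .darshan
# files or Recorder folders); the remaining flags only toggle report content
# and export formats.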
-parser = argparse.ArgumentParser( - description='Drishti: ' -) +parser = argparse.ArgumentParser(description="Drishti: ") parser.add_argument( - 'log_path', - help='Input .darshan file or recorder folder' + "log_paths", nargs="+", help="Input .darshan file or recorder folder" ) parser.add_argument( - '--issues', + "--issues", default=False, - action='store_true', - dest='only_issues', - help='Only displays the detected issues and hides the recommendations' + action="store_true", + dest="only_issues", + help="Only displays the detected issues and hides the recommendations", ) parser.add_argument( - '--html', + "--html", default=False, - action='store_true', - dest='export_html', - help='Export the report as an HTML page' + action="store_true", + dest="export_html", + help="Export the report as an HTML page", ) parser.add_argument( - '--svg', + "--svg", default=False, - action='store_true', - dest='export_svg', - help='Export the report as an SVG image' + action="store_true", + dest="export_svg", + help="Export the report as an SVG image", ) parser.add_argument( - '--light', + "--light", default=False, - action='store_true', - dest='export_theme_light', - help='Use a light theme for the report when generating files' + action="store_true", + dest="export_theme_light", + help="Use a light theme for the report when generating files", ) parser.add_argument( - '--size', + "--size", default=False, - dest='export_size', - help='Console width used for the report and generated files' + dest="export_size", + help="Console width used for the report and generated files", ) parser.add_argument( - '--verbose', + "--verbose", default=False, - action='store_true', - dest='verbose', - help='Display extended details for the recommendations' + action="store_true", + dest="verbose", + help="Display extended details for the recommendations", ) parser.add_argument( - '--threshold', + "--threshold", default=False, - action='store_true', - dest='thold', - help='Display all thresholds used for the report' + action="store_true", + dest="thold", + help="Display all thresholds used for the report", ) parser.add_argument( - '--code', + "--code", default=False, - action='store_true', - dest='code', - help='Display insights identification code' + action="store_true", + dest="code", + help="Display insights identification code", ) parser.add_argument( - '--backtrace', + "--backtrace", default=False, - action='store_true', - dest='backtrace', - help='Enable DXT insights and backtrace' + action="store_true", + dest="backtrace", + help="Enable DXT insights and backtrace", ) parser.add_argument( - '--path', + "--path", default=False, - action='store_true', - dest='full_path', - help='Display the full file path for the files that triggered the issue' + action="store_true", + dest="full_path", + help="Display the full file path for the files that triggered the issue", ) parser.add_argument( - '--csv', + "--csv", default=False, - action='store_true', - dest='export_csv', - help='Export a CSV with the code of all issues that were triggered' + action="store_true", + dest="export_csv", + help="Export a CSV with the code of all issues that were triggered", ) parser.add_argument( - '--export_dir', + "--export_dir", default="", - dest='export_dir', - help='Specify the directory prefix for the output files (if any)' + dest="export_dir", + help="Specify the directory prefix for the output files (if any)", ) -parser.add_argument( - '--json', - default=False, - dest='json', - help=argparse.SUPPRESS -) +parser.add_argument("--json", 
default=False, dest="json", help=argparse.SUPPRESS) parser.add_argument( - '--split', + "--split", default=False, - action='store_true', - dest='split_files', - help='Split the files and generate report for each file' + action="store_true", + dest="split_files", + help="Split the files and generate report for each file", ) parser.add_argument( - '--config', + "--config", default=False, - dest='config', - help='Enable thresholds read from json file' + dest="config", + help="Enable thresholds read from json file", ) args = parser.parse_args() diff --git a/drishti/reporter.py b/drishti/reporter.py index 8455040..a6a8401 100644 --- a/drishti/reporter.py +++ b/drishti/reporter.py @@ -3,10 +3,12 @@ import os import sys from subprocess import call -from drishti.includes.parser import * +from typing import List, Optional +# from includes.parser import * # imports {'parser', 'args', 'argparse'} +from drishti.includes.parser import args -''' +""" |- handler_darshan -| | | reporter -> /handlers -> |- handler_recorder -| -| @@ -15,8 +17,7 @@ ________________________________________________| | |-----> /includes -> module -> config -> parser -''' - +""" LOG_TYPE_DARSHAN = 0 LOG_TYPE_RECORDER = 1 @@ -26,30 +27,57 @@ def clear(): """ Clear the screen with the comment call based on the operating system. """ - _ = call('clear' if os.name == 'posix' else 'cls') + _ = call("clear" if os.name == "posix" else "cls") + + +def check_log_type(paths: List[str]) -> Optional[int]: + is_darshan = True + is_recorder = True + multiple_logs = len(paths) > 1 + for path in paths: + if path.endswith(".darshan"): + if not os.path.isfile(path): + print("Unable to open .darshan file.") + sys.exit(os.EX_NOINPUT) + else: + is_darshan = True and is_darshan + is_recorder = False and is_recorder + else: # check whether is a valid recorder log + if not os.path.isdir(path): + print("Unable to open recorder folder.") + sys.exit(os.EX_NOINPUT) + else: + is_recorder = True and is_recorder + is_darshan = False and is_darshan -def check_log_type(path): - if path.endswith('.darshan'): - if not os.path.isfile(path): - print('Unable to open .darshan file.') + if multiple_logs: + if is_darshan: + return LOG_TYPE_DARSHAN + else: + print("Only .darshan files are supported for multiple logs.") sys.exit(os.EX_NOINPUT) - else: return LOG_TYPE_DARSHAN - else: # check whether is a valid recorder log - if not os.path.isdir(path): - print('Unable to open recorder folder.') + else: + if is_darshan and not is_recorder: + return LOG_TYPE_DARSHAN + elif is_recorder and not is_darshan: + return LOG_TYPE_RECORDER + else: + print("Unable to reliably determine the log type.") sys.exit(os.EX_NOINPUT) - else: return LOG_TYPE_RECORDER def main(): - log_type = check_log_type(args.log_path) - + log_type = check_log_type(args.log_paths) + if log_type == LOG_TYPE_DARSHAN: from drishti.handlers.handle_darshan import handler elif log_type == LOG_TYPE_RECORDER: from drishti.handlers.handle_recorder import handler - + handler() + +if __name__ == "__main__": + main() From 2940cfeeca792fb0477c922019f51a11bd5ed18e Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 24 Mar 2025 12:06:47 -0600 Subject: [PATCH 05/43] chore: Add Black component configuration for Python 3.13 SDK --- .idea/misc.xml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.idea/misc.xml b/.idea/misc.xml index 1d40550..a366115 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,7 @@ + + \ No newline at end of file From 4b1a58f0cc44d0155780159620b3597544264429 Mon Sep 17 00:00:00 2001 From: 
Joel Tony Date: Mon, 24 Mar 2025 12:14:55 -0600 Subject: [PATCH 06/43] chore: Refactor imports in handle_darshan.py for clarity and organization --- drishti/handlers/handle_darshan.py | 55 ++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index ea690f3..8a16b71 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -1,18 +1,59 @@ #!/usr/bin/env python3 +import csv +import datetime import io -import sys -import time +import os import shlex import shutil import subprocess -import pandas as pd +import sys +import time + import darshan import darshan.backend.cffi_backend as darshanll - -from rich import print +import pandas as pd from packaging import version -from drishti.includes.module import * +from rich import print +from rich.padding import Padding +from rich.panel import Panel + +from drishti.includes.config import ( + HIGH, + RECOMMENDATIONS, + WARN, + init_console, + insights_total, + thresholds, +) +# from drishti.includes.module import * +from drishti.includes.module import ( + check_individual_read_imbalance, + check_individual_write_imbalance, + check_long_metadata, + check_misaligned, + check_mpi_aggregator, + check_mpi_collective_read_operation, + check_mpi_collective_write_operation, + check_mpi_none_block_operation, + check_mpiio, + check_operation_intensive, + check_random_operation, + check_shared_data_imblance, + check_shared_small_operation, + check_shared_time_imbalance, + check_size_intensive, + check_small_operation, + check_stdio, + check_traffic, + display_content, + display_footer, + display_thresholds, + export_csv, + export_html, + export_svg, +) +from drishti.includes.parser import args def is_available(name): @@ -494,7 +535,7 @@ def handler(): detected_files = [] stragglers_count = 0 - stragglers_imbalance = {} + # stragglers_imbalance = {} shared_files_times = shared_files_times.assign(id=lambda d: d['id'].astype(str)) From b09300dde91e3fcefc5cd8acc7525e816f79bba4 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 24 Mar 2025 12:19:24 -0600 Subject: [PATCH 07/43] hotfix: Update log path handling to support multiple log paths and ensure consistency --- drishti/handlers/handle_darshan.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 8a16b71..4fc3c3a 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -116,7 +116,8 @@ def handler(): insights_start_time = time.time() - log = darshanll.log_open(args.log_path) + darshan_log_path = args.log_paths[0] + log = darshanll.log_open(darshan_log_path) modules = darshanll.log_get_modules(log) @@ -129,8 +130,8 @@ def handler(): library_version = darshanll.get_lib_version() # Make sure log format is of the same version - filename = args.log_path - # check_log_version(console, args.log_path, log_version, library_version) + filename = darshan_log_path + # check_log_version(console, darshan_log_path, log_version, library_version) darshanll.log_close(log) @@ -752,7 +753,7 @@ def handler(): job['exe'].split()[0] ), ' [b]DARSHAN[/b]: [white]{}[/white]'.format( - os.path.basename(args.log_path) + os.path.basename(darshan_log_path) ), ' [b]EXECUTION TIME[/b]: [white]{} to {} ({:.2f} hours)[/white]'.format( job_start, @@ -794,7 +795,7 @@ def handler(): display_footer(console, insights_start_time, insights_end_time) # Export to HTML, SVG, and 
CSV - trace_name = os.path.basename(args.log_path).replace('.darshan', '') + trace_name = os.path.basename(darshan_log_path).replace('.darshan', '') out_dir = args.export_dir if args.export_dir != "" else os.getcwd() export_html(console, out_dir, trace_name) From b6bfec2c7e628bd587b3025823771a163819cee5 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sat, 29 Mar 2025 13:56:23 -0600 Subject: [PATCH 08/43] chore: Update project SDK name from Python 3.13 to uv in IDE configuration files --- .idea/drishti-io.iml | 2 +- .idea/misc.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.idea/drishti-io.iml b/.idea/drishti-io.iml index 7b26d7f..c645b1e 100644 --- a/.idea/drishti-io.iml +++ b/.idea/drishti-io.iml @@ -4,7 +4,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index a366115..90404e0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file From fcf1c2519124be546ccd95e75efbc597553c18a9 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sat, 29 Mar 2025 14:00:10 -0600 Subject: [PATCH 09/43] chore: Add Mypy configuration and update setup.py for development dependencies --- .idea/MypyPlugin.xml | 8 ++++++++ setup.py | 7 +++++++ 2 files changed, 15 insertions(+) create mode 100644 .idea/MypyPlugin.xml diff --git a/.idea/MypyPlugin.xml b/.idea/MypyPlugin.xml new file mode 100644 index 0000000..ac4cd76 --- /dev/null +++ b/.idea/MypyPlugin.xml @@ -0,0 +1,8 @@ + + + + + \ No newline at end of file diff --git a/setup.py b/setup.py index a93a8ce..c3b9d6c 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,13 @@ 'rich==12.5.1', 'recorder-utils', ], + extras_require={ + 'dev': [ + 'ruff', + 'isort', + 'mypy' + ], + }, packages=find_packages(), package_data={ 'drishti.includes': [ From 611bfa177c89e39d25ad1ac847a4e8e7963e1c3e Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 1 Apr 2025 22:14:38 -0600 Subject: [PATCH 10/43] chore: Exclude .history folder from module content in IDE configuration --- .idea/drishti-io.iml | 1 + 1 file changed, 1 insertion(+) diff --git a/.idea/drishti-io.iml b/.idea/drishti-io.iml index c645b1e..eb59906 100644 --- a/.idea/drishti-io.iml +++ b/.idea/drishti-io.iml @@ -2,6 +2,7 @@ + From f2cdd50eb03f381aedd5bd6343125a9c8fea9498 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 1 Apr 2025 22:16:16 -0600 Subject: [PATCH 11/43] chore: Exclude sample/tensorflow_unet3d_darshan_per_rank_workload from project configuration --- .gitignore | 2 ++ .idea/drishti-io.iml | 1 + 2 files changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index d3c0162..74cfd33 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +sample/tensorflow_unet3d_darshan_per_rank_workload + # Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,pycharm,visualstudiocode # Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,pycharm,visualstudiocode diff --git a/.idea/drishti-io.iml b/.idea/drishti-io.iml index eb59906..883789c 100644 --- a/.idea/drishti-io.iml +++ b/.idea/drishti-io.iml @@ -4,6 +4,7 @@ + From fa2cd1969bae6206c2db57ba30b7caf86d3f05c9 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 1 Apr 2025 22:17:38 -0600 Subject: [PATCH 12/43] refactor: Update argument access to use parser module for consistency --- drishti/handlers/handle_darshan.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 4fc3c3a..c687d4c 100644 --- a/drishti/handlers/handle_darshan.py 
+++ b/drishti/handlers/handle_darshan.py @@ -53,7 +53,8 @@ export_html, export_svg, ) -from drishti.includes.parser import args +import drishti.includes.parser as parser +# from drishti.includes.parser import args def is_available(name): @@ -116,7 +117,7 @@ def handler(): insights_start_time = time.time() - darshan_log_path = args.log_paths[0] + darshan_log_path = parser.args.log_paths[0] log = darshanll.log_open(darshan_log_path) modules = darshanll.log_get_modules(log) @@ -199,12 +200,12 @@ def handler(): if "LUSTRE" in report.records: df_lustre = report.records['LUSTRE'].to_df() - if args.backtrace: + if parser.args.backtrace: if "DXT_POSIX" in report.records: dxt_posix = report.records["DXT_POSIX"].to_df() dxt_posix = pd.DataFrame(dxt_posix) if "address_line_mapping" not in dxt_posix: - args.backtrace = False + parser.args.backtrace = False else: read_id = [] read_rank = [] @@ -349,7 +350,7 @@ def handler(): # Get total number of I/O operations total_operations = total_writes + total_reads - # To check whether the application is write-intersive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance + # To check whether the application is write-intensive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance check_operation_intensive(total_operations, total_reads, total_writes) total_read_size = df['counters']['POSIX_BYTES_READ'].sum() @@ -796,7 +797,7 @@ def handler(): # Export to HTML, SVG, and CSV trace_name = os.path.basename(darshan_log_path).replace('.darshan', '') - out_dir = args.export_dir if args.export_dir != "" else os.getcwd() + out_dir = parser.args.export_dir if parser.args.export_dir != "" else os.getcwd() export_html(console, out_dir, trace_name) export_svg(console, out_dir, trace_name) From 148ee9a74b81c61ade7e47e71511e7fbf121b6db Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 1 Apr 2025 22:18:20 -0600 Subject: [PATCH 13/43] fix: Add type ignore comments for darshan imports to resolve type checking issues --- drishti/handlers/handle_darshan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index c687d4c..a8453c9 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -10,8 +10,8 @@ import sys import time -import darshan -import darshan.backend.cffi_backend as darshanll +import darshan # type: ignore +import darshan.backend.cffi_backend as darshanll # type: ignore import pandas as pd from packaging import version from rich import print From 783e3b42d4c4a741b0ac2712c79edca9acb70017 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 1 Apr 2025 22:18:55 -0600 Subject: [PATCH 14/43] feat: Import DarshanFile from darshan_util for enhanced functionality --- drishti/handlers/darshan_util.py | 290 +++++++++++++++++++++++++++++ drishti/handlers/handle_darshan.py | 2 + 2 files changed, 292 insertions(+) create mode 100644 drishti/handlers/darshan_util.py diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py new file mode 100644 index 0000000..f44bb50 --- /dev/null +++ b/drishti/handlers/darshan_util.py @@ -0,0 +1,290 @@ +import datetime +import typing +from dataclasses import dataclass, field +from enum import Enum +from functools import cached_property +from typing import Dict, Final, 
Optional, Union + +import pandas as pd + + +class ModuleType(str, Enum): + """Enum for standard I/O module types""" + POSIX = "posix" + STDIO = "stdio" + MPIIO = "mpiio" + + def __str__(self) -> str: + return self.value + + +@dataclass +class TimeSpan: + start: datetime.datetime + end: datetime.datetime + + def __post_init__(self): + if self.start > self.end: + raise ValueError(f"TimeSpan start ({self.start}) must be <= end ({self.end})") + +@dataclass +class IOCounter: + """Base class for I/O metrics with read/write counts""" + read: Final[int] = field(init=True) + write: Final[int] = field(init=True) + _total: Optional[int] = None + + @cached_property + def total(self) -> int: + """Total count, calculated once on first access""" + if self._total is not None: + return self._total + return self.read + self.write + +@dataclass +class IOSize(IOCounter): + """Represents I/O size statistics in bytes""" + pass + +@dataclass +class IOOperation(IOCounter): + """Represents I/O operation count statistics""" + pass + + +@dataclass +class IOStatistics: + """Tracks both I/O sizes and operations by module with aggregated metrics""" + # Use dicts to store module-specific data + sizes: Dict[Union[ModuleType, str], IOSize] = field(default_factory=dict) + operations: Dict[Union[ModuleType, str], IOOperation] = field(default_factory=dict) + + def __post_init__(self): + # Initialize standard modules if not present + for module in ModuleType: + # Ensure that the module is either in both sizes and operations or in neither + assert (module in self.sizes) == (module in self.operations), f"Module {module} should be in both sizes and operations or in neither" + + if module not in self.sizes: + self.sizes[module] = IOSize(read=0, write=0) + if module not in self.operations: + self.operations[module] = IOOperation(read=0, write=0) + + # Convenience properties for standard modules + @cached_property + def posix_size(self) -> int: + return self.sizes[ModuleType.POSIX].total + + @cached_property + def stdio_size(self) -> int: + return self.sizes[ModuleType.STDIO].total + + @cached_property + def mpiio_size(self) -> int: + return self.sizes[ModuleType.MPIIO].total + + @cached_property + def posix_ops(self) -> int: + return self.operations[ModuleType.POSIX].total + + @cached_property + def stdio_ops(self) -> int: + return self.operations[ModuleType.STDIO].total + + @cached_property + def mpiio_ops(self) -> int: + return self.operations[ModuleType.MPIIO].total + + # Aggregated size properties + @cached_property + def read_bytes(self) -> int: + """Total bytes read across all modules.""" + return sum(size.read for size in self.sizes.values()) + + @cached_property + def written_bytes(self) -> int: + """Total bytes written across all modules.""" + return sum(size.write for size in self.sizes.values()) + + @cached_property + def total_bytes(self) -> int: + """Total bytes transferred across all modules.""" + return self.read_bytes + self.written_bytes + + # Aggregated operation properties + @cached_property + def reads(self) -> int: + """Total read operations across all modules.""" + return sum(op.read for op in self.operations.values()) + + @cached_property + def writes(self) -> int: + """Total write operations across all modules.""" + return sum(op.write for op in self.operations.values()) + + @cached_property + def total_ops(self) -> int: + """Total operations across all modules.""" + return self.reads + self.writes + + # Methods to get stats for specific modules + def get_module_size(self, module: Optional[Union[ModuleType, 
str]] = None, data_type: Optional[str] = "total") -> int: + """Get size statistics for a specific module or all modules if not specified.""" + if module is None and data_type is None: + raise ValueError("Both module and data_type cannot be None") + + if module: + if module not in self.sizes: + raise ValueError(f"Module {module} not found in sizes") + size = self.sizes[module] + if data_type == "read": + return size.read + elif data_type == "write": + return size.write + else: # data_type is None or "total" + return size.total + else: + if data_type == "read": + return self.read_bytes + elif data_type == "write": + return self.written_bytes + else: # data_type is None or "total" + return self.total_bytes + + def get_module_ops(self, module: Optional[Union[ModuleType, str]] = None, data_type: Optional[str] = "total") -> int: + """Get operation statistics for a specific module or all modules if not specified.""" + if module is None and data_type is None: + raise ValueError("Both module and data_type cannot be None") + + if module: + if module not in self.operations: + raise ValueError(f"Module {module} not found in operations") + ops = self.operations[module] + if data_type == "read": + return ops.read + elif data_type == "write": + return ops.write + else: # data_type is None or "total" + return ops.total + else: + if data_type == "read": + return self.reads + elif data_type == "write": + return self.writes + else: # data_type is None or "total" + return self.total_ops + +@dataclass +class SmallIOStats(IOCounter): + """Statistics for small I/O operations""" + pass # Inherits read/write/total from IOCounter + +@dataclass +class SharedOpsStats(IOCounter): + """Statistics for shared file operations""" + pass # Inherits read/write/total from IOCounter + +@dataclass +class SharedSmallOpsStats(IOCounter): + """Statistics for small shared file operations""" + pass # Inherits read/write/total from IOCounter + +@dataclass +class ConsecutiveIOStats(IOCounter): + """Statistics for consecutive I/O operations""" + pass # Inherits read/write/total from IOCounter + +@dataclass +class SequentialIOStats(IOCounter): + """Statistics for sequential I/O operations""" + pass # Inherits read/write/total from IOCounter + +@dataclass +class RandomIOStats(IOCounter): + """Statistics for random I/O operations""" + pass # Inherits read/write/total from IOCounter + +@dataclass +class MPIIONonBlockingStats(IOCounter): + """Statistics for non-blocking MPI I/O operations""" + pass + +@dataclass +class MPICollectiveIOStats(IOCounter): + """Statistics for collective MPI I/O operations""" + pass + +@dataclass +class MPIIndependentIOStats(IOCounter): + """Statistics for independent MPI I/O operations""" + pass + +@dataclass +class AccessPatternStats: + """Statistics for I/O access patterns by pattern type""" + consecutive: ConsecutiveIOStats = field(default_factory=lambda: ConsecutiveIOStats(read=0, write=0)) + sequential: SequentialIOStats = field(default_factory=lambda: SequentialIOStats(read=0, write=0)) + random: RandomIOStats = field(default_factory=lambda: RandomIOStats(read=0, write=0)) + +@dataclass +class DarshanFile: + # TODO: All fields which are not calculated should be instantly populated and not optional + # TODO: Explore using typeddicts instead of dicts + job_id: Optional[str] = None + log_ver: Optional[str] = None + time: Optional[TimeSpan] = None + exe: Optional[str] = None + modules: Optional[typing.Iterable[str]] = None + name_records: Optional[typing.Dict[str, str]] = None + max_read_offset: Optional[int] 
= None + max_write_offset: Optional[int] = None + total_files_stdio: Optional[int] = None + total_files_posix: Optional[int] = None + total_files_mpiio: Optional[int] = None + files: Optional[typing.Dict[str, str]] = None + + # Replace individual I/O stats with IOStatistics class + io_stats: Optional[IOStatistics] = None + + # File counts + total_files: Optional[int] = 0 + + # Additional I/O statistics organized by category + small_io: Optional[SmallIOStats] = None + + # Direct alignment fields instead of a class + mem_not_aligned: Optional[int] = None + file_not_aligned: Optional[int] = None + + access_pattern: Optional[AccessPatternStats] = None + + # Use separate classes for shared operations + shared_ops: Optional[SharedOpsStats] = None + shared_small_ops: Optional[SharedSmallOpsStats] = None + + count_long_metadata: Optional[int] = None + posix_shared_data_imbalance_stragglers_count: Optional[int] = None + + has_hdf5_extension: Optional[bool] = None + + mpiio_nb_ops: Optional[MPIIONonBlockingStats] = None + + cb_nodes: Optional[int] = None + number_of_compute_nodes: Optional[int] = None + hints: Optional[list[str]] = None + + timestamp: Optional[TimeSpan] = None + + aggregated: Optional[pd.DataFrame] = None + + mpi_coll_ops: Optional[MPICollectiveIOStats] = None + mpi_indep_ops: Optional[MPIIndependentIOStats] = None + + detected_files_mpi_coll_reads: Optional[pd.DataFrame] = None + detected_files_mpi_coll_writes: Optional[pd.DataFrame] = None + + imbalance_count_posix_shared_time: Optional[int] = None + posix_shared_time_imbalance_detected_files: Optional[tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]] = None + + + diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index a8453c9..ce9e4e4 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -18,6 +18,8 @@ from rich.padding import Padding from rich.panel import Panel +from drishti.handlers.darshan_util import DarshanFile + from drishti.includes.config import ( HIGH, RECOMMENDATIONS, From 90d23e8f1de8eb61fb13eba7029ae067ab6dd7c5 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 1 Apr 2025 23:00:41 -0600 Subject: [PATCH 15/43] fix: Update default_factory arguments in AccessPatternStats for proper initialization --- drishti/handlers/darshan_util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index f44bb50..175e453 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -222,9 +222,9 @@ class MPIIndependentIOStats(IOCounter): @dataclass class AccessPatternStats: """Statistics for I/O access patterns by pattern type""" - consecutive: ConsecutiveIOStats = field(default_factory=lambda: ConsecutiveIOStats(read=0, write=0)) - sequential: SequentialIOStats = field(default_factory=lambda: SequentialIOStats(read=0, write=0)) - random: RandomIOStats = field(default_factory=lambda: RandomIOStats(read=0, write=0)) + consecutive: ConsecutiveIOStats = field(default_factory=lambda: ConsecutiveIOStats(read=0, write=0), init=True) + sequential: SequentialIOStats = field(default_factory=lambda: SequentialIOStats(read=0, write=0), init=True) + random: RandomIOStats = field(default_factory=lambda: RandomIOStats(read=0, write=0), init=True) @dataclass class DarshanFile: From ba3381d813032e9964f137ce6076b91a2e4093a9 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Thu, 3 Apr 2025 11:46:42 -0600 Subject: [PATCH 16/43] fix: Update type hints in for Python 3.8 
compatability --- drishti/handlers/darshan_util.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 175e453..6c9090e 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field from enum import Enum from functools import cached_property -from typing import Dict, Final, Optional, Union +from typing import Dict, Final, Optional, Union, List, Tuple, Iterable import pandas as pd @@ -234,14 +234,14 @@ class DarshanFile: log_ver: Optional[str] = None time: Optional[TimeSpan] = None exe: Optional[str] = None - modules: Optional[typing.Iterable[str]] = None - name_records: Optional[typing.Dict[str, str]] = None + modules: Optional[Iterable[str]] = None + name_records: Optional[Dict[str, str]] = None max_read_offset: Optional[int] = None max_write_offset: Optional[int] = None total_files_stdio: Optional[int] = None total_files_posix: Optional[int] = None total_files_mpiio: Optional[int] = None - files: Optional[typing.Dict[str, str]] = None + files: Optional[Dict[str, str]] = None # Replace individual I/O stats with IOStatistics class io_stats: Optional[IOStatistics] = None @@ -271,7 +271,7 @@ class DarshanFile: cb_nodes: Optional[int] = None number_of_compute_nodes: Optional[int] = None - hints: Optional[list[str]] = None + hints: Optional[List[str]] = None timestamp: Optional[TimeSpan] = None @@ -284,7 +284,7 @@ class DarshanFile: detected_files_mpi_coll_writes: Optional[pd.DataFrame] = None imbalance_count_posix_shared_time: Optional[int] = None - posix_shared_time_imbalance_detected_files: Optional[tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]] = None + posix_shared_time_imbalance_detected_files: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]] = None From efdd6f6c835968ab2ec6ad4c0e93199b588abe83 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 14 Apr 2025 17:08:30 -0600 Subject: [PATCH 17/43] refactor: Consolidate module function calls under the 'module' namespace for improved organization --- drishti/handlers/handle_darshan.py | 106 +++++++++++++++-------------- 1 file changed, 55 insertions(+), 51 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index ce9e4e4..633238e 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -12,13 +12,14 @@ import darshan # type: ignore import darshan.backend.cffi_backend as darshanll # type: ignore +import numpy as np import pandas as pd from packaging import version from rich import print from rich.padding import Padding from rich.panel import Panel -from drishti.handlers.darshan_util import DarshanFile +from drishti.handlers.darshan_util import DarshanFile, ModuleType from drishti.includes.config import ( HIGH, @@ -28,33 +29,36 @@ insights_total, thresholds, ) + # from drishti.includes.module import * -from drishti.includes.module import ( - check_individual_read_imbalance, - check_individual_write_imbalance, - check_long_metadata, - check_misaligned, - check_mpi_aggregator, - check_mpi_collective_read_operation, - check_mpi_collective_write_operation, - check_mpi_none_block_operation, - check_mpiio, - check_operation_intensive, - check_random_operation, - check_shared_data_imblance, - check_shared_small_operation, - check_shared_time_imbalance, - check_size_intensive, - check_small_operation, - check_stdio, - check_traffic, - display_content, - display_footer, - 
display_thresholds, - export_csv, - export_html, - export_svg, -) +import drishti.includes.module as module + +# from drishti.includes.module import ( +# check_individual_read_imbalance, +# check_individual_write_imbalance, +# check_long_metadata, +# check_misaligned, +# check_mpi_aggregator, +# check_mpi_collective_read_operation, +# check_mpi_collective_write_operation, +# check_mpi_none_block_operation, +# check_mpiio, +# check_operation_intensive, +# check_random_operation, +# check_shared_data_imblance, +# check_shared_small_operation, +# check_shared_time_imbalance, +# check_size_intensive, +# check_small_operation, +# check_stdio, +# check_traffic, +# display_content, +# display_footer, +# display_thresholds, +# export_csv, +# export_html, +# export_svg, +# ) import drishti.includes.parser as parser # from drishti.includes.parser import args @@ -335,8 +339,8 @@ def handler(): 'mpiio': uses_mpiio } - check_stdio(total_size, total_size_stdio) - check_mpiio(modules) + module.check_stdio(total_size, total_size_stdio) + module.check_mpiio(modules) ######################################################################################################################################################################### @@ -353,14 +357,14 @@ def handler(): total_operations = total_writes + total_reads # To check whether the application is write-intensive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance - check_operation_intensive(total_operations, total_reads, total_writes) + module.check_operation_intensive(total_operations, total_reads, total_writes) total_read_size = df['counters']['POSIX_BYTES_READ'].sum() total_written_size = df['counters']['POSIX_BYTES_WRITTEN'].sum() total_size = total_written_size + total_read_size - check_size_intensive(total_size, total_read_size, total_written_size) + module.check_size_intensive(total_size, total_read_size, total_written_size) ######################################################################################################################################################################### @@ -404,7 +408,7 @@ def handler(): detected_files.columns = ['id', 'total_reads', 'total_writes'] detected_files.loc[:, 'id'] = detected_files.loc[:, 'id'].astype(str) - check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) ######################################################################################################################################################################### @@ -413,7 +417,7 @@ def handler(): total_mem_not_aligned = df['counters']['POSIX_MEM_NOT_ALIGNED'].sum() total_file_not_aligned = df['counters']['POSIX_FILE_NOT_ALIGNED'].sum() - check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, dxt_posix, dxt_posix_read_data) + module.check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, dxt_posix, dxt_posix_read_data) ######################################################################################################################################################################### @@ -422,7 
+426,7 @@ def handler(): max_read_offset = df['counters']['POSIX_MAX_BYTE_READ'].max() max_write_offset = df['counters']['POSIX_MAX_BYTE_WRITTEN'].max() - check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) ######################################################################################################################################################################### @@ -447,7 +451,7 @@ def handler(): write_random = total_writes - write_consecutive - write_sequential #print('WRITE Random: {} ({:.2f}%)'.format(write_random, write_random / total_writes * 100)) - check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) ######################################################################################################################################################################### @@ -493,13 +497,13 @@ def handler(): shared_files['POSIX_SIZE_WRITE_100K_1M'] ) - check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map) + module.check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map) ######################################################################################################################################################################### count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > thresholds['metadata_time_rank'][0])]) - check_long_metadata(count_long_metadata, modules) + module.check_long_metadata(count_long_metadata, modules) # We already have a single line for each shared-file access # To check for stragglers, we can check the difference between the @@ -527,7 +531,7 @@ def handler(): column_names = ['id', 'data_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) # POSIX_F_FASTEST_RANK_TIME # POSIX_F_SLOWEST_RANK_TIME @@ -555,7 +559,7 @@ def handler(): column_names = ['id', 'time_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_shared_time_imbalance(stragglers_count, detected_files, file_map) + module.check_shared_time_imbalance(stragglers_count, detected_files, file_map) aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] @@ -584,7 +588,7 @@ def handler(): column_names = ['id', 'write_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_write_data) + module.check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix, 
dxt_posix_write_data) imbalance_count = 0 @@ -600,7 +604,7 @@ def handler(): column_names = ['id', 'read_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_read_data) + module.check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_read_data) ######################################################################################################################################################################### @@ -635,7 +639,7 @@ def handler(): column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio) + module.check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio) df_mpiio_collective_writes = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)] @@ -660,7 +664,7 @@ def handler(): column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio) + module.check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio) ######################################################################################################################################################################### @@ -677,7 +681,7 @@ def handler(): mpiio_nb_reads = df_mpiio['counters']['MPIIO_NB_READS'].sum() mpiio_nb_writes = df_mpiio['counters']['MPIIO_NB_WRITES'].sum() - check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules) + module.check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules) ######################################################################################################################################################################### @@ -726,7 +730,7 @@ def handler(): NUMBER_OF_COMPUTE_NODES = first['NNodes'] # Do we have one MPI-IO aggregator per node? 
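# A minimal sketch of the idea behind the aggregator question above: ROMIO's
# "cb_nodes" hint sets how many collective-buffering aggregators MPI-IO uses,
# and the insight compares that hint against the number of compute nodes in
# the job. The helper below is hypothetical and only illustrates the comparison.
def uses_one_aggregator_per_node(cb_nodes: int, number_of_compute_nodes: int) -> bool:
    """True when collective buffering runs exactly one aggregator per compute node."""
    return cb_nodes == number_of_compute_nodes

# e.g. a 128-node job launched with cb_nodes=16 would not satisfy the
# one-aggregator-per-node recommendation.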
- check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES) + module.check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES) except StopIteration: pass except FileNotFoundError: @@ -793,14 +797,14 @@ def handler(): console.print() - display_content(console) - display_thresholds(console) - display_footer(console, insights_start_time, insights_end_time) + module.display_content(console) + module.display_thresholds(console) + module.display_footer(console, insights_start_time, insights_end_time) # Export to HTML, SVG, and CSV trace_name = os.path.basename(darshan_log_path).replace('.darshan', '') out_dir = parser.args.export_dir if parser.args.export_dir != "" else os.getcwd() - export_html(console, out_dir, trace_name) - export_svg(console, out_dir, trace_name) - export_csv(out_dir, trace_name, job['job']['jobid']) + module.export_html(console, out_dir, trace_name) + module.export_svg(console, out_dir, trace_name) + module.export_csv(out_dir, trace_name, job['job']['jobid']) From c2dcad74f9375f56dc70bb0fe8ea3b4e1e94fa2d Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 14 Apr 2025 17:19:41 -0600 Subject: [PATCH 18/43] fix: Update enum values in ModuleType for consistency with Darshan naming conventions --- drishti/handlers/darshan_util.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 6c9090e..c9e62f8 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -10,10 +10,11 @@ class ModuleType(str, Enum): """Enum for standard I/O module types""" - POSIX = "posix" - STDIO = "stdio" - MPIIO = "mpiio" - + + POSIX = "POSIX" + STDIO = "STDIO" + MPIIO = "MPI-IO" + def __str__(self) -> str: return self.value From 8e7c36d752621537bab95eead5e64a523f0e187e Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 14 Apr 2025 17:20:27 -0600 Subject: [PATCH 19/43] fix: Update IOStatistics to use ModuleType for sizes and operations dictionaries --- drishti/handlers/darshan_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index c9e62f8..19a02ae 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -57,8 +57,8 @@ class IOOperation(IOCounter): class IOStatistics: """Tracks both I/O sizes and operations by module with aggregated metrics""" # Use dicts to store module-specific data - sizes: Dict[Union[ModuleType, str], IOSize] = field(default_factory=dict) - operations: Dict[Union[ModuleType, str], IOOperation] = field(default_factory=dict) + sizes: Dict[ModuleType, IOSize] = field(init=True) + operations: Dict[ModuleType, IOOperation] = field(init=True) def __post_init__(self): # Initialize standard modules if not present From 7eb8211fa5cdf06b555fb71d3c7489516fcfe448 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 16 Apr 2025 15:24:33 -0600 Subject: [PATCH 20/43] refactor: Enhance DarshanFile class with cached properties for improved I/O statistics and module management --- drishti/handlers/darshan_util.py | 278 ++++++++++++++++++++++++++++- drishti/handlers/handle_darshan.py | 50 +++++- drishti/includes/module.py | 3 +- 3 files changed, 313 insertions(+), 18 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 19a02ae..f0cb5ff 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -3,9 +3,13 @@ from dataclasses import dataclass, field from enum import Enum from functools 
import cached_property +from os import write from typing import Dict, Final, Optional, Union, List, Tuple, Iterable +import numpy as np import pandas as pd +from darshan import DarshanReport # type: ignore +import drishti.includes.parser as parser class ModuleType(str, Enum): @@ -231,12 +235,14 @@ class AccessPatternStats: class DarshanFile: # TODO: All fields which are not calculated should be instantly populated and not optional # TODO: Explore using typeddicts instead of dicts + file_path: str + _darshan_report: Optional[DarshanReport] = None job_id: Optional[str] = None log_ver: Optional[str] = None time: Optional[TimeSpan] = None exe: Optional[str] = None - modules: Optional[Iterable[str]] = None - name_records: Optional[Dict[str, str]] = None + _modules: Optional[Iterable[str]] = None + _name_records: Optional[Dict[int, str]] = None # Keys are uint64 max_read_offset: Optional[int] = None max_write_offset: Optional[int] = None total_files_stdio: Optional[int] = None @@ -245,20 +251,22 @@ class DarshanFile: files: Optional[Dict[str, str]] = None # Replace individual I/O stats with IOStatistics class - io_stats: Optional[IOStatistics] = None - + _io_stats: Optional[IOStatistics] = None + # File counts total_files: Optional[int] = 0 # Additional I/O statistics organized by category - small_io: Optional[SmallIOStats] = None - + _posix_small_io: Optional[SmallIOStats] = None + + _posix_detected_small_files: Optional[pd.DataFrame] = None + # Direct alignment fields instead of a class mem_not_aligned: Optional[int] = None file_not_aligned: Optional[int] = None - + access_pattern: Optional[AccessPatternStats] = None - + # Use separate classes for shared operations shared_ops: Optional[SharedOpsStats] = None shared_small_ops: Optional[SharedSmallOpsStats] = None @@ -287,5 +295,259 @@ class DarshanFile: imbalance_count_posix_shared_time: Optional[int] = None posix_shared_time_imbalance_detected_files: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]] = None + @cached_property + def report(self) -> DarshanReport: + if self._darshan_report is None: + self._darshan_report = DarshanReport(self.file_path) + return self._darshan_report + + @cached_property + def modules(self) -> Iterable[str]: + if self._modules is None: + self._modules = set(self.report.records.keys()) + return self._modules + + @cached_property + def io_stats(self) -> IOStatistics: + if self._io_stats is None: + # Calculate I/O sizes + sizes: Dict[ModuleType, IOSize] = {} + ops: Dict[ModuleType, IOOperation] = {} + if ModuleType.STDIO in self.modules: + df = self.report.records[ModuleType.STDIO].to_df() + counters = df["counters"] + assert df, "STDIO module data frame is empty" + + stdio_read_size = counters["STDIO_BYTES_READ"].sum() + stdio_write_size = counters["STDIO_BYTES_WRITTEN"].sum() + sizes[ModuleType.STDIO] = IOSize( + read=stdio_read_size, write=stdio_write_size + ) + + stdio_read_ops = counters["STDIO_READS"].sum() + stdio_write_ops = counters["STDIO_WRITES"].sum() + ops[ModuleType.STDIO] = IOOperation( + read=stdio_read_ops, write=stdio_write_ops + ) + + if ModuleType.POSIX in self.modules: + df = self.report.records[ModuleType.POSIX].to_df() + counters = df["counters"] + assert df, "POSIX module data frame is empty" + + posix_write_size = counters["POSIX_BYTES_WRITTEN"].sum() + posix_read_size = counters["POSIX_BYTES_READ"].sum() + sizes[ModuleType.POSIX] = IOSize( + read=posix_read_size, write=posix_write_size + ) + + posix_read_ops = counters["POSIX_READS"].sum() + posix_write_ops = 
counters["POSIX_WRITES"].sum() + ops[ModuleType.POSIX] = IOOperation( + read=posix_read_ops, write=posix_write_ops + ) + + if ModuleType.MPIIO in self.modules: + df = self.report.records[ModuleType.MPIIO].to_df() + counters = df["counters"] + assert df, "MPIIO module data frame is empty" + + mpiio_write_size = counters["MPIIO_BYTES_WRITTEN"].sum() + mpiio_read_size = counters["MPIIO_BYTES_READ"].sum() + sizes[ModuleType.MPIIO] = IOSize( + read=mpiio_read_size, write=mpiio_write_size + ) + + mpiio_read_ops = -1 + mpiio_write_ops = -1 + ops[ModuleType.MPIIO] = IOOperation( + read=mpiio_read_ops, write=mpiio_write_ops + ) + + self._io_stats = IOStatistics(sizes=sizes, operations=ops) + return self._io_stats + @cached_property + def posix_small_io(self) -> SmallIOStats: + if self._posix_small_io is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + posix_reads_small = ( + posix_counters["POSIX_SIZE_READ_0_100"].sum() + + posix_counters["POSIX_SIZE_READ_100_1K"].sum() + + posix_counters["POSIX_SIZE_READ_1K_10K"].sum() + + posix_counters["POSIX_SIZE_READ_10K_100K"].sum() + + posix_counters["POSIX_SIZE_READ_100K_1M"].sum() + ) + posix_writes_small = ( + posix_counters["POSIX_SIZE_WRITE_0_100"].sum() + + posix_counters["POSIX_SIZE_WRITE_100_1K"].sum() + + posix_counters["POSIX_SIZE_WRITE_1K_10K"].sum() + + posix_counters["POSIX_SIZE_WRITE_10K_100K"].sum() + + posix_counters["POSIX_SIZE_WRITE_100K_1M"].sum() + ) + self._posix_small_io = SmallIOStats( + read=posix_reads_small, write=posix_writes_small + ) + return self._posix_small_io + + @property + def posix_detected_small_files(self) -> pd.DataFrame: + if self._posix_detected_small_files is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + posix_counters["INSIGHTS_POSIX_SMALL_READ"] = ( + posix_counters["POSIX_SIZE_READ_0_100"] + + posix_counters["POSIX_SIZE_READ_100_1K"] + + posix_counters["POSIX_SIZE_READ_1K_10K"] + + posix_counters["POSIX_SIZE_READ_10K_100K"] + + posix_counters["POSIX_SIZE_READ_100K_1M"] + ) + posix_counters["INSIGHTS_POSIX_SMALL_WRITE"] = ( + posix_counters["POSIX_SIZE_WRITE_0_100"] + + posix_counters["POSIX_SIZE_WRITE_100_1K"] + + posix_counters["POSIX_SIZE_WRITE_1K_10K"] + + posix_counters["POSIX_SIZE_WRITE_10K_100K"] + + posix_counters["POSIX_SIZE_WRITE_100K_1M"] + ) + detected_files = pd.DataFrame( + posix_counters.groupby("id")[ + ["INSIGHTS_POSIX_SMALL_READ", "INSIGHTS_POSIX_SMALL_WRITE"] + ].sum() + ).reset_index() + detected_files.columns = pd.Index(["id", "total_reads", "total_writes"]) + detected_files.loc[:, "id"] = detected_files.loc[:, "id"].astype(str) + self._posix_detected_small_files = detected_files + return self._posix_detected_small_files + + @property + def file_map(self) -> Dict[int, str]: + return self.name_records + @cached_property + def name_records(self) -> Dict[int, str]: + if self._name_records is None: + self._name_records = self.report.name_records + return self._name_records + + @property + def dxt_posix_df(self) -> Optional[pd.DataFrame]: + # TODO + # if parser.args.backtrace is False: + # return None + assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module" + dxt_posix_df = pd.DataFrame(self.report.records["DXT_POSIX"].to_df()) + return dxt_posix_df + + @property + def dxt_posix_read_df(self) -> Optional[pd.DataFrame]: + if parser.args.backtrace is False: + return None + assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module" + df = self.dxt_posix_df + assert df is 
not None, "Should be handled by parser.args.backtrace check" + + # TODO + # if "address_line_mapping" not in df: + # parser.args.backtrace = False + # return None + + read_id = [] + read_rank = [] + read_length = [] + read_offsets = [] + read_end_time = [] + read_start_time = [] + read_operation = [] + + for r in zip(df["rank"], df["read_segments"], df["write_segments"], df["id"]): + if not r[1].empty: + read_id.append([r[3]] * len((r[1]["length"].to_list()))) + read_rank.append([r[0]] * len((r[1]["length"].to_list()))) + read_length.append(r[1]["length"].to_list()) + read_end_time.append(r[1]["end_time"].to_list()) + read_start_time.append(r[1]["start_time"].to_list()) + read_operation.append(["read"] * len((r[1]["length"].to_list()))) + read_offsets.append(r[1]["offset"].to_list()) + + read_id = [element for nestedlist in read_id for element in nestedlist] + read_rank = [element for nestedlist in read_rank for element in nestedlist] + read_length = [element for nestedlist in read_length for element in nestedlist] + read_offsets = [ + element for nestedlist in read_offsets for element in nestedlist + ] + read_end_time = [ + element for nestedlist in read_end_time for element in nestedlist + ] + read_operation = [ + element for nestedlist in read_operation for element in nestedlist + ] + read_start_time = [ + element for nestedlist in read_start_time for element in nestedlist + ] + + dxt_posix_read_data = { + "id": read_id, + "rank": read_rank, + "length": read_length, + "end_time": read_end_time, + "start_time": read_start_time, + "operation": read_operation, + "offsets": read_offsets, + } + + return pd.DataFrame(dxt_posix_read_data) + + @property + def dxt_posix_write_df(self) -> Optional[pd.DataFrame]: + if parser.args.backtrace is False: + return None + assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module" + df = self.dxt_posix_df + assert df is not None, "Should be handled by parser.args.backtrace check" + + # TODO + # if "address_line_mapping" not in df: + # parser.args.backtrace = False + # return None + + write_id = [] + write_rank = [] + write_length = [] + write_offsets = [] + write_end_time = [] + write_start_time = [] + write_operation = [] + + for r in zip(df['rank'], df['read_segments'], df['write_segments'], df['id']): + if not r[2].empty: + write_id.append([r[3]] * len((r[2]['length'].to_list()))) + write_rank.append([r[0]] * len((r[2]['length'].to_list()))) + write_length.append(r[2]['length'].to_list()) + write_end_time.append(r[2]['end_time'].to_list()) + write_start_time.append(r[2]['start_time'].to_list()) + write_operation.append(['write'] * len((r[2]['length'].to_list()))) + write_offsets.append(r[2]['offset'].to_list()) + + + write_id = [element for nestedlist in write_id for element in nestedlist] + write_rank = [element for nestedlist in write_rank for element in nestedlist] + write_length = [element for nestedlist in write_length for element in nestedlist] + write_offsets = [element for nestedlist in write_offsets for element in nestedlist] + write_end_time = [element for nestedlist in write_end_time for element in nestedlist] + write_operation = [element for nestedlist in write_operation for element in nestedlist] + write_start_time = [element for nestedlist in write_start_time for element in nestedlist] + + + dxt_posix_write_data = pd.DataFrame( + { + 'id': write_id, + 'rank': write_rank, + 'length': write_length, + 'end_time': write_end_time, + 'start_time': write_start_time, + 'operation': write_operation, + 'offsets': write_offsets, + }) + + return 
pd.DataFrame(dxt_posix_write_data) \ No newline at end of file diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 633238e..d58a34b 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -148,6 +148,9 @@ def handler(): job = report.metadata + ######################################################################################################################################################################### + darshan_file_obj = DarshanFile(file_path=darshan_log_path) + ######################################################################################################################################################################### # Check usage of STDIO, POSIX, and MPI-IO per file @@ -205,14 +208,17 @@ def handler(): df_lustre = None if "LUSTRE" in report.records: df_lustre = report.records['LUSTRE'].to_df() - if parser.args.backtrace: if "DXT_POSIX" in report.records: dxt_posix = report.records["DXT_POSIX"].to_df() dxt_posix = pd.DataFrame(dxt_posix) - if "address_line_mapping" not in dxt_posix: - parser.args.backtrace = False + if False: + # if "address_line_mapping" not in dxt_posix: + # parser.args.backtrace = False # TODO + print("Upper") + pass else: + print("ENTERED") read_id = [] read_rank = [] read_length = [] @@ -339,8 +345,10 @@ def handler(): 'mpiio': uses_mpiio } - module.check_stdio(total_size, total_size_stdio) - module.check_mpiio(modules) + # module.check_stdio(total_size, total_size_stdio) + module.check_stdio(total_size=darshan_file_obj.io_stats.total_bytes, total_size_stdio=darshan_file_obj.io_stats.stdio_size) + # module.check_mpiio(modules) + module.check_mpiio(modules=darshan_file_obj.modules) ######################################################################################################################################################################### @@ -357,14 +365,24 @@ def handler(): total_operations = total_writes + total_reads # To check whether the application is write-intensive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance - module.check_operation_intensive(total_operations, total_reads, total_writes) + # module.check_operation_intensive(total_operations, total_reads, total_writes) + module.check_operation_intensive( + total_operations=darshan_file_obj.io_stats.posix_ops, + total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read"), + total_writes=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "write"), + ) total_read_size = df['counters']['POSIX_BYTES_READ'].sum() total_written_size = df['counters']['POSIX_BYTES_WRITTEN'].sum() total_size = total_written_size + total_read_size - module.check_size_intensive(total_size, total_read_size, total_written_size) + # module.check_size_intensive(total_size, total_read_size, total_written_size) + module.check_size_intensive( + total_size=darshan_file_obj.io_stats.total_bytes, + total_read_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "read"), + total_written_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "write"), + ) ######################################################################################################################################################################### @@ -408,7 +426,21 @@ def handler(): detected_files.columns = ['id', 'total_reads', 'total_writes'] detected_files.loc[:, 'id'] = detected_files.loc[:, 
'id'].astype(str) - module.check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + + # module.check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + parser.args.backtrace = True # TODO + + module.check_small_operation( + total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read"), + total_reads_small=darshan_file_obj.posix_small_io.read, + total_writes=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "write"), + total_writes_small=darshan_file_obj.posix_small_io.write, + detected_files=darshan_file_obj.posix_detected_small_files, modules=darshan_file_obj.modules, + file_map=darshan_file_obj.file_map, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df, + dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df, + ) ######################################################################################################################################################################### @@ -506,7 +538,7 @@ def handler(): module.check_long_metadata(count_long_metadata, modules) # We already have a single line for each shared-file access - # To check for stragglers, we can check the difference between the + # To check for stragglers, we can check the difference between the # POSIX_FASTEST_RANK_BYTES # POSIX_SLOWEST_RANK_BYTES diff --git a/drishti/includes/module.py b/drishti/includes/module.py index e7f70d6..52fac10 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -4,6 +4,7 @@ import datetime import os import time +import typing import pandas as pd from rich import box @@ -73,7 +74,7 @@ def check_stdio(total_size, total_size_stdio): ) -def check_mpiio(modules): +def check_mpiio(modules: typing.Iterable[str]): """ Check whether the application has used MPI-IO or not From 0ff644b56c29e86ed204065d3448d2b88f46657b Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 16 Apr 2025 15:33:35 -0600 Subject: [PATCH 21/43] hotfix: Remove TODO comments and implement backtrace checks for address line mapping in DXT_POSIX data handling --- drishti/handlers/darshan_util.py | 19 ++++++++----------- drishti/handlers/handle_darshan.py | 10 ++-------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index f0cb5ff..4b1d3b2 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -432,9 +432,8 @@ def name_records(self) -> Dict[int, str]: @property def dxt_posix_df(self) -> Optional[pd.DataFrame]: - # TODO - # if parser.args.backtrace is False: - # return None + if parser.args.backtrace is False: + return None assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module" dxt_posix_df = pd.DataFrame(self.report.records["DXT_POSIX"].to_df()) return dxt_posix_df @@ -447,10 +446,9 @@ def dxt_posix_read_df(self) -> Optional[pd.DataFrame]: df = self.dxt_posix_df assert df is not None, "Should be handled by parser.args.backtrace check" - # TODO - # if "address_line_mapping" not in df: - # parser.args.backtrace = False - # return None + if "address_line_mapping" not in df: + parser.args.backtrace = False + return None read_id = [] read_rank = [] @@ -506,10 +504,9 @@ def dxt_posix_write_df(self) -> Optional[pd.DataFrame]: df = self.dxt_posix_df assert df is not None, 
"Should be handled by parser.args.backtrace check" - # TODO - # if "address_line_mapping" not in df: - # parser.args.backtrace = False - # return None + if "address_line_mapping" not in df: + parser.args.backtrace = False + return None write_id = [] write_rank = [] diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index d58a34b..22ac827 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -212,13 +212,9 @@ def handler(): if "DXT_POSIX" in report.records: dxt_posix = report.records["DXT_POSIX"].to_df() dxt_posix = pd.DataFrame(dxt_posix) - if False: - # if "address_line_mapping" not in dxt_posix: - # parser.args.backtrace = False # TODO - print("Upper") - pass + if "address_line_mapping" not in dxt_posix: + parser.args.backtrace = False else: - print("ENTERED") read_id = [] read_rank = [] read_length = [] @@ -428,8 +424,6 @@ def handler(): # module.check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) - parser.args.backtrace = True # TODO - module.check_small_operation( total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read"), total_reads_small=darshan_file_obj.posix_small_io.read, From 9951500f82c3f3527853b73edf6b94190a23e503 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 16 Apr 2025 15:40:41 -0600 Subject: [PATCH 22/43] refactor: Introduce cached properties for memory and file alignment checks in Darshan data handling --- drishti/handlers/darshan_util.py | 28 +++++++++++++++++++++++++--- drishti/handlers/handle_darshan.py | 14 ++++++++++++-- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 4b1d3b2..831713b 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -262,8 +262,8 @@ class DarshanFile: _posix_detected_small_files: Optional[pd.DataFrame] = None # Direct alignment fields instead of a class - mem_not_aligned: Optional[int] = None - file_not_aligned: Optional[int] = None + _mem_not_aligned: Optional[int] = None + _file_not_aligned: Optional[int] = None access_pattern: Optional[AccessPatternStats] = None @@ -547,4 +547,26 @@ def dxt_posix_write_df(self) -> Optional[pd.DataFrame]: 'offsets': write_offsets, }) - return pd.DataFrame(dxt_posix_write_data) \ No newline at end of file + return pd.DataFrame(dxt_posix_write_data) + + @cached_property + def mem_not_aligned(self) -> int: + if self._mem_not_aligned is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._mem_not_aligned = posix_counters['POSIX_MEM_NOT_ALIGNED'].sum() + return self._mem_not_aligned + + @cached_property + def file_not_aligned(self) -> int: + if self._file_not_aligned is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._file_not_aligned = posix_counters['POSIX_FILE_NOT_ALIGNED'].sum() + return self._file_not_aligned + + @property + def lustre_df(self) -> Optional[pd.DataFrame]: + if "LUSTRE" not in self.modules: + return None + return pd.DataFrame(self.report.records["LUSTRE"].to_df()) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 22ac827..aa803d9 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -358,7 +358,7 @@ def handler(): total_writes = 
df['counters']['POSIX_WRITES'].sum() # Get total number of I/O operations - total_operations = total_writes + total_reads + total_operations = total_writes + total_reads # To check whether the application is write-intensive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance # module.check_operation_intensive(total_operations, total_reads, total_writes) @@ -443,7 +443,17 @@ def handler(): total_mem_not_aligned = df['counters']['POSIX_MEM_NOT_ALIGNED'].sum() total_file_not_aligned = df['counters']['POSIX_FILE_NOT_ALIGNED'].sum() - module.check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, dxt_posix, dxt_posix_read_data) + # module.check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, dxt_posix, dxt_posix_read_data) + module.check_misaligned( + total_operations=darshan_file_obj.io_stats.posix_ops, + total_mem_not_aligned=darshan_file_obj.mem_not_aligned, + total_file_not_aligned=darshan_file_obj.file_not_aligned, + modules=darshan_file_obj.modules, + file_map=darshan_file_obj.file_map, + df_lustre=darshan_file_obj.lustre_df, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df, + ) ######################################################################################################################################################################### From 58f4301842b872872f861395590ca898a7c24ff0 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 16 Apr 2025 15:48:17 -0600 Subject: [PATCH 23/43] refactor: Implement cached properties for max read and write offsets in Darshan data handling --- drishti/handlers/darshan_util.py | 20 ++++++++++++++++++-- drishti/handlers/handle_darshan.py | 11 ++++++++++- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 831713b..7b90609 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -243,8 +243,8 @@ class DarshanFile: exe: Optional[str] = None _modules: Optional[Iterable[str]] = None _name_records: Optional[Dict[int, str]] = None # Keys are uint64 - max_read_offset: Optional[int] = None - max_write_offset: Optional[int] = None + _max_read_offset: Optional[int] = None + _max_write_offset: Optional[int] = None total_files_stdio: Optional[int] = None total_files_posix: Optional[int] = None total_files_mpiio: Optional[int] = None @@ -570,3 +570,19 @@ def lustre_df(self) -> Optional[pd.DataFrame]: if "LUSTRE" not in self.modules: return None return pd.DataFrame(self.report.records["LUSTRE"].to_df()) + + @cached_property + def max_read_offset(self) -> int: + if self._max_read_offset is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._max_read_offset = posix_counters['POSIX_MAX_BYTE_READ'].max() + return self._max_read_offset + + @cached_property + def max_write_offset(self) -> int: + if self._max_write_offset is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._max_write_offset = posix_counters['POSIX_MAX_BYTE_WRITTEN'].max() + return self._max_write_offset \ No newline at end of file diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index aa803d9..de6d7ee 100644 --- a/drishti/handlers/handle_darshan.py 
+++ b/drishti/handlers/handle_darshan.py @@ -462,7 +462,16 @@ def handler(): max_read_offset = df['counters']['POSIX_MAX_BYTE_READ'].max() max_write_offset = df['counters']['POSIX_MAX_BYTE_WRITTEN'].max() - module.check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + # module.check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_traffic( + max_read_offset=darshan_file_obj.max_read_offset, + total_read_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "read"), + max_write_offset=darshan_file_obj.max_write_offset, + total_written_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "write"), + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df, + dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df, + ) ######################################################################################################################################################################### From d567b72e35f0068cf95f70a01ab1ebf236e5deee Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Fri, 18 Apr 2025 12:50:54 -0600 Subject: [PATCH 24/43] refactor: Add cached properties for POSIX read and write operation statistics in Darshan data handling --- drishti/handlers/darshan_util.py | 57 +++++++++++++++++++++++++++++- drishti/handlers/handle_darshan.py | 25 ++++++++++++- 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 7b90609..2654d93 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -265,6 +265,13 @@ class DarshanFile: _mem_not_aligned: Optional[int] = None _file_not_aligned: Optional[int] = None + _posix_read_consecutive: Optional[int] = None + _posix_write_consecutive: Optional[int] = None + _posix_read_sequential: Optional[int] = None + _posix_write_sequential: Optional[int] = None + _posix_read_random: Optional[int] = None + _posix_write_random: Optional[int] = None + access_pattern: Optional[AccessPatternStats] = None # Use separate classes for shared operations @@ -585,4 +592,52 @@ def max_write_offset(self) -> int: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] self._max_write_offset = posix_counters['POSIX_MAX_BYTE_WRITTEN'].max() - return self._max_write_offset \ No newline at end of file + return self._max_write_offset + + @cached_property + def posix_read_consecutive(self) -> int: + if self._posix_read_consecutive is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_read_consecutive = posix_counters['POSIX_CONSEC_READS'].sum() + return self._posix_read_consecutive + + @cached_property + def posix_write_consecutive(self) -> int: + if self._posix_write_consecutive is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_write_consecutive = posix_counters['POSIX_CONSEC_WRITES'].sum() + return self._posix_write_consecutive + + @cached_property + def posix_read_sequential(self) -> int: + if self._posix_read_sequential is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_read_sequential = posix_counters['POSIX_SEQ_READS'].sum() - self.posix_read_consecutive + return 
self._posix_read_sequential + + @cached_property + def posix_write_sequential(self) -> int: + if self._posix_write_sequential is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_write_sequential = posix_counters['POSIX_SEQ_WRITES'].sum() - self.posix_write_consecutive + return self._posix_write_sequential + + @cached_property + def posix_read_random(self) -> int: + if self._posix_read_random is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_read_random = self.io_stats.get_module_ops(ModuleType.POSIX, "read") - self.posix_read_consecutive - self.posix_read_sequential + return self._posix_read_random + + @cached_property + def posix_write_random(self) -> int: + if self._posix_write_random is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_write_random = self.io_stats.get_module_ops(ModuleType.POSIX, "write") - self.posix_write_consecutive - self.posix_write_sequential + return self._posix_write_random \ No newline at end of file diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index de6d7ee..4c6bc6b 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -496,7 +496,30 @@ def handler(): write_random = total_writes - write_consecutive - write_sequential #print('WRITE Random: {} ({:.2f}%)'.format(write_random, write_random / total_writes * 100)) - module.check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + + assert read_consecutive == darshan_file_obj.posix_read_consecutive + assert read_sequential == darshan_file_obj.posix_read_sequential + assert read_random == darshan_file_obj.posix_read_random, f"{read_random} != {darshan_file_obj.posix_read_random}" + assert total_reads == darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"read"), f"{total_reads} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read")}" + assert write_consecutive == darshan_file_obj.posix_write_consecutive + assert write_sequential == darshan_file_obj.posix_write_sequential + assert write_random == darshan_file_obj.posix_write_random + assert total_writes == darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"write") + + # module.check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_random_operation( + read_consecutive=darshan_file_obj.posix_read_consecutive, + read_sequential=darshan_file_obj.posix_read_sequential, + read_random=darshan_file_obj.posix_read_random, + total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"read"), + write_consecutive=darshan_file_obj.posix_write_consecutive, + write_sequential=darshan_file_obj.posix_write_sequential, + write_random=darshan_file_obj.posix_write_random, + total_writes=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"write"), + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df, + dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df, + ) 
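The hunk above illustrates the migration pattern this patch series repeats: derived POSIX counters move behind cached properties on the DarshanFile object (as the @cached_property decorators in the diff show), and the legacy module.check_* calls are replaced by keyword-argument calls against those properties, with asserts verifying parity against the old locals. A minimal, self-contained sketch of the caching idea is below; CountersView and its column names are illustrative only and are not part of the Drishti or darshan APIs.

from functools import cached_property

import pandas as pd


class CountersView:
    """Toy stand-in for DarshanFile: parse once, memoize derived counters."""

    def __init__(self, counters: pd.DataFrame) -> None:
        self._counters = counters

    @cached_property
    def read_consecutive(self) -> int:
        # Computed on first access, then served from the per-instance cache.
        return int(self._counters["CONSEC_READS"].sum())

    @cached_property
    def read_sequential(self) -> int:
        # Sequential-but-not-consecutive reads, mirroring the POSIX_SEQ_READS logic above.
        return int(self._counters["SEQ_READS"].sum()) - self.read_consecutive


view = CountersView(pd.DataFrame({"CONSEC_READS": [3, 2], "SEQ_READS": [7, 4]}))
assert view.read_sequential == 6  # (7 + 4) - (3 + 2)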
######################################################################################################################################################################### From 97e049c74b8bcae9ab0bc54271b54bfb65a31a49 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sun, 27 Apr 2025 15:13:03 -0600 Subject: [PATCH 25/43] refactor: Add cached property for long metadata count in POSIX data handling --- drishti/handlers/darshan_util.py | 13 ++++++++++++- drishti/handlers/handle_darshan.py | 6 +++++- 2 files changed, 17 insertions(+), 2 deletions(-) mode change 100644 => 100755 drishti/handlers/darshan_util.py diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py old mode 100644 new mode 100755 index 2654d93..176c7ea --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -10,6 +10,7 @@ import pandas as pd from darshan import DarshanReport # type: ignore import drishti.includes.parser as parser +import drishti.includes.config as config class ModuleType(str, Enum): @@ -272,6 +273,8 @@ class DarshanFile: _posix_read_random: Optional[int] = None _posix_write_random: Optional[int] = None + _posix_long_metadata_count: Optional[int] = None + access_pattern: Optional[AccessPatternStats] = None # Use separate classes for shared operations @@ -640,4 +643,12 @@ def posix_write_random(self) -> int: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] self._posix_write_random = self.io_stats.get_module_ops(ModuleType.POSIX, "write") - self.posix_write_consecutive - self.posix_write_sequential - return self._posix_write_random \ No newline at end of file + return self._posix_write_random + + @cached_property + def posix_long_metadata_count(self) -> int: + if self._posix_long_metadata_count is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_long_metadata_rows = posix_df['fcounters'][(posix_df['fcounters']['POSIX_F_META_TIME'] > config.thresholds['metadata_time_rank'][0])] + self._posix_long_metadata_count = len(posix_long_metadata_rows) + return self._posix_long_metadata_count diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 4c6bc6b..579b9f9 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -530,6 +530,7 @@ def handler(): shared_files = shared_files.assign(id=lambda d: d['id'].astype(str)) if not shared_files.empty: + # TODO: This entire conditional total_shared_reads = shared_files['POSIX_READS'].sum() total_shared_reads_small = ( shared_files['POSIX_SIZE_READ_0_100'].sum() + @@ -571,7 +572,10 @@ def handler(): count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > thresholds['metadata_time_rank'][0])]) - module.check_long_metadata(count_long_metadata, modules) + assert darshan_file_obj.posix_long_metadata_count == count_long_metadata + assert darshan_file_obj.modules == modules.keys(), f"{darshan_file_obj.modules} != {modules.keys()}" + # module.check_long_metadata(count_long_metadata, modules) + module.check_long_metadata(count_long_metadata=darshan_file_obj.posix_long_metadata_count, modules=darshan_file_obj.modules) # We already have a single line for each shared-file access # To check for stragglers, we can check the difference between the From 88c1f54ed5b520ea5d8bd2a8c79365debfa42a4b Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sun, 27 Apr 2025 15:27:16 -0600 Subject: [PATCH 26/43] refactor: Add cached properties for shared read and write operations and stragglers count in 
Darshan data handling --- drishti/handlers/darshan_util.py | 62 +++++++++++++++++++++++++++++- drishti/handlers/handle_darshan.py | 20 +++++++++- 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 176c7ea..599405d 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -274,11 +274,12 @@ class DarshanFile: _posix_write_random: Optional[int] = None _posix_long_metadata_count: Optional[int] = None + _posix_stragglers_count: Optional[int] = None access_pattern: Optional[AccessPatternStats] = None # Use separate classes for shared operations - shared_ops: Optional[SharedOpsStats] = None + _shared_ops: Optional[SharedOpsStats] = None shared_small_ops: Optional[SharedSmallOpsStats] = None count_long_metadata: Optional[int] = None @@ -645,6 +646,36 @@ def posix_write_random(self) -> int: self._posix_write_random = self.io_stats.get_module_ops(ModuleType.POSIX, "write") - self.posix_write_consecutive - self.posix_write_sequential return self._posix_write_random + @property + def posix_shared_files_df(self) -> pd.DataFrame: + assert "POSIX" in self.modules, "Missing POSIX module" + posix_df = self.report.records[ModuleType.POSIX].to_df() + shared_files_df = posix_df['counters'].loc[(posix_df['counters']['rank'] == -1)] + shared_files_df = shared_files_df.assign(id=lambda d: d['id'].astype(str)) + return shared_files_df + + @cached_property + def posix_shared_reads(self) -> int: + if self._shared_ops is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._shared_ops = SharedOpsStats( + read=posix_counters["POSIX_SHARED_READS"].sum(), + write=posix_counters["POSIX_SHARED_WRITES"].sum(), + ) + return self._shared_ops.read + + @cached_property + def posix_shared_writes(self) -> int: + if self._shared_ops is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._shared_ops = SharedOpsStats( + read=posix_counters["POSIX_SHARED_READS"].sum(), + write=posix_counters["POSIX_SHARED_WRITES"].sum(), + ) + return self._shared_ops.write + @cached_property def posix_long_metadata_count(self) -> int: if self._posix_long_metadata_count is None: @@ -652,3 +683,32 @@ def posix_long_metadata_count(self) -> int: posix_long_metadata_rows = posix_df['fcounters'][(posix_df['fcounters']['POSIX_F_META_TIME'] > config.thresholds['metadata_time_rank'][0])] self._posix_long_metadata_count = len(posix_long_metadata_rows) return self._posix_long_metadata_count + + @property + def posix_stragglers_df(self) -> pd.DataFrame: + shared_files = self.posix_shared_files_df + + detected_files = [] + + for index, row in shared_files.iterrows(): + total_transfer_size = row['POSIX_BYTES_WRITTEN'] + row['POSIX_BYTES_READ'] + + if total_transfer_size and abs( + row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size > \ + config.thresholds['imbalance_stragglers'][0]: + # stragglers_count += 1 + + detected_files.append([ + row['id'], + abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size * 100 + ]) + + column_names = ['id', 'data_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + return detected_files + + @cached_property + def posix_stragglers_count(self) -> int: + if self._posix_stragglers_count is None: + self._posix_stragglers_count = len(self.posix_stragglers_df) + return self._posix_stragglers_count \ 
No newline at end of file diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 579b9f9..ec54012 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -566,6 +566,9 @@ def handler(): shared_files['POSIX_SIZE_WRITE_100K_1M'] ) + # module.check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map) + assert total_shared_reads == darshan_file_obj.posix_shared_reads + sys.exit(2) module.check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map) ######################################################################################################################################################################### @@ -603,7 +606,22 @@ def handler(): column_names = ['id', 'data_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - module.check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + assert stragglers_count == darshan_file_obj.posix_stragglers_count, f"{stragglers_count} != {darshan_file_obj.posix_stragglers_count}" + assert detected_files.equals(darshan_file_obj.posix_stragglers_df), f"{detected_files} != {darshan_file_obj.posix_stragglers_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + assert dxt_posix == darshan_file_obj.dxt_posix_df, f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}" + assert dxt_posix_read_data == darshan_file_obj.dxt_posix_read_df, f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}" + assert dxt_posix_write_data == darshan_file_obj.dxt_posix_write_df, f"{dxt_posix_write_data} != {darshan_file_obj.dxt_posix_write_df}" + # module.check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_shared_data_imblance( + stragglers_count=darshan_file_obj.posix_stragglers_count, + detected_files=darshan_file_obj.posix_stragglers_df, + file_map=darshan_file_obj.file_map, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data = darshan_file_obj.dxt_posix_read_df, + dxt_posix_write_data = darshan_file_obj.dxt_posix_write_df + ) + sys.exit(2) # POSIX_F_FASTEST_RANK_TIME # POSIX_F_SLOWEST_RANK_TIME From 489ee4d54cf8acb0f208f0905a3357bd45aa8acc Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sun, 27 Apr 2025 15:28:39 -0600 Subject: [PATCH 27/43] fmt: `ruff format drishti/handlers/darshan_util.py` --- drishti/handlers/darshan_util.py | 204 ++++++++++++++++++++++--------- 1 file changed, 145 insertions(+), 59 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 599405d..3cfb5d4 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -31,11 +31,15 @@ class TimeSpan: def __post_init__(self): if self.start > self.end: - raise ValueError(f"TimeSpan start ({self.start}) must be <= end ({self.end})") + raise ValueError( + f"TimeSpan start ({self.start}) must be <= end ({self.end})" + ) + @dataclass class IOCounter: """Base class for I/O metrics with read/write counts""" + read: Final[int] = field(init=True) write: Final[int] = field(init=True) _total: Optional[int] = None @@ -47,20 +51,25 @@ def total(self) -> int: return self._total return self.read + self.write + @dataclass class IOSize(IOCounter): """Represents I/O size 
statistics in bytes""" + pass + @dataclass class IOOperation(IOCounter): """Represents I/O operation count statistics""" + pass @dataclass class IOStatistics: """Tracks both I/O sizes and operations by module with aggregated metrics""" + # Use dicts to store module-specific data sizes: Dict[ModuleType, IOSize] = field(init=True) operations: Dict[ModuleType, IOOperation] = field(init=True) @@ -69,7 +78,9 @@ def __post_init__(self): # Initialize standard modules if not present for module in ModuleType: # Ensure that the module is either in both sizes and operations or in neither - assert (module in self.sizes) == (module in self.operations), f"Module {module} should be in both sizes and operations or in neither" + assert (module in self.sizes) == (module in self.operations), ( + f"Module {module} should be in both sizes and operations or in neither" + ) if module not in self.sizes: self.sizes[module] = IOSize(read=0, write=0) @@ -134,11 +145,15 @@ def total_ops(self) -> int: return self.reads + self.writes # Methods to get stats for specific modules - def get_module_size(self, module: Optional[Union[ModuleType, str]] = None, data_type: Optional[str] = "total") -> int: + def get_module_size( + self, + module: Optional[Union[ModuleType, str]] = None, + data_type: Optional[str] = "total", + ) -> int: """Get size statistics for a specific module or all modules if not specified.""" if module is None and data_type is None: raise ValueError("Both module and data_type cannot be None") - + if module: if module not in self.sizes: raise ValueError(f"Module {module} not found in sizes") @@ -157,11 +172,15 @@ def get_module_size(self, module: Optional[Union[ModuleType, str]] = None, data_ else: # data_type is None or "total" return self.total_bytes - def get_module_ops(self, module: Optional[Union[ModuleType, str]] = None, data_type: Optional[str] = "total") -> int: + def get_module_ops( + self, + module: Optional[Union[ModuleType, str]] = None, + data_type: Optional[str] = "total", + ) -> int: """Get operation statistics for a specific module or all modules if not specified.""" if module is None and data_type is None: raise ValueError("Both module and data_type cannot be None") - + if module: if module not in self.operations: raise ValueError(f"Module {module} not found in operations") @@ -180,57 +199,84 @@ def get_module_ops(self, module: Optional[Union[ModuleType, str]] = None, data_t else: # data_type is None or "total" return self.total_ops + @dataclass class SmallIOStats(IOCounter): """Statistics for small I/O operations""" + pass # Inherits read/write/total from IOCounter + @dataclass class SharedOpsStats(IOCounter): """Statistics for shared file operations""" + pass # Inherits read/write/total from IOCounter + @dataclass class SharedSmallOpsStats(IOCounter): """Statistics for small shared file operations""" + pass # Inherits read/write/total from IOCounter + @dataclass class ConsecutiveIOStats(IOCounter): """Statistics for consecutive I/O operations""" + pass # Inherits read/write/total from IOCounter + @dataclass class SequentialIOStats(IOCounter): """Statistics for sequential I/O operations""" + pass # Inherits read/write/total from IOCounter + @dataclass class RandomIOStats(IOCounter): """Statistics for random I/O operations""" + pass # Inherits read/write/total from IOCounter + @dataclass class MPIIONonBlockingStats(IOCounter): """Statistics for non-blocking MPI I/O operations""" + pass + @dataclass class MPICollectiveIOStats(IOCounter): """Statistics for collective MPI I/O operations""" + 
pass + @dataclass class MPIIndependentIOStats(IOCounter): """Statistics for independent MPI I/O operations""" + pass + @dataclass class AccessPatternStats: """Statistics for I/O access patterns by pattern type""" - consecutive: ConsecutiveIOStats = field(default_factory=lambda: ConsecutiveIOStats(read=0, write=0), init=True) - sequential: SequentialIOStats = field(default_factory=lambda: SequentialIOStats(read=0, write=0), init=True) - random: RandomIOStats = field(default_factory=lambda: RandomIOStats(read=0, write=0), init=True) + + consecutive: ConsecutiveIOStats = field( + default_factory=lambda: ConsecutiveIOStats(read=0, write=0), init=True + ) + sequential: SequentialIOStats = field( + default_factory=lambda: SequentialIOStats(read=0, write=0), init=True + ) + random: RandomIOStats = field( + default_factory=lambda: RandomIOStats(read=0, write=0), init=True + ) + @dataclass class DarshanFile: @@ -250,13 +296,13 @@ class DarshanFile: total_files_posix: Optional[int] = None total_files_mpiio: Optional[int] = None files: Optional[Dict[str, str]] = None - + # Replace individual I/O stats with IOStatistics class _io_stats: Optional[IOStatistics] = None # File counts total_files: Optional[int] = 0 - + # Additional I/O statistics organized by category _posix_small_io: Optional[SmallIOStats] = None @@ -304,7 +350,9 @@ class DarshanFile: detected_files_mpi_coll_writes: Optional[pd.DataFrame] = None imbalance_count_posix_shared_time: Optional[int] = None - posix_shared_time_imbalance_detected_files: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]] = None + posix_shared_time_imbalance_detected_files: Optional[ + Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame] + ] = None @cached_property def report(self) -> DarshanReport: @@ -527,36 +575,45 @@ def dxt_posix_write_df(self) -> Optional[pd.DataFrame]: write_start_time = [] write_operation = [] - for r in zip(df['rank'], df['read_segments'], df['write_segments'], df['id']): + for r in zip(df["rank"], df["read_segments"], df["write_segments"], df["id"]): if not r[2].empty: - write_id.append([r[3]] * len((r[2]['length'].to_list()))) - write_rank.append([r[0]] * len((r[2]['length'].to_list()))) - write_length.append(r[2]['length'].to_list()) - write_end_time.append(r[2]['end_time'].to_list()) - write_start_time.append(r[2]['start_time'].to_list()) - write_operation.append(['write'] * len((r[2]['length'].to_list()))) - write_offsets.append(r[2]['offset'].to_list()) - + write_id.append([r[3]] * len((r[2]["length"].to_list()))) + write_rank.append([r[0]] * len((r[2]["length"].to_list()))) + write_length.append(r[2]["length"].to_list()) + write_end_time.append(r[2]["end_time"].to_list()) + write_start_time.append(r[2]["start_time"].to_list()) + write_operation.append(["write"] * len((r[2]["length"].to_list()))) + write_offsets.append(r[2]["offset"].to_list()) write_id = [element for nestedlist in write_id for element in nestedlist] write_rank = [element for nestedlist in write_rank for element in nestedlist] - write_length = [element for nestedlist in write_length for element in nestedlist] - write_offsets = [element for nestedlist in write_offsets for element in nestedlist] - write_end_time = [element for nestedlist in write_end_time for element in nestedlist] - write_operation = [element for nestedlist in write_operation for element in nestedlist] - write_start_time = [element for nestedlist in write_start_time for element in nestedlist] - + write_length = [ + element for nestedlist in write_length for element in nestedlist + ] + write_offsets 
= [ + element for nestedlist in write_offsets for element in nestedlist + ] + write_end_time = [ + element for nestedlist in write_end_time for element in nestedlist + ] + write_operation = [ + element for nestedlist in write_operation for element in nestedlist + ] + write_start_time = [ + element for nestedlist in write_start_time for element in nestedlist + ] dxt_posix_write_data = pd.DataFrame( { - 'id': write_id, - 'rank': write_rank, - 'length': write_length, - 'end_time': write_end_time, - 'start_time': write_start_time, - 'operation': write_operation, - 'offsets': write_offsets, - }) + "id": write_id, + "rank": write_rank, + "length": write_length, + "end_time": write_end_time, + "start_time": write_start_time, + "operation": write_operation, + "offsets": write_offsets, + } + ) return pd.DataFrame(dxt_posix_write_data) @@ -565,7 +622,7 @@ def mem_not_aligned(self) -> int: if self._mem_not_aligned is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._mem_not_aligned = posix_counters['POSIX_MEM_NOT_ALIGNED'].sum() + self._mem_not_aligned = posix_counters["POSIX_MEM_NOT_ALIGNED"].sum() return self._mem_not_aligned @cached_property @@ -573,7 +630,7 @@ def file_not_aligned(self) -> int: if self._file_not_aligned is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._file_not_aligned = posix_counters['POSIX_FILE_NOT_ALIGNED'].sum() + self._file_not_aligned = posix_counters["POSIX_FILE_NOT_ALIGNED"].sum() return self._file_not_aligned @property @@ -587,7 +644,7 @@ def max_read_offset(self) -> int: if self._max_read_offset is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._max_read_offset = posix_counters['POSIX_MAX_BYTE_READ'].max() + self._max_read_offset = posix_counters["POSIX_MAX_BYTE_READ"].max() return self._max_read_offset @cached_property @@ -595,7 +652,7 @@ def max_write_offset(self) -> int: if self._max_write_offset is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._max_write_offset = posix_counters['POSIX_MAX_BYTE_WRITTEN'].max() + self._max_write_offset = posix_counters["POSIX_MAX_BYTE_WRITTEN"].max() return self._max_write_offset @cached_property @@ -603,7 +660,7 @@ def posix_read_consecutive(self) -> int: if self._posix_read_consecutive is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._posix_read_consecutive = posix_counters['POSIX_CONSEC_READS'].sum() + self._posix_read_consecutive = posix_counters["POSIX_CONSEC_READS"].sum() return self._posix_read_consecutive @cached_property @@ -611,7 +668,7 @@ def posix_write_consecutive(self) -> int: if self._posix_write_consecutive is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._posix_write_consecutive = posix_counters['POSIX_CONSEC_WRITES'].sum() + self._posix_write_consecutive = posix_counters["POSIX_CONSEC_WRITES"].sum() return self._posix_write_consecutive @cached_property @@ -619,7 +676,9 @@ def posix_read_sequential(self) -> int: if self._posix_read_sequential is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._posix_read_sequential = posix_counters['POSIX_SEQ_READS'].sum() - self.posix_read_consecutive + self._posix_read_sequential = ( + posix_counters["POSIX_SEQ_READS"].sum() - self.posix_read_consecutive + ) 
return self._posix_read_sequential @cached_property @@ -627,7 +686,9 @@ def posix_write_sequential(self) -> int: if self._posix_write_sequential is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._posix_write_sequential = posix_counters['POSIX_SEQ_WRITES'].sum() - self.posix_write_consecutive + self._posix_write_sequential = ( + posix_counters["POSIX_SEQ_WRITES"].sum() - self.posix_write_consecutive + ) return self._posix_write_sequential @cached_property @@ -635,7 +696,11 @@ def posix_read_random(self) -> int: if self._posix_read_random is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._posix_read_random = self.io_stats.get_module_ops(ModuleType.POSIX, "read") - self.posix_read_consecutive - self.posix_read_sequential + self._posix_read_random = ( + self.io_stats.get_module_ops(ModuleType.POSIX, "read") + - self.posix_read_consecutive + - self.posix_read_sequential + ) return self._posix_read_random @cached_property @@ -643,15 +708,19 @@ def posix_write_random(self) -> int: if self._posix_write_random is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._posix_write_random = self.io_stats.get_module_ops(ModuleType.POSIX, "write") - self.posix_write_consecutive - self.posix_write_sequential + self._posix_write_random = ( + self.io_stats.get_module_ops(ModuleType.POSIX, "write") + - self.posix_write_consecutive + - self.posix_write_sequential + ) return self._posix_write_random @property def posix_shared_files_df(self) -> pd.DataFrame: assert "POSIX" in self.modules, "Missing POSIX module" posix_df = self.report.records[ModuleType.POSIX].to_df() - shared_files_df = posix_df['counters'].loc[(posix_df['counters']['rank'] == -1)] - shared_files_df = shared_files_df.assign(id=lambda d: d['id'].astype(str)) + shared_files_df = posix_df["counters"].loc[(posix_df["counters"]["rank"] == -1)] + shared_files_df = shared_files_df.assign(id=lambda d: d["id"].astype(str)) return shared_files_df @cached_property @@ -680,7 +749,12 @@ def posix_shared_writes(self) -> int: def posix_long_metadata_count(self) -> int: if self._posix_long_metadata_count is None: posix_df = self.report.records[ModuleType.POSIX].to_df() - posix_long_metadata_rows = posix_df['fcounters'][(posix_df['fcounters']['POSIX_F_META_TIME'] > config.thresholds['metadata_time_rank'][0])] + posix_long_metadata_rows = posix_df["fcounters"][ + ( + posix_df["fcounters"]["POSIX_F_META_TIME"] + > config.thresholds["metadata_time_rank"][0] + ) + ] self._posix_long_metadata_count = len(posix_long_metadata_rows) return self._posix_long_metadata_count @@ -691,24 +765,36 @@ def posix_stragglers_df(self) -> pd.DataFrame: detected_files = [] for index, row in shared_files.iterrows(): - total_transfer_size = row['POSIX_BYTES_WRITTEN'] + row['POSIX_BYTES_READ'] + total_transfer_size = row["POSIX_BYTES_WRITTEN"] + row["POSIX_BYTES_READ"] - if total_transfer_size and abs( - row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size > \ - config.thresholds['imbalance_stragglers'][0]: + if ( + total_transfer_size + and abs( + row["POSIX_SLOWEST_RANK_BYTES"] - row["POSIX_FASTEST_RANK_BYTES"] + ) + / total_transfer_size + > config.thresholds["imbalance_stragglers"][0] + ): # stragglers_count += 1 - detected_files.append([ - row['id'], - abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size * 100 - ]) + 
detected_files.append( + [ + row["id"], + abs( + row["POSIX_SLOWEST_RANK_BYTES"] + - row["POSIX_FASTEST_RANK_BYTES"] + ) + / total_transfer_size + * 100, + ] + ) - column_names = ['id', 'data_imbalance'] + column_names = ["id", "data_imbalance"] detected_files = pd.DataFrame(detected_files, columns=column_names) - return detected_files + return detected_files @cached_property def posix_stragglers_count(self) -> int: if self._posix_stragglers_count is None: self._posix_stragglers_count = len(self.posix_stragglers_df) - return self._posix_stragglers_count \ No newline at end of file + return self._posix_stragglers_count From 719693edcc3607eda625f35d884d00cd63aeb2e2 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sun, 27 Apr 2025 15:40:25 -0600 Subject: [PATCH 28/43] hotfix: Rename posix_stragglers to posix_data_stragglers --- drishti/handlers/darshan_util.py | 12 ++++++------ drishti/handlers/handle_darshan.py | 9 ++++----- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 3cfb5d4..ee71997 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -320,7 +320,7 @@ class DarshanFile: _posix_write_random: Optional[int] = None _posix_long_metadata_count: Optional[int] = None - _posix_stragglers_count: Optional[int] = None + _posix_data_stragglers_count: Optional[int] = None access_pattern: Optional[AccessPatternStats] = None @@ -759,7 +759,7 @@ def posix_long_metadata_count(self) -> int: return self._posix_long_metadata_count @property - def posix_stragglers_df(self) -> pd.DataFrame: + def posix_data_stragglers_df(self) -> pd.DataFrame: shared_files = self.posix_shared_files_df detected_files = [] @@ -794,7 +794,7 @@ def posix_stragglers_df(self) -> pd.DataFrame: return detected_files @cached_property - def posix_stragglers_count(self) -> int: - if self._posix_stragglers_count is None: - self._posix_stragglers_count = len(self.posix_stragglers_df) - return self._posix_stragglers_count + def posix_data_stragglers_count(self) -> int: + if self._posix_data_stragglers_count is None: + self._posix_data_stragglers_count = len(self.posix_data_stragglers_df) + return self._posix_data_stragglers_count diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index ec54012..67deb1b 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -606,22 +606,21 @@ def handler(): column_names = ['id', 'data_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - assert stragglers_count == darshan_file_obj.posix_stragglers_count, f"{stragglers_count} != {darshan_file_obj.posix_stragglers_count}" - assert detected_files.equals(darshan_file_obj.posix_stragglers_df), f"{detected_files} != {darshan_file_obj.posix_stragglers_df}" + assert stragglers_count == darshan_file_obj.posix_data_stragglers_count, f"{stragglers_count} != {darshan_file_obj.posix_data_stragglers_count}" + assert detected_files.equals(darshan_file_obj.posix_data_stragglers_df), f"{detected_files} != {darshan_file_obj.posix_data_stragglers_df}" assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" assert dxt_posix == darshan_file_obj.dxt_posix_df, f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}" assert dxt_posix_read_data == darshan_file_obj.dxt_posix_read_df, f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}" assert dxt_posix_write_data == darshan_file_obj.dxt_posix_write_df, 
f"{dxt_posix_write_data} != {darshan_file_obj.dxt_posix_write_df}" # module.check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) module.check_shared_data_imblance( - stragglers_count=darshan_file_obj.posix_stragglers_count, - detected_files=darshan_file_obj.posix_stragglers_df, + stragglers_count=darshan_file_obj.posix_data_stragglers_count, + detected_files=darshan_file_obj.posix_data_stragglers_df, file_map=darshan_file_obj.file_map, dxt_posix=darshan_file_obj.dxt_posix_df, dxt_posix_read_data = darshan_file_obj.dxt_posix_read_df, dxt_posix_write_data = darshan_file_obj.dxt_posix_write_df ) - sys.exit(2) # POSIX_F_FASTEST_RANK_TIME # POSIX_F_SLOWEST_RANK_TIME From 668b1e123171596ff525a2d62f1fb7e6c2d126da Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sun, 27 Apr 2025 15:50:24 -0600 Subject: [PATCH 29/43] refactor: Add cached properties for POSIX time stragglers count and DataFrame --- drishti/handlers/darshan_util.py | 39 ++++++++++++++++++++++++++++++ drishti/handlers/handle_darshan.py | 13 +++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index ee71997..bca2800 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -321,6 +321,7 @@ class DarshanFile: _posix_long_metadata_count: Optional[int] = None _posix_data_stragglers_count: Optional[int] = None + _posix_time_stragglers_count: Optional[int] = None access_pattern: Optional[AccessPatternStats] = None @@ -798,3 +799,41 @@ def posix_data_stragglers_count(self) -> int: if self._posix_data_stragglers_count is None: self._posix_data_stragglers_count = len(self.posix_data_stragglers_df) return self._posix_data_stragglers_count + + @property + def posix_time_stragglers_df(self) -> pd.DataFrame: + df = self.report.records[ModuleType.POSIX].to_df() + + shared_files_times = df['fcounters'].loc[(df['fcounters']['rank'] == -1)] + + # Get the files responsible + detected_files = [] + + # stragglers_count = 0 + # stragglers_imbalance = {} + + shared_files_times = shared_files_times.assign(id=lambda d: d['id'].astype(str)) + + for index, row in shared_files_times.iterrows(): + total_transfer_time = row['POSIX_F_WRITE_TIME'] + row['POSIX_F_READ_TIME'] + row['POSIX_F_META_TIME'] + + if total_transfer_time and abs( + row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > \ + config.thresholds['imbalance_stragglers'][0]: + # stragglers_count += 1 + + detected_files.append([ + row['id'], + abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time * 100 + ]) + + column_names = ['id', 'time_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files + + @cached_property + def posix_time_stragglers_count(self) -> int: + if self._posix_time_stragglers_count is None: + self._posix_time_stragglers_count = len(self.posix_time_stragglers_df) + return self._posix_time_stragglers_count diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 67deb1b..e036e88 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -648,7 +648,18 @@ def handler(): column_names = ['id', 'time_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - module.check_shared_time_imbalance(stragglers_count, detected_files, file_map) + + assert stragglers_count == 
darshan_file_obj.posix_time_stragglers_count, f"{stragglers_count} != {darshan_file_obj.posix_time_stragglers_count}" + assert detected_files.equals(darshan_file_obj.posix_time_stragglers_df), f"{detected_files} != {darshan_file_obj.posix_time_stragglers_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + + # module.check_shared_time_imbalance(stragglers_count, detected_files, file_map) + module.check_shared_time_imbalance( + stragglers_count=darshan_file_obj.posix_time_stragglers_count, + detected_files=darshan_file_obj.posix_time_stragglers_df, + file_map=darshan_file_obj.file_map, + ) + sys.exit(2) aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] From f7fe2c41c12889c696259c1f25630150cd30dedd Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sun, 27 Apr 2025 15:59:22 -0600 Subject: [PATCH 30/43] refactor: Add cached property for write imbalance count and corresponding DataFrame --- drishti/handlers/darshan_util.py | 43 ++++++++++++++++++++++++++++++ drishti/handlers/handle_darshan.py | 19 +++++++++++-- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index bca2800..2f9a2ad 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -322,6 +322,7 @@ class DarshanFile: _posix_long_metadata_count: Optional[int] = None _posix_data_stragglers_count: Optional[int] = None _posix_time_stragglers_count: Optional[int] = None + _posix_write_imbalance_count: Optional[int] = None access_pattern: Optional[AccessPatternStats] = None @@ -837,3 +838,45 @@ def posix_time_stragglers_count(self) -> int: if self._posix_time_stragglers_count is None: self._posix_time_stragglers_count = len(self.posix_time_stragglers_df) return self._posix_time_stragglers_count + + @property + def posix_write_imbalance_df(self) -> pd.DataFrame: + df = self.report.records[ModuleType.POSIX].to_df() + + aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ + ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] + ].groupby('id', as_index=False).agg({ + 'rank': 'nunique', + 'POSIX_BYTES_WRITTEN': ['sum', 'min', 'max'], + 'POSIX_BYTES_READ': ['sum', 'min', 'max'] + }) + + aggregated.columns = list(map('_'.join, aggregated.columns.values)) + + aggregated = aggregated.assign(id=lambda d: d['id_'].astype(str)) + + # Get the files responsible + imbalance_count = 0 + + detected_files = [] + + for index, row in aggregated.iterrows(): + if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / \ + row['POSIX_BYTES_WRITTEN_max'] > config.thresholds['imbalance_size'][0]: + imbalance_count += 1 + + detected_files.append([ + row['id'], abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row[ + 'POSIX_BYTES_WRITTEN_max'] * 100 + ]) + + column_names = ['id', 'write_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files + + @cached_property + def posix_write_imbalance_count(self) -> int: + if self._posix_write_imbalance_count is None: + self._posix_write_imbalance_count = len(self.posix_write_imbalance_df) + return self._posix_write_imbalance_count diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index e036e88..57a1e03 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -659,7 +659,6 @@ def handler(): 
detected_files=darshan_file_obj.posix_time_stragglers_df, file_map=darshan_file_obj.file_map, ) - sys.exit(2) aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] @@ -688,7 +687,23 @@ def handler(): column_names = ['id', 'write_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - module.check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_write_data) + + assert imbalance_count == darshan_file_obj.posix_write_imbalance_count, f"{imbalance_count} != {darshan_file_obj.posix_write_imbalance_count}" + assert detected_files.equals(darshan_file_obj.posix_write_imbalance_df), f"{detected_files} != {darshan_file_obj.posix_write_imbalance_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + assert dxt_posix == darshan_file_obj.dxt_posix_df, f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}" + assert dxt_posix_read_data == darshan_file_obj.dxt_posix_read_df, f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}" + assert dxt_posix_write_data == darshan_file_obj.dxt_posix_write_df, f"{dxt_posix_write_data} != {darshan_file_obj.dxt_posix_write_df}" + + # module.check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_write_data) + module.check_individual_write_imbalance( + imbalance_count=darshan_file_obj.posix_write_imbalance_count, + detected_files=darshan_file_obj.posix_write_imbalance_df, + file_map=darshan_file_obj.file_map, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df + ) + sys.exit(2) imbalance_count = 0 From 90fcf227326b9338164ba99283e5e11c443e6941 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 28 Apr 2025 14:22:10 -0600 Subject: [PATCH 31/43] refactor: Add cached property for read imbalance count and corresponding DataFrame --- drishti/handlers/darshan_util.py | 43 ++++++++++++++++++++++++++++++ drishti/handlers/handle_darshan.py | 18 +++++++++++-- 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 2f9a2ad..64075ee 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -323,6 +323,7 @@ class DarshanFile: _posix_data_stragglers_count: Optional[int] = None _posix_time_stragglers_count: Optional[int] = None _posix_write_imbalance_count: Optional[int] = None + _posix_read_imbalance_count: Optional[int] = None access_pattern: Optional[AccessPatternStats] = None @@ -880,3 +881,45 @@ def posix_write_imbalance_count(self) -> int: if self._posix_write_imbalance_count is None: self._posix_write_imbalance_count = len(self.posix_write_imbalance_df) return self._posix_write_imbalance_count + + @property + def posix_read_imbalance_df(self) -> pd.DataFrame: + df = self.report.records[ModuleType.POSIX].to_df() + + aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ + ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] + ].groupby('id', as_index=False).agg({ + 'rank': 'nunique', + 'POSIX_BYTES_WRITTEN': ['sum', 'min', 'max'], + 'POSIX_BYTES_READ': ['sum', 'min', 'max'] + }) + + aggregated.columns = list(map('_'.join, aggregated.columns.values)) + + aggregated = aggregated.assign(id=lambda d: d['id_'].astype(str)) + + + imbalance_count = 0 + + detected_files = [] + + for index, row in aggregated.iterrows(): + if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - 
row['POSIX_BYTES_READ_min']) / row[ + 'POSIX_BYTES_READ_max'] > config.thresholds['imbalance_size'][0]: + imbalance_count += 1 + + detected_files.append([ + row['id'], + abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] * 100 + ]) + + column_names = ['id', 'read_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files + + @cached_property + def posix_read_imbalance_count(self) -> int: + if self._posix_read_imbalance_count is None: + self._posix_read_imbalance_count = len(self.posix_read_imbalance_df) + return self._posix_read_imbalance_count \ No newline at end of file diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 57a1e03..b576781 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -703,7 +703,6 @@ def handler(): dxt_posix=darshan_file_obj.dxt_posix_df, dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df ) - sys.exit(2) imbalance_count = 0 @@ -719,7 +718,22 @@ def handler(): column_names = ['id', 'read_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - module.check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_read_data) + + assert imbalance_count == darshan_file_obj.posix_read_imbalance_count, f"{imbalance_count} != {darshan_file_obj.posix_read_imbalance_count}" + assert detected_files.equals(darshan_file_obj.posix_read_imbalance_df), f"{detected_files} != {darshan_file_obj.posix_read_imbalance_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + assert dxt_posix == darshan_file_obj.dxt_posix_df, f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}" + assert dxt_posix_read_data == darshan_file_obj.dxt_posix_read_df, f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}" + + # module.check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_read_data) + module.check_individual_read_imbalance( + imbalance_count=darshan_file_obj.posix_read_imbalance_count, + detected_files=darshan_file_obj.posix_read_imbalance_df, + file_map=darshan_file_obj.file_map, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df + ) + sys.exit(2) ######################################################################################################################################################################### From e8bece15aca7f25611d17cbe9fb866c324e7f9d9 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 28 Apr 2025 15:06:08 -0600 Subject: [PATCH 32/43] hotfix: Store MPI-IO ops in `DarshanFile.io_stats` --- drishti/handlers/darshan_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 64075ee..34c6658 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -420,8 +420,8 @@ def io_stats(self) -> IOStatistics: read=mpiio_read_size, write=mpiio_write_size ) - mpiio_read_ops = -1 - mpiio_write_ops = -1 + mpiio_read_ops = counters['MPIIO_INDEP_READS'].sum() + counters['MPIIO_COLL_READS'].sum() + mpiio_write_ops = counters['MPIIO_INDEP_WRITES'].sum() + counters['MPIIO_COLL_WRITES'].sum() ops[ModuleType.MPIIO] = IOOperation( read=mpiio_read_ops, write=mpiio_write_ops ) From 411ad9f3ede92f1ee3c6ecc0cecb3805691873bd Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 28 Apr 2025 15:38:55 -0600 Subject: 
[PATCH 33/43] refactor: Add cached properties for MPI collective and independent I/O operations --- drishti/handlers/darshan_util.py | 63 ++++++++++++++++++++++++++++-- drishti/handlers/handle_darshan.py | 23 ++++++++++- 2 files changed, 81 insertions(+), 5 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 34c6658..88e1aeb 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -346,8 +346,8 @@ class DarshanFile: aggregated: Optional[pd.DataFrame] = None - mpi_coll_ops: Optional[MPICollectiveIOStats] = None - mpi_indep_ops: Optional[MPIIndependentIOStats] = None + _mpi_coll_ops: Optional[MPICollectiveIOStats] = None + _mpi_indep_ops: Optional[MPIIndependentIOStats] = None detected_files_mpi_coll_reads: Optional[pd.DataFrame] = None detected_files_mpi_coll_writes: Optional[pd.DataFrame] = None @@ -922,4 +922,61 @@ def posix_read_imbalance_df(self) -> pd.DataFrame: def posix_read_imbalance_count(self) -> int: if self._posix_read_imbalance_count is None: self._posix_read_imbalance_count = len(self.posix_read_imbalance_df) - return self._posix_read_imbalance_count \ No newline at end of file + return self._posix_read_imbalance_count + + @cached_property + def mpi_coll_ops(self) -> MPICollectiveIOStats: + if self._mpi_coll_ops is None: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + mpi_coll_reads = mpi_df['counters']['MPIIO_COLL_READS'].sum() + mpiio_coll_writes = mpi_df['counters']['MPIIO_COLL_WRITES'].sum() + self._mpi_coll_ops = MPICollectiveIOStats(read=mpi_coll_reads, write=mpiio_coll_writes) + return self._mpi_coll_ops + + @cached_property + def mpi_indep_ops(self) -> MPIIndependentIOStats: + if self._mpi_indep_ops is None: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + mpi_indep_reads = mpi_df['counters']['MPIIO_INDEP_READS'].sum() + mpi_indep_writes = mpi_df['counters']['MPIIO_INDEP_WRITES'].sum() + self._mpi_indep_ops = MPIIndependentIOStats(read=mpi_indep_reads, write=mpi_indep_writes) + return self._mpi_indep_ops + + @property + def mpi_read_df(self) -> pd.DataFrame: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + counters = mpi_df['counters'] + mpi_coll_reads = self.mpi_coll_ops.read + mpi_total_reads = self.io_stats.get_module_ops(ModuleType.MPIIO, "read") + + detected_files = [] + + if mpi_coll_reads == 0 and mpi_total_reads and mpi_total_reads > \ + config.thresholds['collective_operations_absolute'][0]: + files = pd.DataFrame(counters.groupby('id').sum()).reset_index() + for index, row in counters.iterrows(): + if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and + row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + config.thresholds['collective_operations'][0] and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + config.thresholds['collective_operations_absolute'][0]): + detected_files.append([ + row['id'], row['MPIIO_INDEP_READS'], + row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 + ]) + + column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files + + @property + def dxt_mpi_df(self) -> Optional[pd.DataFrame]: + if not parser.args.backtrace: + return None + if "DXT_MPIIO" not in self.modules: + return None + + dxt_mpiio = self.report.records["DXT_MPIIO"].to_df() + dxt_mpiio = pd.DataFrame(dxt_mpiio) + return dxt_mpiio \ No newline at end of file diff --git 
a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index b576781..f61e8ca 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -733,7 +733,6 @@ def handler(): dxt_posix=darshan_file_obj.dxt_posix_df, dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df ) - sys.exit(2) ######################################################################################################################################################################### @@ -768,7 +767,27 @@ def handler(): column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] detected_files = pd.DataFrame(detected_files, columns=column_names) - module.check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio) + assert mpiio_coll_reads == darshan_file_obj.mpi_coll_ops.read, f"{mpiio_coll_reads} != {darshan_file_obj.mpi_coll_ops.read}" + assert mpiio_indep_reads == darshan_file_obj.mpi_indep_ops.read, f"{mpiio_indep_reads} != {darshan_file_obj.mpi_indep_ops.read}" + assert total_mpiio_read_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"), f"{total_mpiio_read_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read")}" + assert detected_files.equals(darshan_file_obj.mpi_read_df), f"{detected_files} != {darshan_file_obj.mpi_read_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + if dxt_mpiio is None: + assert dxt_mpiio is None, f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" + assert darshan_file_obj.dxt_mpi_df is None, f"{darshan_file_obj.dxt_mpi_df} != {dxt_mpiio}" + else: + assert dxt_mpiio.equals(darshan_file_obj.dxt_mpi_df), f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" + + # module.check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio) + module.check_mpi_collective_read_operation( + mpiio_coll_reads=darshan_file_obj.mpi_coll_ops.read, + mpiio_indep_reads=darshan_file_obj.mpi_indep_ops.read, + total_mpiio_read_operations=darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"), + detected_files=darshan_file_obj.mpi_read_df, + file_map=darshan_file_obj.file_map, + dxt_mpiio=darshan_file_obj.dxt_mpi_df + ) + sys.exit(2) df_mpiio_collective_writes = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)] From 232f049281164e7ebe4c349c075835b7fbfa8723 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 28 Apr 2025 17:47:08 -0600 Subject: [PATCH 34/43] refactor: Add mpi_read_df property and enhance assertions for independent writes --- drishti/handlers/darshan_util.py | 32 ++++++++++++++++++++++++++++- drishti/handlers/handle_darshan.py | 33 +++++++++++++++++++++++++++--- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 88e1aeb..aad3dff 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -979,4 +979,34 @@ def dxt_mpi_df(self) -> Optional[pd.DataFrame]: dxt_mpiio = self.report.records["DXT_MPIIO"].to_df() dxt_mpiio = pd.DataFrame(dxt_mpiio) - return dxt_mpiio \ No newline at end of file + return dxt_mpiio + + @property + def mpi_write_df(self) -> pd.DataFrame: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + counters = mpi_df['counters'] + + mpi_coll_writes = self.mpi_coll_ops.write + total_mpiio_write_operations = 
self.io_stats.get_module_ops(ModuleType.MPIIO, "write") + + + detected_files = [] + if mpi_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > \ + config.thresholds['collective_operations_absolute'][0]: + files = pd.DataFrame(counters.groupby('id').sum()).reset_index() + + for index, row in counters.iterrows(): + if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and + row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + config.thresholds['collective_operations'][0] and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + config.thresholds['collective_operations_absolute'][0]): + detected_files.append([ + row['id'], row['MPIIO_INDEP_WRITES'], + row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 + ]) + + column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files \ No newline at end of file diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index f61e8ca..5381d4f 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -770,7 +770,11 @@ def handler(): assert mpiio_coll_reads == darshan_file_obj.mpi_coll_ops.read, f"{mpiio_coll_reads} != {darshan_file_obj.mpi_coll_ops.read}" assert mpiio_indep_reads == darshan_file_obj.mpi_indep_ops.read, f"{mpiio_indep_reads} != {darshan_file_obj.mpi_indep_ops.read}" assert total_mpiio_read_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"), f"{total_mpiio_read_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read")}" - assert detected_files.equals(darshan_file_obj.mpi_read_df), f"{detected_files} != {darshan_file_obj.mpi_read_df}" + if detected_files.empty: + assert detected_files.empty, f"{detected_files} != {darshan_file_obj.mpi_read_df}" + assert darshan_file_obj.mpi_read_df.empty, f"{darshan_file_obj.mpi_read_df} != {detected_files}" + else: + assert detected_files.equals(darshan_file_obj.mpi_read_df), f"{detected_files} != {darshan_file_obj.mpi_read_df}" assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" if dxt_mpiio is None: assert dxt_mpiio is None, f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" @@ -787,7 +791,6 @@ def handler(): file_map=darshan_file_obj.file_map, dxt_mpiio=darshan_file_obj.dxt_mpi_df ) - sys.exit(2) df_mpiio_collective_writes = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)] @@ -812,7 +815,31 @@ def handler(): column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] detected_files = pd.DataFrame(detected_files, columns=column_names) - module.check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio) + assert mpiio_indep_writes == darshan_file_obj.mpi_indep_ops.write, f"{mpiio_indep_writes} != {darshan_file_obj.mpi_indep_ops.write}" + assert mpiio_coll_writes == darshan_file_obj.mpi_coll_ops.write, f"{mpiio_coll_writes} != {darshan_file_obj.mpi_coll_ops.write}" + assert total_mpiio_write_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write"), f"{total_mpiio_write_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write")}" + if detected_files.empty: + assert detected_files.empty, f"{detected_files} !={darshan_file_obj.mpi_write_df}" + assert 
darshan_file_obj.mpi_write_df.empty, f"{darshan_file_obj.mpi_write_df} != {detected_files}" + else: + assert detected_files.equals(darshan_file_obj.mpi_write_df), f"{detected_files} != {darshan_file_obj.mpi_write_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + if dxt_mpiio is None: + assert dxt_mpiio is None, f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" + assert darshan_file_obj.dxt_mpi_df is None, f"{darshan_file_obj.dxt_mpi_df} != {dxt_mpiio}" + else: + assert dxt_mpiio.equals(darshan_file_obj.dxt_mpi_df), f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" + + # module.check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio) + module.check_mpi_collective_write_operation( + mpiio_coll_writes=darshan_file_obj.mpi_coll_ops.write, + mpiio_indep_writes=darshan_file_obj.mpi_indep_ops.write, + total_mpiio_write_operations=darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write"), + detected_files=darshan_file_obj.mpi_write_df, + file_map=darshan_file_obj.file_map, + dxt_mpiio=darshan_file_obj.dxt_mpi_df, + ) + sys.exit(2) ######################################################################################################################################################################### From 1dfe5e05302b251044cab6f8667a03c7d16d0360 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 30 Apr 2025 18:06:27 -0600 Subject: [PATCH 35/43] refactor: Add cached properties for MPI non-blocking I/O operations and HDF5 extension check --- drishti/handlers/darshan_util.py | 26 +++++++++++++++++++++++--- drishti/handlers/handle_darshan.py | 15 +++++++++++++-- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index aad3dff..3afa740 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -334,9 +334,9 @@ class DarshanFile: count_long_metadata: Optional[int] = None posix_shared_data_imbalance_stragglers_count: Optional[int] = None - has_hdf5_extension: Optional[bool] = None + _has_hdf5_extension: Optional[bool] = None - mpiio_nb_ops: Optional[MPIIONonBlockingStats] = None + _mpiio_nb_ops: Optional[MPIIONonBlockingStats] = None cb_nodes: Optional[int] = None number_of_compute_nodes: Optional[int] = None @@ -1009,4 +1009,24 @@ def mpi_write_df(self) -> pd.DataFrame: column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] detected_files = pd.DataFrame(detected_files, columns=column_names) - return detected_files \ No newline at end of file + return detected_files + + @cached_property + def mpiio_nb_ops(self) -> MPIIONonBlockingStats: + if self._mpiio_nb_ops is None: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + mpi_nb_reads = mpi_df['counters']['MPIIO_NB_READS'].sum() + mpi_nb_writes = mpi_df['counters']['MPIIO_NB_WRITES'].sum() + self._mpiio_nb_ops = MPIIONonBlockingStats(read=mpi_nb_reads, write=mpi_nb_writes) + return self._mpiio_nb_ops + + @cached_property + def has_hdf5_extension(self) -> bool: + if self._has_hdf5_extension is None: + self._has_hdf5_extension = False + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + for index, row in mpi_df['counters'].iterrows(): + if self.file_map[int(row['id'])].endswith('.h5') or self.file_map[int(row['id'])].endswith('.hdf5'): + self._has_hdf5_extension = True + # break + return self._has_hdf5_extension diff --git a/drishti/handlers/handle_darshan.py 
b/drishti/handlers/handle_darshan.py index 5381d4f..2ffcacd 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -839,7 +839,6 @@ def handler(): file_map=darshan_file_obj.file_map, dxt_mpiio=darshan_file_obj.dxt_mpi_df, ) - sys.exit(2) ######################################################################################################################################################################### @@ -856,7 +855,19 @@ def handler(): mpiio_nb_reads = df_mpiio['counters']['MPIIO_NB_READS'].sum() mpiio_nb_writes = df_mpiio['counters']['MPIIO_NB_WRITES'].sum() - module.check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules) + assert mpiio_nb_reads == darshan_file_obj.mpiio_nb_ops.read + assert mpiio_nb_writes == darshan_file_obj.mpiio_nb_ops.write + assert modules.keys() == darshan_file_obj.modules, f"{modules.keys()} != {darshan_file_obj.modules}" + assert has_hdf5_extension == darshan_file_obj.has_hdf5_extension, f"{has_hdf5_extension} != {darshan_file_obj.has_hdf5_extension}" + + # module.check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules) + module.check_mpi_none_block_operation( + mpiio_nb_reads=darshan_file_obj.mpiio_nb_ops.read, + mpiio_nb_writes=darshan_file_obj.mpiio_nb_ops.write, + has_hdf5_extension=darshan_file_obj.has_hdf5_extension, + modules=darshan_file_obj.modules, + ) + sys.exit(2) ######################################################################################################################################################################### From 2d625287b5c0e082cdd9fa90016a4ed9f700f665 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 5 May 2025 16:51:27 -0600 Subject: [PATCH 36/43] chore: Add Python version 3.8 to the project --- .python-version | 1 + 1 file changed, 1 insertion(+) create mode 100644 .python-version diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..cc1923a --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.8 From 0ec5e9f4cbb1797b7224716b903612adf8edd653 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 5 May 2025 16:53:13 -0600 Subject: [PATCH 37/43] hotfix: Fix f-string formatting in assertions for MPI I/O operations Required for Python 3.8 compatibility --- drishti/handlers/handle_darshan.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 2ffcacd..12cdf48 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -500,7 +500,7 @@ def handler(): assert read_consecutive == darshan_file_obj.posix_read_consecutive assert read_sequential == darshan_file_obj.posix_read_sequential assert read_random == darshan_file_obj.posix_read_random, f"{read_random} != {darshan_file_obj.posix_read_random}" - assert total_reads == darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"read"), f"{total_reads} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read")}" + assert total_reads == darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"read"), f"{total_reads} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, 'read')}" assert write_consecutive == darshan_file_obj.posix_write_consecutive assert write_sequential == darshan_file_obj.posix_write_sequential assert write_random == darshan_file_obj.posix_write_random @@ -769,7 +769,7 @@ def handler(): assert mpiio_coll_reads == darshan_file_obj.mpi_coll_ops.read, f"{mpiio_coll_reads} != 
{darshan_file_obj.mpi_coll_ops.read}" assert mpiio_indep_reads == darshan_file_obj.mpi_indep_ops.read, f"{mpiio_indep_reads} != {darshan_file_obj.mpi_indep_ops.read}" - assert total_mpiio_read_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"), f"{total_mpiio_read_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read")}" + assert total_mpiio_read_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"), f"{total_mpiio_read_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, 'read')}" if detected_files.empty: assert detected_files.empty, f"{detected_files} != {darshan_file_obj.mpi_read_df}" assert darshan_file_obj.mpi_read_df.empty, f"{darshan_file_obj.mpi_read_df} != {detected_files}" @@ -817,7 +817,7 @@ def handler(): assert mpiio_indep_writes == darshan_file_obj.mpi_indep_ops.write, f"{mpiio_indep_writes} != {darshan_file_obj.mpi_indep_ops.write}" assert mpiio_coll_writes == darshan_file_obj.mpi_coll_ops.write, f"{mpiio_coll_writes} != {darshan_file_obj.mpi_coll_ops.write}" - assert total_mpiio_write_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write"), f"{total_mpiio_write_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write")}" + assert total_mpiio_write_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write"), f"{total_mpiio_write_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, 'write')}" if detected_files.empty: assert detected_files.empty, f"{detected_files} !={darshan_file_obj.mpi_write_df}" assert darshan_file_obj.mpi_write_df.empty, f"{darshan_file_obj.mpi_write_df} != {detected_files}" From ab905650245782cfff0f4eedb0cd91653c31fa20 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 5 May 2025 16:54:16 -0600 Subject: [PATCH 38/43] refactor: Add cached properties for compute nodes and enhance assertions in Darshan handling --- drishti/handlers/darshan_util.py | 57 ++++++++++++++++++++++++++++-- drishti/handlers/handle_darshan.py | 9 +++-- 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 3afa740..5b90891 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -1,9 +1,14 @@ +import csv import datetime +import io +import subprocess +import sys import typing from dataclasses import dataclass, field from enum import Enum from functools import cached_property from os import write +from shlex import shlex from typing import Dict, Final, Optional, Union, List, Tuple, Iterable import numpy as np @@ -338,8 +343,8 @@ class DarshanFile: _mpiio_nb_ops: Optional[MPIIONonBlockingStats] = None - cb_nodes: Optional[int] = None - number_of_compute_nodes: Optional[int] = None + _cb_nodes: Optional[int] = None + _number_of_compute_nodes: Optional[int] = None hints: Optional[List[str]] = None timestamp: Optional[TimeSpan] = None @@ -1030,3 +1035,51 @@ def has_hdf5_extension(self) -> bool: self._has_hdf5_extension = True # break return self._has_hdf5_extension + + @cached_property + def cb_nodes(self) -> int: + if self._cb_nodes is None: + assert ModuleType.MPIIO in self.modules, "Missing MPIIO module" + hints = "" + if 'h' in self.report.metadata['job']['metadata']: + hints = self.report.metadata['job']['metadata']['h'] + if hints: + hints = hints.split(';') + + cb_nodes = None + + for hint in hints: + if hint != 'no': + (key, value) = hint.split('=') + + if key == 
'cb_nodes': + cb_nodes = value + return self._cb_nodes + + @cached_property + def number_of_compute_nodes(self) -> int: + if self._number_of_compute_nodes is None: + assert ModuleType.MPIIO in self.modules, "Missing MPIIO module" + command = 'sacct --job {} --format=JobID,JobIDRaw,NNodes,NCPUs --parsable2 --delimiter ","'.format( + self.report.metadata['job']['jobid'] + ) + arguments = shlex.split(command) + + try: + result = subprocess.run(arguments, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if result.returncode == 0: + # We have successfully fetched the information from SLURM + db = csv.DictReader(io.StringIO(result.stdout.decode('utf-8'))) + + try: + first = next(db) + + if 'NNodes' in first: + self._number_of_compute_nodes = first['NNodes'] + + except StopIteration: + pass + except FileNotFoundError: + pass + return self._number_of_compute_nodes diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 12cdf48..b5d166f 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -867,7 +867,6 @@ def handler(): has_hdf5_extension=darshan_file_obj.has_hdf5_extension, modules=darshan_file_obj.modules, ) - sys.exit(2) ######################################################################################################################################################################### @@ -915,8 +914,14 @@ def handler(): if 'NNodes' in first: NUMBER_OF_COMPUTE_NODES = first['NNodes'] + assert cb_nodes == darshan_file_obj.cb_nodes, f"{cb_nodes} != {darshan_file_obj.cb_nodes}" + assert NUMBER_OF_COMPUTE_NODES == darshan_file_obj.number_of_compute_nodes, f"{NUMBER_OF_COMPUTE_NODES} != {darshan_file_obj.number_of_compute_nodes}" # Do we have one MPI-IO aggregator per node? - module.check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES) + # module.check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES) + module.check_mpi_aggregator( + cb_nodes=darshan_file_obj.cb_nodes, + NUMBER_OF_COMPUTE_NODES=darshan_file_obj.number_of_compute_nodes + ) except StopIteration: pass except FileNotFoundError: From a92723ac5adb920873b44f7eaa7ad07e69324ab9 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 5 May 2025 16:54:55 -0600 Subject: [PATCH 39/43] hotfix: Enhance lustre_df method to assert single data frame and return components --- drishti/handlers/darshan_util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 5b90891..f3ec671 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -645,7 +645,10 @@ def file_not_aligned(self) -> int: def lustre_df(self) -> Optional[pd.DataFrame]: if "LUSTRE" not in self.modules: return None - return pd.DataFrame(self.report.records["LUSTRE"].to_df()) + lustre_dict = self.report.records["LUSTRE"].to_df() + assert len(lustre_dict) == 1, f"Expected 1 data frame for LUSTRE, got {len(self.report.records['LUSTRE'].to_df())}" + lustre_df = lustre_dict["components"] + return lustre_df @cached_property def max_read_offset(self) -> int: From d611255d48b7120d5e2a670a989aee0f1ceb6321 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Thu, 8 May 2025 10:36:11 -0600 Subject: [PATCH 40/43] hotfix: Handle KeyError in lustre_df method for backward compatibility with older PyDarshan versions --- drishti/handlers/darshan_util.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index f3ec671..e5b6ca4 100755 
--- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -647,7 +647,11 @@ def lustre_df(self) -> Optional[pd.DataFrame]: return None lustre_dict = self.report.records["LUSTRE"].to_df() assert len(lustre_dict) == 1, f"Expected 1 data frame for LUSTRE, got {len(self.report.records['LUSTRE'].to_df())}" - lustre_df = lustre_dict["components"] + try: + lustre_df = lustre_dict["components"] + except KeyError: + # Using an older PyDarshan version + lustre_df = lustre_dict["counters"] return lustre_df @cached_property From 3142d26a68df5990fa816fb4d8b8b42199f3d3b1 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Thu, 8 May 2025 10:43:16 -0600 Subject: [PATCH 41/43] fix: preserve uint64 ID type in HDF5 extension check Avoids using `iterrows()` which implicitly casts Darshan record IDs to float64, potentially breaking file_map lookups for large IDs. Replaced with `itertuples()` to maintain original dtype and re-enabled `break` for early exit. --- drishti/handlers/darshan_util.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index e5b6ca4..43d9cd3 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -1037,10 +1037,12 @@ def has_hdf5_extension(self) -> bool: if self._has_hdf5_extension is None: self._has_hdf5_extension = False mpi_df = self.report.records[ModuleType.MPIIO].to_df() - for index, row in mpi_df['counters'].iterrows(): - if self.file_map[int(row['id'])].endswith('.h5') or self.file_map[int(row['id'])].endswith('.hdf5'): + # for index, row in mpi_df['counters'].iterrows(): # Implicitly converts all data to np.float64. Problematic for id (np.uint64) + for row in mpi_df['counters'].itertuples(index=False): + # if self.file_map[int(row['id'])].endswith('.h5') or self.file_map[int(row['id'])].endswith('.hdf5'): + if self.file_map[row.id].endswith('.h5') or self.file_map[row.id].endswith('.hdf5'): self._has_hdf5_extension = True - # break + break return self._has_hdf5_extension @cached_property From 4afa525f45bf1720755c084f1c363e343d168b30 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Thu, 8 May 2025 10:47:33 -0600 Subject: [PATCH 42/43] chore(ide): add PyCharm run configurations for reporter.py with sample logs Adds Sample_1 and Sample_2 run configs to easily test `reporter.py` on benchmark Darshan logs using the drishti-io module environment. 
--- .idea/runConfigurations/Sample_1.xml | 26 ++++++++++++++++++++++++++ .idea/runConfigurations/Sample_2.xml | 26 ++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 .idea/runConfigurations/Sample_1.xml create mode 100644 .idea/runConfigurations/Sample_2.xml diff --git a/.idea/runConfigurations/Sample_1.xml b/.idea/runConfigurations/Sample_1.xml new file mode 100644 index 0000000..0bc3377 --- /dev/null +++ b/.idea/runConfigurations/Sample_1.xml @@ -0,0 +1,26 @@ + + + + + \ No newline at end of file diff --git a/.idea/runConfigurations/Sample_2.xml b/.idea/runConfigurations/Sample_2.xml new file mode 100644 index 0000000..3c03139 --- /dev/null +++ b/.idea/runConfigurations/Sample_2.xml @@ -0,0 +1,26 @@ + + + + + \ No newline at end of file From fb97e28b8f3305c043a285c2890f6bbc9f88b554 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Thu, 15 May 2025 15:25:14 -0600 Subject: [PATCH 43/43] hotfix: update total size calculation in check_size_intensive for POSIX module --- drishti/handlers/handle_darshan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index b5d166f..86dcf6f 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -375,7 +375,7 @@ def handler(): # module.check_size_intensive(total_size, total_read_size, total_written_size) module.check_size_intensive( - total_size=darshan_file_obj.io_stats.total_bytes, + total_size=darshan_file_obj.io_stats.posix_size, total_read_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "read"), total_written_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "write"), )
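The refactoring pattern that patches 33 through 38 apply is the same each time: the legacy handler keeps computing a value from the raw counters DataFrame, an assertion verifies that the new cached property reproduces it, and the module check is then invoked through keyword arguments backed by the property. Below is a minimal, self-contained sketch of that parity-check migration; the names FakeDarshanFile, CollectiveIOStats, and run_check are illustrative stand-ins, not the actual Drishti or PyDarshan API.

# Minimal sketch of the parity-check migration pattern (assumed names, not the real API).
from dataclasses import dataclass
from functools import cached_property

import pandas as pd


@dataclass(frozen=True)
class CollectiveIOStats:
    read: int
    write: int


class FakeDarshanFile:
    """Stand-in for the DarshanFile wrapper; holds a counters DataFrame."""

    def __init__(self, counters: pd.DataFrame):
        self._counters = counters

    @cached_property
    def coll_ops(self) -> CollectiveIOStats:
        # Computed once on first access, then memoized by cached_property.
        return CollectiveIOStats(
            read=int(self._counters["MPIIO_COLL_READS"].sum()),
            write=int(self._counters["MPIIO_COLL_WRITES"].sum()),
        )


def run_check(coll_reads: int, coll_writes: int) -> None:
    print(f"collective reads={coll_reads}, writes={coll_writes}")


counters = pd.DataFrame({"MPIIO_COLL_READS": [4, 0], "MPIIO_COLL_WRITES": [2, 2]})
darshan_file = FakeDarshanFile(counters)

# Legacy path: compute directly from the DataFrame, as the old handler did.
legacy_coll_reads = counters["MPIIO_COLL_READS"].sum()
legacy_coll_writes = counters["MPIIO_COLL_WRITES"].sum()

# Parity check before routing the call through the new property.
assert legacy_coll_reads == darshan_file.coll_ops.read
assert legacy_coll_writes == darshan_file.coll_ops.write

run_check(coll_reads=darshan_file.coll_ops.read, coll_writes=darshan_file.coll_ops.write)

The sys.exit(2) call that keeps moving downward in handle_darshan.py appears to serve as a moving checkpoint: everything above the exit has already been migrated and verified against the new properties, everything below still runs only the legacy path.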
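Patch 40's backward-compatibility fallback for the LUSTRE records is a plain try/except over the dictionary returned by to_df(): newer PyDarshan releases expose the data under a "components" key, older ones under "counters". A small sketch under that assumption, with a hand-built dictionary standing in for report.records["LUSTRE"].to_df():

import pandas as pd

def pick_lustre_frame(lustre_dict: dict) -> pd.DataFrame:
    # Prefer the newer PyDarshan layout, fall back to the older one.
    try:
        return lustre_dict["components"]
    except KeyError:
        return lustre_dict["counters"]

old_style = {"counters": pd.DataFrame({"id": [1], "LUSTRE_STRIPE_WIDTH": [4]})}
new_style = {"components": pd.DataFrame({"id": [1], "LUSTRE_STRIPE_WIDTH": [4]})}
print(pick_lustre_frame(old_style).equals(pick_lustre_frame(new_style)))  # True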
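The dtype hazard that patch 41 removes can be reproduced with a synthetic DataFrame: iterrows() upcasts each row to a single dtype, so a uint64 Darshan record ID above 2^53 loses precision as soon as a float column sits next to it, while itertuples() keeps each column's dtype. The column names and values below are made up and only mimic the MPIIO counters layout.

import numpy as np
import pandas as pd

record_id = np.uint64(2**63 + 105)  # too large for float64 to represent exactly
df = pd.DataFrame(
    {"id": np.array([record_id], dtype=np.uint64), "MPIIO_BYTES_READ": [1024.0]}
)
file_map = {int(record_id): "output.h5"}

for _, row in df.iterrows():
    # The row Series is promoted to float64, so the id is rounded.
    print(int(row["id"]) in file_map)  # False: the lookup key no longer matches

for row in df.itertuples(index=False):
    # Per-column dtypes survive, so the original uint64 id is intact.
    print(int(row.id) in file_map)  # True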