From c3da3e213e0a89b2704351a875a6cfb2fc205d75 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 17 Mar 2025 22:59:12 -0600 Subject: [PATCH 01/43] fmt: `ruff format dristi/includes/module.py` --- drishti/includes/module.py | 2003 +++++++++++++++++++++++------------- 1 file changed, 1314 insertions(+), 689 deletions(-) diff --git a/drishti/includes/module.py b/drishti/includes/module.py index 9c2df16..c0d91ef 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -8,130 +8,178 @@ from rich.syntax import Syntax from drishti.includes.config import * -''' +""" Before calling the functions below Make sure the variables passed are in the given structure: file_map: a dict of (id, path) pair modules: a set or a dict should be ok detected_files: A pandas dataframe -''' +""" # Basic usage check + def check_stdio(total_size, total_size_stdio): - ''' + """ Check whether the application has excessively utilized standard input/output operations Parameters: total_size: total I/O size total_size_stdio: total STDIO size - - ''' - - if total_size and total_size_stdio / total_size > thresholds['interface_stdio'][0]: - thresholds['interface_stdio'][1] = True - issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( - total_size_stdio / total_size * 100.0, - convert_bytes(total_size_stdio) + + """ + + if total_size and total_size_stdio / total_size > thresholds["interface_stdio"][0]: + thresholds["interface_stdio"][1] = True + issue = "Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})".format( + total_size_stdio / total_size * 100.0, convert_bytes(total_size_stdio) ) recommendation = [ { - 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' + "message": "Consider switching to a high-performance I/O interface such as MPI-IO" } ] insights_operation.append( - message(INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation + ) ) def check_mpiio(modules): - ''' + """ Check whether the application has used MPI-IO or not Parameter: modules: all different mudules been used in the application - ''' + """ - if 'MPI-IO' not in modules: - issue = 'Application is using low-performance interface' + if "MPI-IO" not in modules: + issue = "Application is using low-performance interface" recommendation = [ { - 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' + "message": "Consider switching to a high-performance I/O interface such as MPI-IO" } ] insights_operation.append( - message(INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + message( + INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation + ) ) - # POSIX level check def check_operation_intensive(total_operations, total_reads, total_writes): - ''' + """ Check whether the application is read or write intensive Parameters: total_operations: number of I/O operations been executed by the application total_reads: number of read operations been executed by the application total_writes: number of write operations been executed by the application - ''' - - if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > thresholds['imbalance_operations'][0]: - issue = 'Application is write operation intensive ({:.2f}% writes vs. 
{:.2f}% reads)'.format( - total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 + """ + + if ( + total_writes > total_reads + and total_operations + and abs(total_writes - total_reads) / total_operations + > thresholds["imbalance_operations"][0] + ): + issue = "Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)".format( + total_writes / total_operations * 100.0, + total_reads / total_operations * 100.0, ) insights_metadata.append( - message(INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message( + INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, + TARGET_DEVELOPER, + INFO, + issue, + None, + ) ) - if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > thresholds['imbalance_operations'][0]: - issue = 'Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( - total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 + if ( + total_reads > total_writes + and total_operations + and abs(total_writes - total_reads) / total_operations + > thresholds["imbalance_operations"][0] + ): + issue = "Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)".format( + total_writes / total_operations * 100.0, + total_reads / total_operations * 100.0, ) insights_metadata.append( - message(INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message( + INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None + ) ) def check_size_intensive(total_size, total_read_size, total_written_size): - ''' + """ Check whether the application is read size intensive or written size intensive Parameters: total_size: Total I/O size measured in byte total_read_size: Input I/O size measured in byte total_written_size: Output I/O size measured in byte - ''' - - if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > thresholds['imbalance_operations'][0]: - issue = 'Application is write size intensive ({:.2f}% write vs. {:.2f}% read)'.format( - total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 + """ + + if ( + total_written_size > total_read_size + and abs(total_written_size - total_read_size) / total_size + > thresholds["imbalance_operations"][0] + ): + issue = "Application is write size intensive ({:.2f}% write vs. {:.2f}% read)".format( + total_written_size / total_size * 100.0, + total_read_size / total_size * 100.0, ) insights_metadata.append( - message(INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message( + INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None + ) ) - if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > thresholds['imbalance_operations'][0]: - issue = 'Application is read size intensive ({:.2f}% write vs. {:.2f}% read)'.format( - total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 + if ( + total_read_size > total_written_size + and abs(total_written_size - total_read_size) / total_size + > thresholds["imbalance_operations"][0] + ): + issue = "Application is read size intensive ({:.2f}% write vs. 
{:.2f}% read)".format( + total_written_size / total_size * 100.0, + total_read_size / total_size * 100.0, ) insights_metadata.append( - message(INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message( + INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None + ) ) -def check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None): - ''' +def check_small_operation( + total_reads, + total_reads_small, + total_writes, + total_writes_small, + detected_files, + modules, + file_map, + dxt_posix=None, + dxt_posix_read_data=None, + dxt_posix_write_data=None, +): + """ Check whether application has performed an excessive number of small operations Parameters: @@ -139,17 +187,21 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr total_reads_small: number of read operations that has small size total_writes: number of write operations been executed by the application total_writes_small: number of write operations that has small size - detected_files: + detected_files: total_reads and total_writes in each file required columns: ['id', 'total_reads', 'total_writes'] modules: all different mudules been used in the application file_map: file id and file name pairing df_posix: all POSIX records - ''' - - if total_reads_small and total_reads_small / total_reads > thresholds['small_requests'][0] and total_reads_small > thresholds['small_requests_absolute'][0]: - thresholds['small_requests_absolute'][1] = True - issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format( + """ + + if ( + total_reads_small + and total_reads_small / total_reads > thresholds["small_requests"][0] + and total_reads_small > thresholds["small_requests_absolute"][0] + ): + thresholds["small_requests_absolute"][1] = True + issue = "Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests".format( total_reads_small, total_reads_small / total_reads * 100.0 ) @@ -159,63 +211,93 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr dxt_trigger_time = 0 for index, row in detected_files.iterrows(): - if row['total_reads'] > (total_reads * thresholds['small_requests'][0] / 2): + if row["total_reads"] > (total_reads * thresholds["small_requests"][0] / 2): detail.append( { - 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( - row['total_reads'], - row['total_reads'] / total_reads * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({:.2f}%) small read requests are to "{}"'.format( + row["total_reads"], + row["total_reads"] / total_reads * 100.0, + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == int(row['id'])] - - if not temp_df.empty: - temp_df = temp_df.loc[temp_df['length'] < thresholds['small_requests'][0]] - small_read_requests_ranks = temp_df['rank'].unique() - if len(small_read_requests_ranks) > 0: - if len(small_read_requests_ranks) > 1 and int(small_read_requests_ranks[0]) == 0: - rank_df = 
temp.loc[(temp['rank'] == int(small_read_requests_ranks[1]))] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df = dxt_posix_read_data.loc[ + dxt_posix_read_data["id"] == int(row["id"]) + ] + + if not temp_df.empty: + temp_df = temp_df.loc[ + temp_df["length"] < thresholds["small_requests"][0] + ] + small_read_requests_ranks = temp_df["rank"].unique() + if len(small_read_requests_ranks) > 0: + if ( + len(small_read_requests_ranks) > 1 + and int(small_read_requests_ranks[0]) == 0 + ): + rank_df = temp.loc[ + ( + temp["rank"] + == int(small_read_requests_ranks[1]) + ) + ] else: - rank_df = temp.loc[(temp['rank'] == int(small_read_requests_ranks[0]))] - - rank_df = rank_df['read_segments'].iloc[0] - rank_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + rank_df = temp.loc[ + ( + temp["rank"] + == int(small_read_requests_ranks[0]) + ) + ] + + rank_df = rank_df["read_segments"].iloc[0] + rank_addresses = rank_df["stack_memory_addresses"].iloc[ + 0 + ] + address = dxt_posix.iloc[0]["address_line_mapping"][ + "address" + ] res = set(list(address)) & set(rank_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] - + backtrace = dxt_posix.iloc[0][ + "address_line_mapping" + ].loc[ + dxt_posix.iloc[0]["address_line_mapping"][ + "address" + ].isin(res) + ] + if len(small_read_requests_ranks) > 0: detail.append( { - 'message': '{} rank(s) made small read requests in "{}". Below is the backtrace information:'.format( + "message": '{} rank(s) made small read requests in "{}". Below is the backtrace information:'.format( len(small_read_requests_ranks), - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) - + for index, row in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row['function_name'], - row['line_number'] - ) + "message": "{}: {}".format( + row["function_name"], row["line_number"] + ) } ) file_count += 1 else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) @@ -223,40 +305,57 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr time_taken = end - start dxt_trigger_time += time_taken - if dxt_trigger_time > 0: + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation.append( { - 'message': 'Consider buffering read operations into larger more contiguous ones' + "message": "Consider buffering read operations into larger more contiguous ones" } ) - if 'MPI-IO' in modules: + if "MPI-IO" in modules: recommendation.append( { - 'message': 'Since the appplication already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + "message": "Since the appplication already uses MPI-IO, consider using collective I/O calls (e.g. 
MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-read.c"), + line_numbers=True, + background_color="default", + ), } ) else: recommendation.append( { - 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations' + "message": "Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations" } ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) - if total_writes_small and total_writes_small / total_writes > thresholds['small_requests'][0] and total_writes_small > thresholds['small_requests_absolute'][0]: - thresholds['small_requests_absolute'][1] = True - issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests'.format( + if ( + total_writes_small + and total_writes_small / total_writes > thresholds["small_requests"][0] + and total_writes_small > thresholds["small_requests_absolute"][0] + ): + thresholds["small_requests_absolute"][1] = True + issue = "Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests".format( total_writes_small, total_writes_small / total_writes * 100.0 ) @@ -266,106 +365,162 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr recommendation = [] file_count = 0 for index, row in detected_files.iterrows(): - if row['total_writes'] > (total_writes * thresholds['small_requests'][0] / 2): + if row["total_writes"] > ( + total_writes * thresholds["small_requests"][0] / 2 + ): detail.append( { - 'message': '{} ({:.2f}%) small write requests are to "{}"'.format( - row['total_writes'], - row['total_writes'] / total_writes * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({:.2f}%) small write requests are to "{}"'.format( + row["total_writes"], + row["total_writes"] / total_writes * 100.0, + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == int(row['id'])] - - if not temp_df.empty: - temp_df = temp_df.loc[temp_df['length'] < thresholds['small_requests'][0]] - small_write_requests_ranks = temp_df['rank'].unique() + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df = dxt_posix_write_data.loc[ + dxt_posix_write_data["id"] == int(row["id"]) + ] + + if not temp_df.empty: + temp_df = temp_df.loc[ + temp_df["length"] < thresholds["small_requests"][0] + ] + small_write_requests_ranks = temp_df["rank"].unique() if len(small_write_requests_ranks) > 0: - if int(small_write_requests_ranks[0]) == 0 and len(small_write_requests_ranks) > 1: - rank_df = temp.loc[(temp['rank'] == int(small_write_requests_ranks[1]))] + if ( + int(small_write_requests_ranks[0]) == 0 + and len(small_write_requests_ranks) > 1 + ): + rank_df = temp.loc[ + ( + temp["rank"] + == 
int(small_write_requests_ranks[1]) + ) + ] else: - rank_df = temp.loc[(temp['rank'] == int(small_write_requests_ranks[0]))] - - rank_df = temp.loc[(temp['rank'] == int(small_write_requests_ranks[0]))] - rank_df = rank_df['write_segments'].iloc[0] - rank_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + rank_df = temp.loc[ + ( + temp["rank"] + == int(small_write_requests_ranks[0]) + ) + ] + + rank_df = temp.loc[ + (temp["rank"] == int(small_write_requests_ranks[0])) + ] + rank_df = rank_df["write_segments"].iloc[0] + rank_addresses = rank_df["stack_memory_addresses"].iloc[ + 0 + ] + address = dxt_posix.iloc[0]["address_line_mapping"][ + "address" + ] res = set(list(address)) & set(rank_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] - + backtrace = dxt_posix.iloc[0][ + "address_line_mapping" + ].loc[ + dxt_posix.iloc[0]["address_line_mapping"][ + "address" + ].isin(res) + ] + if len(small_write_requests_ranks) > 0: detail.append( { - 'message': '{} rank(s) made small write requests in "{}". Below is the backtrace information:'.format( + "message": '{} rank(s) made small write requests in "{}". Below is the backtrace information:'.format( len(small_write_requests_ranks), - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) - + for index, row in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row['function_name'], - row['line_number'] - ) + "message": "{}: {}".format( + row["function_name"], row["line_number"] + ) } ) - + file_count += 1 else: detail.append( { - 'message': 'The backtrace information for this file is similar to previous files' + "message": "The backtrace information for this file is similar to previous files" } ) end = time.time() time_taken = end - start dxt_trigger_time += time_taken - + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation.append( { - 'message': 'Consider buffering write operations into larger more contiguous ones' + "message": "Consider buffering write operations into larger more contiguous ones" } ) - if 'MPI-IO' in modules: + if "MPI-IO" in modules: recommendation.append( { - 'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + "message": "Since the application already uses MPI-IO, consider using collective I/O calls (e.g. 
MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-write.c"), + line_numbers=True, + background_color="default", + ), } ) else: recommendation.append( { - 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations' + "message": "Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations" } ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) -def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map=None, df_lustre=None, dxt_posix=None, dxt_posix_read_data=None): - ''' +def check_misaligned( + total_operations, + total_mem_not_aligned, + total_file_not_aligned, + modules, + file_map=None, + df_lustre=None, + dxt_posix=None, + dxt_posix_read_data=None, +): + """ Check whether application has excessive misaligned operations Parameters: @@ -373,62 +528,80 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali total_mem_not_aligned: number of memory requests not aligned total_file_not_aligned: number of file requests not aligned modules: all different mudules been used in the application - ''' - - if total_operations and total_mem_not_aligned / total_operations > thresholds['misaligned_requests'][0]: - thresholds['misaligned_requests'][1] = True - issue = 'Application has a high number ({:.2f}%) of misaligned memory requests'.format( + """ + + if ( + total_operations + and total_mem_not_aligned / total_operations + > thresholds["misaligned_requests"][0] + ): + thresholds["misaligned_requests"][1] = True + issue = "Application has a high number ({:.2f}%) of misaligned memory requests".format( total_mem_not_aligned / total_operations * 100.0 ) insights_metadata.append( - message(INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, TARGET_DEVELOPER, HIGH, issue, None) + message( + INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + None, + ) ) - if total_operations and total_file_not_aligned / total_operations > thresholds['misaligned_requests'][0]: - thresholds['misaligned_requests'][1] = True - issue = 'Application issues a high number ({:.2f}%) of misaligned file requests'.format( + if ( + total_operations + and total_file_not_aligned / total_operations + > thresholds["misaligned_requests"][0] + ): + thresholds["misaligned_requests"][1] = True + issue = "Application issues a high number ({:.2f}%) of misaligned file requests".format( total_file_not_aligned / total_operations * 100.0 ) recommendation = [ { - 'message': 'Consider aligning the requests to the file system block boundaries' + "message": "Consider aligning the requests to the file system block boundaries" } ] - if 'HF5' in modules: + if "HF5" in modules: recommendation.append( { - 'message': 'Since the appplication uses HDF5, consider using H5Pset_alignment() in a file access property list', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-alignment.c'), line_numbers=True, background_color='default') + "message": "Since the appplication uses HDF5, consider using H5Pset_alignment() in a file access property list", + "sample": Syntax.from_path( + os.path.join(ROOT, 
"snippets/hdf5-alignment.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'Any file object greater than or equal in size to threshold bytes will be aligned on an address which is a multiple of alignment' - } + "message": "Any file object greater than or equal in size to threshold bytes will be aligned on an address which is a multiple of alignment" + }, ) detail = [] - if 'LUSTRE' in modules: + if "LUSTRE" in modules: # DXT Analysis if args.backtrace: start = time.time() - - if not df_lustre['counters']['LUSTRE_STRIPE_SIZE'].empty: - stripe_size = df_lustre['counters']['LUSTRE_STRIPE_SIZE'].iloc[0] + + if not df_lustre["counters"]["LUSTRE_STRIPE_SIZE"].empty: + stripe_size = df_lustre["counters"]["LUSTRE_STRIPE_SIZE"].iloc[0] else: - stripe_size = df_lustre['counters']['POSIX_FILE_ALIGNMENT'].iloc[0] + stripe_size = df_lustre["counters"]["POSIX_FILE_ALIGNMENT"].iloc[0] file_count = 0 ids = dxt_posix.id.unique().tolist() for id in ids: - temp = dxt_posix.loc[dxt_posix['id'] == id] - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == id] + temp = dxt_posix.loc[dxt_posix["id"] == id] + temp_df = dxt_posix_read_data.loc[dxt_posix_read_data["id"] == id] misaligned_ranks = [] misaligned_ranks_opr = [] - + offsets = temp_df["offsets"].to_numpy().tolist() rank = temp_df["rank"].to_numpy().tolist() operation = temp_df["operation"].to_numpy().tolist() @@ -441,33 +614,46 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali if misaligned_ranks: misaligned_rank_ind = misaligned_ranks[0] misaligned_rank_opr = misaligned_ranks_opr[0] - misaligned_rank_df = temp.loc[(temp['rank'] == int(misaligned_rank_ind))] - if misaligned_rank_opr == 'read': - misaligned_rank_df = misaligned_rank_df['read_segments'].iloc[0] + misaligned_rank_df = temp.loc[ + (temp["rank"] == int(misaligned_rank_ind)) + ] + if misaligned_rank_opr == "read": + misaligned_rank_df = misaligned_rank_df[ + "read_segments" + ].iloc[0] else: - misaligned_rank_df = misaligned_rank_df['write_segments'].iloc[0] - misaligned_rank_stack_addresses = misaligned_rank_df['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + misaligned_rank_df = misaligned_rank_df[ + "write_segments" + ].iloc[0] + misaligned_rank_stack_addresses = misaligned_rank_df[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(misaligned_rank_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] detail.append( { - 'message': '{} rank(s) made misaligned requests in "{}". Below is the backtrace information:'.format( + "message": '{} rank(s) made misaligned requests in "{}". 
Below is the backtrace information:'.format( len(misaligned_ranks), - file_map[id] if args.full_path else os.path.basename(file_map[id]) - ) + file_map[id] + if args.full_path + else os.path.basename(file_map[id]), + ) } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) file_count += 1 @@ -476,23 +662,43 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) recommendation.append( { - 'message': 'Consider using a Lustre alignment that matches the file system stripe configuration', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider using a Lustre alignment that matches the file system stripe configuration", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), } ) insights_metadata.append( - message(INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) -def check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None): - ''' +def check_traffic( + max_read_offset, + total_read_size, + max_write_offset, + total_written_size, + dxt_posix=None, + dxt_posix_read_data=None, + dxt_posix_write_data=None, +): + """ Check whether application has redundant read or write traffic Parameters: @@ -500,10 +706,10 @@ def check_traffic(max_read_offset, total_read_size, max_write_offset, total_writ total_read_size: total size application has been read max_write_offset: max offset application is writing to total_written_size: total size application has been written - ''' + """ if max_read_offset > total_read_size: - issue = 'Application might have redundant read traffic (more data read than the highest offset)' + issue = "Application might have redundant read traffic (more data read than the highest offset)" detail = [] file_count = 0 @@ -513,67 +719,79 @@ def check_traffic(max_read_offset, total_read_size, max_write_offset, total_writ start = time.time() ids = dxt_posix.id.unique().tolist() for id in ids: - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == id] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == id] random_ranks_ind = -1 - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == id] + temp_df = dxt_posix_read_data.loc[dxt_posix_read_data["id"] == id] updated_offsets = (temp_df["offsets"].to_numpy()).tolist() for i in range(len(updated_offsets)): - if updated_offsets.count(updated_offsets[i]) > 1: + if updated_offsets.count(updated_offsets[i]) > 1: redundant_ranks_ind = i break if random_ranks_ind != -1: - random_rank = temp_df.iloc[redundant_ranks_ind]['rank'] - random_offsets = temp_df.iloc[redundant_ranks_ind]['offsets'] - random_start_time = temp_df.iloc[random_ranks_ind]['start_time'] - - temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))] - temp_random_rank = 
temp_random_rank['read_segments'].iloc[0] - random_stack_addresses = temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)] - random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + random_rank = temp_df.iloc[redundant_ranks_ind]["rank"] + random_offsets = temp_df.iloc[redundant_ranks_ind]["offsets"] + random_start_time = temp_df.iloc[random_ranks_ind]["start_time"] + + temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))] + temp_random_rank = temp_random_rank["read_segments"].iloc[0] + random_stack_addresses = temp_random_rank.loc[ + (temp_random_rank["offset"] == random_offsets) + & (temp_random_rank["start_time"] == random_start_time) + ] + random_stack_addresses = random_stack_addresses[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(random_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] - + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] + detail.append( { - 'message': 'The backtrace information for these redundant read call(s) is given below:' + "message": "The backtrace information for these redundant read call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) file_count += 1 else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) end = time.time() time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) insights_metadata.append( - message(INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None) + message( + INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None + ) ) if max_write_offset > total_written_size: - issue = 'Application might have redundant write traffic (more data written than the highest offset)' + issue = "Application might have redundant write traffic (more data written than the highest offset)" detail = [] file_count = 0 @@ -583,70 +801,105 @@ def check_traffic(max_read_offset, total_read_size, max_write_offset, total_writ start = time.time() ids = dxt_posix.id.unique().tolist() for id in ids: - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == id] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == id] random_ranks_ind = -1 - temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == id] + temp_df = dxt_posix_write_data.loc[dxt_posix_write_data["id"] == id] updated_offsets = (temp_df["offsets"].to_numpy()).tolist() for i in range(len(updated_offsets)): - if updated_offsets.count(updated_offsets[i]) > 1: + if updated_offsets.count(updated_offsets[i]) > 1: redundant_ranks_ind = i break if random_ranks_ind != -1: - random_rank = temp_df.iloc[redundant_ranks_ind]['rank'] - random_offsets = 
temp_df.iloc[redundant_ranks_ind]['offsets'] - random_start_time = temp_df.iloc[random_ranks_ind]['start_time'] - - temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))] - temp_random_rank = temp_random_rank['write_segments'].iloc[0] - random_stack_addresses = temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)] - random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + random_rank = temp_df.iloc[redundant_ranks_ind]["rank"] + random_offsets = temp_df.iloc[redundant_ranks_ind]["offsets"] + random_start_time = temp_df.iloc[random_ranks_ind]["start_time"] + + temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))] + temp_random_rank = temp_random_rank["write_segments"].iloc[0] + random_stack_addresses = temp_random_rank.loc[ + (temp_random_rank["offset"] == random_offsets) + & (temp_random_rank["start_time"] == random_start_time) + ] + random_stack_addresses = random_stack_addresses[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(random_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] - + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] + detail.append( { - 'message': 'The backtrace information for these redundant write call(s) is given below:' + "message": "The backtrace information for these redundant write call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) file_count += 1 else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) end = time.time() time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) insights_metadata.append( - message(INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None, detail) + message( + INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, + TARGET_DEVELOPER, + WARN, + issue, + None, + detail, + ) ) insights_metadata.append( - message(INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None) + message( + INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, + TARGET_DEVELOPER, + WARN, + issue, + None, + ) ) -def check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None): - ''' +def check_random_operation( + read_consecutive, + read_sequential, + read_random, + total_reads, + write_consecutive, + write_sequential, + write_random, + total_writes, + dxt_posix=None, + dxt_posix_read_data=None, + dxt_posix_write_data=None, +): + """ Check whether application has performed excessive random operations Parameters: @@ -658,19 +911,23 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total write_sequential: number of sequential write operations 
write_random: number of random write operations total_write: number of write operations been executed by the application - ''' + """ if total_reads: - if read_random and read_random / total_reads > thresholds['random_operations'][0] and read_random > thresholds['random_operations_absolute'][0]: - thresholds['random_operations'][1] = True - thresholds['random_operations_absolute'][1] = True - issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format( + if ( + read_random + and read_random / total_reads > thresholds["random_operations"][0] + and read_random > thresholds["random_operations_absolute"][0] + ): + thresholds["random_operations"][1] = True + thresholds["random_operations_absolute"][1] = True + issue = "Application is issuing a high number ({}) of random read operations ({:.2f}%)".format( read_random, read_random / total_reads * 100.0 ) recommendation = [ { - 'message': 'Consider changing your data model to have consecutive or sequential reads' + "message": "Consider changing your data model to have consecutive or sequential reads" } ] @@ -679,11 +936,11 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total start = time.time() ids = dxt_posix.id.unique().tolist() for id in ids: - temp = dxt_posix.loc[dxt_posix['id'] == id] - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == id] - temp_df = temp_df.sort_values('start_time', ascending=True) + temp = dxt_posix.loc[dxt_posix["id"] == id] + temp_df = dxt_posix_read_data.loc[dxt_posix_read_data["id"] == id] + temp_df = temp_df.sort_values("start_time", ascending=True) random_ranks_ind = -1 - + if not temp_df["offsets"].is_monotonic_increasing: updated_offsets = (temp_df["offsets"].to_numpy()).tolist() cur = 0 @@ -694,64 +951,90 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total cur = updated_offsets[i] if random_ranks_ind != -1: - random_rank = temp_df.iloc[random_ranks_ind]['rank'] - random_offsets = temp_df.iloc[random_ranks_ind]['offsets'] - random_start_time = temp_df.iloc[random_ranks_ind]['start_time'] - temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))] - temp_random_rank = temp_random_rank['read_segments'].iloc[0] - random_stack_addresses = temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)] - random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + random_rank = temp_df.iloc[random_ranks_ind]["rank"] + random_offsets = temp_df.iloc[random_ranks_ind]["offsets"] + random_start_time = temp_df.iloc[random_ranks_ind]["start_time"] + temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))] + temp_random_rank = temp_random_rank["read_segments"].iloc[0] + random_stack_addresses = temp_random_rank.loc[ + (temp_random_rank["offset"] == random_offsets) + & (temp_random_rank["start_time"] == random_start_time) + ] + random_stack_addresses = random_stack_addresses[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(random_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] detail = [] detail.append( { - 'message': 'The backtrace information 
for these random read call(s) is given below:' + "message": "The backtrace information for these random read call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) end = time.time() time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) else: - issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests'.format( + issue = "Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests".format( read_consecutive / total_reads * 100.0, - read_sequential / total_reads * 100.0 + read_sequential / total_reads * 100.0, ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, TARGET_DEVELOPER, OK, issue, None) + message( + INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, + TARGET_DEVELOPER, + OK, + issue, + None, + ) ) if total_writes: - if write_random and write_random / total_writes > thresholds['random_operations'][0] and write_random > thresholds['random_operations_absolute'][0]: - thresholds['random_operations'][1] = True - thresholds['random_operations_absolute'][1] = True - issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)'.format( + if ( + write_random + and write_random / total_writes > thresholds["random_operations"][0] + and write_random > thresholds["random_operations_absolute"][0] + ): + thresholds["random_operations"][1] = True + thresholds["random_operations_absolute"][1] = True + issue = "Application is issuing a high number ({}) of random write operations ({:.2f}%)".format( write_random, write_random / total_writes * 100.0 ) recommendation = [ { - 'message': 'Consider changing your data model to have consecutive or sequential writes' + "message": "Consider changing your data model to have consecutive or sequential writes" } ] @@ -760,10 +1043,10 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total start = time.time() ids = dxt_posix.id.unique().tolist() for id in ids: - temp = dxt_posix.loc[dxt_posix['id'] == id] + temp = dxt_posix.loc[dxt_posix["id"] == id] - temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == id] - temp_df.sort_values('start_time', ascending=True, inplace=True) + temp_df = dxt_posix_write_data.loc[dxt_posix_write_data["id"] == id] + temp_df.sort_values("start_time", ascending=True, inplace=True) random_ranks_ind = -1 if not temp_df["offsets"].is_monotonic_increasing: updated_offsets = (temp_df["offsets"].to_numpy()).tolist() @@ -775,58 +1058,87 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total cur = updated_offsets[i] if random_ranks_ind != -1: - random_rank = temp_df.iloc[random_ranks_ind]['rank'] - random_offsets = temp_df.iloc[random_ranks_ind]['offsets'] - random_start_time = temp_df.iloc[random_ranks_ind]['start_time'] - - temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))] - temp_random_rank = temp_random_rank['write_segments'].iloc[0] - random_stack_addresses = 
temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)] - random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + random_rank = temp_df.iloc[random_ranks_ind]["rank"] + random_offsets = temp_df.iloc[random_ranks_ind]["offsets"] + random_start_time = temp_df.iloc[random_ranks_ind]["start_time"] + + temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))] + temp_random_rank = temp_random_rank["write_segments"].iloc[0] + random_stack_addresses = temp_random_rank.loc[ + (temp_random_rank["offset"] == random_offsets) + & (temp_random_rank["start_time"] == random_start_time) + ] + random_stack_addresses = random_stack_addresses[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(random_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] detail = [] detail.append( { - 'message': 'The backtrace information for these random write call(s) is given below:' + "message": "The backtrace information for these random write call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) - + end = time.time() time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) else: - issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests'.format( + issue = "Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests".format( write_consecutive / total_writes * 100.0, - write_sequential / total_writes * 100.0 + write_sequential / total_writes * 100.0, ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, TARGET_DEVELOPER, OK, issue, None) + message( + INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, + TARGET_DEVELOPER, + OK, + issue, + None, + ) ) -def check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map): - ''' +def check_shared_small_operation( + total_shared_reads, + total_shared_reads_small, + total_shared_writes, + total_shared_writes_small, + shared_files, + file_map, +): + """ Check whether there are excessive small requests in shared files Parameters: @@ -838,113 +1150,182 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t small reads an small writes in each shared file required columns: ['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES'] file_map: file id and file name pairing - ''' - - if total_shared_reads and total_shared_reads_small / total_shared_reads > thresholds['small_requests'][0] and total_shared_reads_small > 
thresholds['small_requests_absolute'][0]: - thresholds['small_requests'][1] = True - thresholds['small_requests_absolute'][1] = True - issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format( - total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0 + """ + + if ( + total_shared_reads + and total_shared_reads_small / total_shared_reads + > thresholds["small_requests"][0] + and total_shared_reads_small > thresholds["small_requests_absolute"][0] + ): + thresholds["small_requests"][1] = True + thresholds["small_requests_absolute"][1] = True + issue = "Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests".format( + total_shared_reads_small, + total_shared_reads_small / total_shared_reads * 100.0, ) detail = [] for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_READS'] > (total_shared_reads * thresholds['small_requests'][0] / 2): + if row["INSIGHTS_POSIX_SMALL_READS"] > ( + total_shared_reads * thresholds["small_requests"][0] / 2 + ): detail.append( { - 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( - row['INSIGHTS_POSIX_SMALL_READS'], - row['INSIGHTS_POSIX_SMALL_READS'] / total_shared_reads * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({:.2f}%) small read requests are to "{}"'.format( + row["INSIGHTS_POSIX_SMALL_READS"], + row["INSIGHTS_POSIX_SMALL_READS"] + / total_shared_reads + * 100.0, + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) recommendation = [ { - 'message': 'Consider coalesceing read requests into larger more contiguous ones using MPI-IO collective operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + "message": "Consider coalesceing read requests into larger more contiguous ones using MPI-IO collective operations", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-read.c"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) - if total_shared_writes and total_shared_writes_small / total_shared_writes > thresholds['small_requests'][0] and total_shared_writes_small > thresholds['small_requests_absolute'][0]: - thresholds['small_requests'][1] = True - thresholds['small_requests_absolute'][1] = True - issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format( - total_shared_writes_small, total_shared_writes_small / total_shared_writes * 100.0 + if ( + total_shared_writes + and total_shared_writes_small / total_shared_writes + > thresholds["small_requests"][0] + and total_shared_writes_small > thresholds["small_requests_absolute"][0] + ): + thresholds["small_requests"][1] = True + thresholds["small_requests_absolute"][1] = True + issue = "Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of 
all shared file write requests".format( + total_shared_writes_small, + total_shared_writes_small / total_shared_writes * 100.0, ) detail = [] for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_WRITES'] > (total_shared_writes * thresholds['small_requests'][0] / 2): + if row["INSIGHTS_POSIX_SMALL_WRITES"] > ( + total_shared_writes * thresholds["small_requests"][0] / 2 + ): detail.append( { - 'message': '{} ({:.2f}%) small writes requests are to "{}"'.format( - row['INSIGHTS_POSIX_SMALL_WRITES'], - row['INSIGHTS_POSIX_SMALL_WRITES'] / total_shared_writes * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({:.2f}%) small writes requests are to "{}"'.format( + row["INSIGHTS_POSIX_SMALL_WRITES"], + row["INSIGHTS_POSIX_SMALL_WRITES"] + / total_shared_writes + * 100.0, + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) recommendation = [ { - 'message': 'Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + "message": "Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-write.c"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) def check_long_metadata(count_long_metadata, modules): - ''' + """ Check how many ranks have metadata operations taking too long Parameters: count_long_metadata: number of ranks that have metadata operations taking too long modules: all different mudules been used in the application - ''' + """ if count_long_metadata > 0: - thresholds['metadata_time_rank'][1] = True - issue = 'There are {} ranks where metadata operations take over {} seconds'.format( - count_long_metadata, thresholds['metadata_time_rank'][0] + thresholds["metadata_time_rank"][1] = True + issue = ( + "There are {} ranks where metadata operations take over {} seconds".format( + count_long_metadata, thresholds["metadata_time_rank"][0] + ) ) recommendation = [ { - 'message': 'Attempt to combine files, reduce, or cache metadata operations' + "message": "Attempt to combine files, reduce, or cache metadata operations" } ] - if 'HF5' in modules: + if "HF5" in modules: recommendation.append( { - 'message': 'Since your appplication uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-collective-metadata.c'), line_numbers=True, background_color='default') + "message": "Since your appplication uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/hdf5-collective-metadata.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'Since your appplication uses HDF5, try using metadata cache to defer metadata operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 
'snippets/hdf5-cache.c'), line_numbers=True, background_color='default') - } + "message": "Since your appplication uses HDF5, try using metadata cache to defer metadata operations", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/hdf5-cache.c"), + line_numbers=True, + background_color="default", + ), + }, ) insights_metadata.append( - message(INSIGHTS_POSIX_HIGH_METADATA_TIME, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_HIGH_METADATA_TIME, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) -def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None): - ''' +def check_shared_data_imblance( + stragglers_count, + detected_files, + file_map, + dxt_posix=None, + dxt_posix_read_data=None, + dxt_posix_write_data=None, +): + """ Check how many shared files containing data transfer imbalance Parameters: @@ -953,11 +1334,11 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_p data imbalance per file required columns: ['id', 'data_imbalance'] file_map: file id and file name pairing - ''' + """ if stragglers_count: - thresholds['imbalance_stragglers'][1] = True - issue = 'Detected data transfer imbalance caused by stragglers when accessing {} shared file.'.format( + thresholds["imbalance_stragglers"][1] = True + issue = "Detected data transfer imbalance caused by stragglers when accessing {} shared file.".format( stragglers_count ) @@ -968,52 +1349,73 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_p for index, row in detected_files.iterrows(): detail.append( { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - row['data_imbalance'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row["data_imbalance"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df_1 = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == int(row['id'])] - temp_df_2 = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == int(row['id'])] - - df_merged = pd.concat([temp_df_1, temp_df_2], ignore_index=True, sort=False) - df_merged['duration'] = df_merged['end_time'] - df_merged['start_time'] - df_merged.sort_values('duration', ascending=True, inplace=True) + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df_1 = dxt_posix_write_data.loc[ + dxt_posix_write_data["id"] == int(row["id"]) + ] + temp_df_2 = dxt_posix_read_data.loc[ + dxt_posix_read_data["id"] == int(row["id"]) + ] + + df_merged = pd.concat( + [temp_df_1, temp_df_2], ignore_index=True, sort=False + ) + df_merged["duration"] = ( + df_merged["end_time"] - df_merged["start_time"] + ) + df_merged.sort_values("duration", ascending=True, inplace=True) df_merged = df_merged.iloc[0] - rank_df = temp.loc[(temp['rank'] == int(df_merged['rank']))] - - if df_merged['operation'] == 'write': - rank_df = rank_df['write_segments'].iloc[0] - stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + rank_df = temp.loc[(temp["rank"] == int(df_merged["rank"]))] + + if 
df_merged["operation"] == "write": + rank_df = rank_df["write_segments"].iloc[0] + stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[ + 0 + ] + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] else: - rank_df = rank_df['read_segments'].iloc[0] - stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + rank_df = rank_df["read_segments"].iloc[0] + stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[ + 0 + ] + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] detail.append( { - 'message': 'The backtrace information for these imbalanced call(s) is given below:' + "message": "The backtrace information for these imbalanced call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) @@ -1021,69 +1423,94 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_p else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) - + end = time.time() time_taken = end - start dxt_trigger_time += time_taken - - if dxt_trigger_time > 0: + + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } + "message": "Consider tuning how your data is distributed in the file system by changing the stripe size and count", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), + }, ] insights_operation.append( - message(INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_SIZE_IMBALANCE, + TARGET_USER, + HIGH, + issue, + recommendation, + detail, + ) ) -def check_shared_data_imblance_split(slowest_rank_bytes, fastest_rank_bytes, total_transfer_size): - ''' +def check_shared_data_imblance_split( + slowest_rank_bytes, fastest_rank_bytes, total_transfer_size +): + """ Check whether the specific shared file contains data imbalance Parameters: slowest_rank_bytes: the 
total request size of the rank that takes the longest data operation time fastest_rank_bytes: the total request size of the rank that takes the shortest data operation time total_transfer_size: total request size of that specific shared file - ''' - - if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > thresholds['imbalance_stragglers'][0]: - thresholds['imbalance_stragglers'][1] = True - issue = 'Load imbalance of {:.2f}% detected'.format( + """ + + if ( + total_transfer_size + and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size + > thresholds["imbalance_stragglers"][0] + ): + thresholds["imbalance_stragglers"][1] = True + issue = "Load imbalance of {:.2f}% detected".format( abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } + "message": "Consider tuning how your data is distributed in the file system by changing the stripe size and count", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), + }, ] insights_operation.append( - message(INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation + ) ) def check_shared_time_imbalance(stragglers_count, detected_files, file_map): - ''' + """ Check how many shared files containing time transfer imbalance Parameters: @@ -1092,74 +1519,101 @@ def check_shared_time_imbalance(stragglers_count, detected_files, file_map): data imbalance per file required columns: ['id', 'time_imbalance'] file_map: file id and file name pairing - ''' + """ if stragglers_count: - thresholds['imbalance_stragglers'][1] = True - issue = 'Detected time imbalance caused by stragglers when accessing {} shared file.'.format( + thresholds["imbalance_stragglers"][1] = True + issue = "Detected time imbalance caused by stragglers when accessing {} shared file.".format( stragglers_count ) detail = [] - + for index, row in detected_files.iterrows(): detail.append( { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - row['time_imbalance'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row["time_imbalance"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) recommendation = [ { - 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give + "message": "Consider better distributing the data in the parallel file system" # needs to review what suggestion to give }, { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } + "message": "Consider tuning how your data is distributed in the file system by 
changing the stripe size and count", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), + }, ] insights_operation.append( - message(INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_TIME_IMBALANCE, + TARGET_USER, + HIGH, + issue, + recommendation, + detail, + ) ) -def check_shared_time_imbalance_split(slowest_rank_time, fastest_rank_time, total_transfer_time): - ''' +def check_shared_time_imbalance_split( + slowest_rank_time, fastest_rank_time, total_transfer_time +): + """ Check whether the specific shared file contains time imbalance Parameters: slowest_rank_bytes: the total request time of the rank that takes the longest data operation time fastest_rank_bytes: the total request time of the rank that takes the shortest data operation time total_transfer_size: total request time of that specific shared file - ''' - - if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > thresholds['imbalance_stragglers'][0]: - thresholds['imbalance_stragglers'][1] = True - issue = 'Load imbalance of {:.2f}% detected'.format( + """ + + if ( + total_transfer_time + and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time + > thresholds["imbalance_stragglers"][0] + ): + thresholds["imbalance_stragglers"][1] = True + issue = "Load imbalance of {:.2f}% detected".format( abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 ) recommendation = [ { - 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give + "message": "Consider better distributing the data in the parallel file system" # needs to review what suggestion to give }, { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } + "message": "Consider tuning how your data is distributed in the file system by changing the stripe size and count", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), + }, ] insights_operation.append( - message(INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation + ) ) -def check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_write_data=None): - ''' +def check_individual_write_imbalance( + imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_write_data=None +): + """ Check how many write imbalance when accessing individual files Parameters: @@ -1167,57 +1621,62 @@ def check_individual_write_imbalance(imbalance_count, detected_files, file_map, detected_files: write imbalance per file required columns: ['id', 'write_imbalance'] - ''' + """ if imbalance_count: - thresholds['imbalance_size'][1] = True - issue = 'Detected write imbalance when accessing {} individual files'.format( + thresholds["imbalance_size"][1] = True + issue = "Detected write imbalance when accessing {} individual files".format( imbalance_count ) detail = [] file_count = 0 dxt_trigger_time = 0 - + for index, row in detected_files.iterrows(): detail.append( { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - 
row['write_imbalance'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row["write_imbalance"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == int(row['id'])] - - maxClm = temp_df['length'].max() - temp_df = temp_df.loc[(temp_df['length'] == maxClm)] - rank_df = temp.loc[(temp['rank'] == int(temp_df['rank'].iloc[0]))] - - rank_df = rank_df['write_segments'].iloc[0] - stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df = dxt_posix_write_data.loc[ + dxt_posix_write_data["id"] == int(row["id"]) + ] + + maxClm = temp_df["length"].max() + temp_df = temp_df.loc[(temp_df["length"] == maxClm)] + rank_df = temp.loc[(temp["rank"] == int(temp_df["rank"].iloc[0]))] + + rank_df = rank_df["write_segments"].iloc[0] + stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[0] + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(res) + ] detail.append( { - 'message': 'The backtrace information for these imbalanced write call(s) is given below:' + "message": "The backtrace information for these imbalanced write call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) @@ -1225,82 +1684,119 @@ def check_individual_write_imbalance(imbalance_count, detected_files, file_map, else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } - ) - + ) + end = time.time() time_taken = end - start - dxt_trigger_time += time_taken - - if dxt_trigger_time > 0: + dxt_trigger_time += time_taken + + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider tuning the stripe size and count to better distribute the data", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If 
the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + "message": "If the application uses netCDF and HDF5 double-check the need to set NO_FILL values", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } + "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives" + }, ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) def check_individual_write_imbalance_split(max_bytes_written, min_bytes_written): - ''' + """ Check whether there is write imbalance in the specific individual file Parameters: max_bytes_written: max byte written in the file min_bytes_written: minimum byte written in the file - ''' - - if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > thresholds['imbalance_size'][0]: - thresholds['imbalance_size'][1] = True - issue = 'Load imbalance of {:.2f}% detected'.format( - abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 + """ + + if ( + max_bytes_written + and abs(max_bytes_written - min_bytes_written) / max_bytes_written + > thresholds["imbalance_size"][0] + ): + thresholds["imbalance_size"][1] = True + issue = "Load imbalance of {:.2f}% detected".format( + abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider tuning the stripe size and count to better distribute the data", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + "message": "If the application uses netCDF and HDF5 double-check the need to set NO_FILL values", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } + "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives" + }, ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) -def check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_read_data=None): - ''' +def 
check_individual_read_imbalance( + imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_read_data=None +): + """ Check how many read imbalance when accessing individual files Parameters: @@ -1308,57 +1804,62 @@ def check_individual_read_imbalance(imbalance_count, detected_files, file_map, d detected_files: read imbalance per file required columns: ['id', 'read_imbalance'] - ''' + """ if imbalance_count: - thresholds['imbalance_size'][1] = True - issue = 'Detected read imbalance when accessing {} individual files.'.format( + thresholds["imbalance_size"][1] = True + issue = "Detected read imbalance when accessing {} individual files.".format( imbalance_count ) detail = [] file_count = 0 dxt_trigger_time = 0 - + for index, row in detected_files.iterrows(): detail.append( { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - row['read_imbalance'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row["read_imbalance"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == int(row['id'])] - - maxClm = temp_df['length'].max() - temp_df = temp_df.loc[(temp_df['length'] == maxClm)] - rank_df = temp.loc[(temp['rank'] == int(temp_df['rank'].iloc[0]))] - - rank_df = rank_df['read_segments'].iloc[0] - stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df = dxt_posix_read_data.loc[ + dxt_posix_read_data["id"] == int(row["id"]) + ] + + maxClm = temp_df["length"].max() + temp_df = temp_df.loc[(temp_df["length"] == maxClm)] + rank_df = temp.loc[(temp["rank"] == int(temp_df["rank"].iloc[0]))] + + rank_df = rank_df["read_segments"].iloc[0] + stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[0] + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(res) + ] detail.append( { - 'message': 'The backtrace information for these imbalanced read call(s) is given below:' + "message": "The backtrace information for these imbalanced read call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) @@ -1366,84 +1867,126 @@ def check_individual_read_imbalance(imbalance_count, detected_files, file_map, d else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) end = time.time() time_taken = end - start dxt_trigger_time += time_taken - if dxt_trigger_time > 0: + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this 
trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider tuning the stripe size and count to better distribute the data", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + "message": "If the application uses netCDF and HDF5 double-check the need to set NO_FILL values", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } + "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives" + }, ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) def check_individual_read_imbalance_split(max_bytes_read, min_bytes_read): - ''' + """ Check whether there is read imbalance in the specific individual file Parameters: max_bytes_written: max byte read in the file min_bytes_written: minimum byte read in the file - ''' - - if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > thresholds['imbalance_size'][0]: - thresholds['imbalance_size'][1] = True - issue = 'Load imbalance of {:.2f}% detected'.format( - abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 + """ + + if ( + max_bytes_read + and abs(max_bytes_read - min_bytes_read) / max_bytes_read + > thresholds["imbalance_size"][0] + ): + thresholds["imbalance_size"][1] = True + issue = "Load imbalance of {:.2f}% detected".format( + abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider tuning the stripe size and count to better distribute the data", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + "message": "If the application uses netCDF and HDF5 double-check the need to set 
NO_FILL values", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } + "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives" + }, ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) # MPIIO level check -def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio=None): - ''' +def check_mpi_collective_read_operation( + mpiio_coll_reads, + mpiio_indep_reads, + total_mpiio_read_operations, + detected_files, + file_map, + dxt_mpiio=None, +): + """ Check whether application uses collective mpi read calls Parameters: @@ -1454,14 +1997,17 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot independent read operations and percentage per file required columns: ['id', 'absolute_indep_reads', 'percent_indep_reads'] file_map: file id and file name pairing - ''' + """ if mpiio_coll_reads == 0: - if total_mpiio_read_operations and total_mpiio_read_operations > thresholds['collective_operations_absolute'][0]: - thresholds['collective_operations_absolute'][1] = True - issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( - mpiio_indep_reads, - mpiio_indep_reads / total_mpiio_read_operations * 100 + if ( + total_mpiio_read_operations + and total_mpiio_read_operations + > thresholds["collective_operations_absolute"][0] + ): + thresholds["collective_operations_absolute"][1] = True + issue = "Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls".format( + mpiio_indep_reads, mpiio_indep_reads / total_mpiio_read_operations * 100 ) detail = [] @@ -1471,63 +2017,80 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot for index, row in detected_files.iterrows(): detail.append( { - 'message': '{} ({}%) of independent reads to "{}"'.format( - row['absolute_indep_reads'], - row['percent_indep_reads'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({}%) of independent reads to "{}"'.format( + row["absolute_indep_reads"], + row["percent_indep_reads"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - temp = dxt_mpiio.loc[(dxt_mpiio['id'] == int(row['id'])) & (dxt_mpiio['rank'] == 1)] - temp = temp['read_segments'].iloc[0] - stack_memory_addresses = temp['stack_memory_addresses'].iloc[0] - address = dxt_mpiio.iloc[0]['address_line_mapping']['address'] + temp = dxt_mpiio.loc[ + (dxt_mpiio["id"] == int(row["id"])) & (dxt_mpiio["rank"] == 1) + ] + temp = temp["read_segments"].iloc[0] + stack_memory_addresses = temp["stack_memory_addresses"].iloc[0] + address = dxt_mpiio.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_mpiio.iloc[0]['address_line_mapping'].loc[dxt_mpiio.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = 
dxt_mpiio.iloc[0]["address_line_mapping"].loc[ + dxt_mpiio.iloc[0]["address_line_mapping"]["address"].isin(res) + ] detail.append( { - 'message': 'The backtrace information for these read call(s) is given below:' + "message": "The backtrace information for these read call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) - + end = time.time() time_taken = end - start dxt_trigger_time += time_taken - - if dxt_trigger_time > 0: + + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + "message": "Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-read.c"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) else: - issue = 'Application uses MPI-IO and read data using {} ({:.2f}%) collective operations'.format( - mpiio_coll_reads, - mpiio_coll_reads / total_mpiio_read_operations * 100 + issue = "Application uses MPI-IO and read data using {} ({:.2f}%) collective operations".format( + mpiio_coll_reads, mpiio_coll_reads / total_mpiio_read_operations * 100 ) insights_operation.append( @@ -1535,8 +2098,15 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot ) -def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio=None): - ''' +def check_mpi_collective_write_operation( + mpiio_coll_writes, + mpiio_indep_writes, + total_mpiio_write_operations, + detected_files, + file_map, + dxt_mpiio=None, +): + """ Check whether application uses collective mpi write calls Parameters: @@ -1547,14 +2117,18 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, independent write operations and percentage per file required columns: ['id', 'absolute_indep_writes', 'percent_indep_writes'] file_map: file id and file name pairing - ''' + """ if mpiio_coll_writes == 0: - if total_mpiio_write_operations and total_mpiio_write_operations > thresholds['collective_operations_absolute'][0]: - thresholds['collective_operations_absolute'][1] = True - issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( + if ( + total_mpiio_write_operations + and total_mpiio_write_operations + > thresholds["collective_operations_absolute"][0] + ): + thresholds["collective_operations_absolute"][1] = True + issue = "Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write 
calls".format( mpiio_indep_writes, - mpiio_indep_writes / total_mpiio_write_operations * 100 + mpiio_indep_writes / total_mpiio_write_operations * 100, ) detail = [] @@ -1564,62 +2138,79 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, for index, row in detected_files.iterrows(): detail.append( { - 'message': '{} ({}%) independent writes to "{}"'.format( - row['absolute_indep_writes'], - row['percent_indep_writes'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({}%) independent writes to "{}"'.format( + row["absolute_indep_writes"], + row["percent_indep_writes"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - temp = dxt_mpiio.loc[(dxt_mpiio['id'] == int(row['id'])) & (dxt_mpiio['rank'] == 1)] - temp = temp['write_segments'].iloc[0] - stack_memory_addresses = temp['stack_memory_addresses'].iloc[0] - address = dxt_mpiio.iloc[0]['address_line_mapping']['address'] + temp = dxt_mpiio.loc[ + (dxt_mpiio["id"] == int(row["id"])) & (dxt_mpiio["rank"] == 1) + ] + temp = temp["write_segments"].iloc[0] + stack_memory_addresses = temp["stack_memory_addresses"].iloc[0] + address = dxt_mpiio.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_mpiio.iloc[0]['address_line_mapping'].loc[dxt_mpiio.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_mpiio.iloc[0]["address_line_mapping"].loc[ + dxt_mpiio.iloc[0]["address_line_mapping"]["address"].isin(res) + ] detail.append( { - 'message': 'The backtrace information for these write call(s) is given below:' + "message": "The backtrace information for these write call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) end = time.time() time_taken = end - start dxt_trigger_time += time_taken - + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Use collective write operations (e.g. MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + "message": "Use collective write operations (e.g. 
MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-write.c"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) else: - issue = 'Application uses MPI-IO and write data using {} ({:.2f}%) collective operations'.format( - mpiio_coll_writes, - mpiio_coll_writes / total_mpiio_write_operations * 100 + issue = "Application uses MPI-IO and write data using {} ({:.2f}%) collective operations".format( + mpiio_coll_writes, mpiio_coll_writes / total_mpiio_write_operations * 100 ) insights_operation.append( @@ -1627,8 +2218,10 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, ) -def check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules): - ''' +def check_mpi_none_block_operation( + mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules +): + """ Check whether application can benefit from non-blocking requests Parameters: @@ -1636,93 +2229,131 @@ def check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_ext mpiio_nb_writes: number of non-blocking mpi write operations has_hdf5_extension: boolean value of whether the file in in hdf5 extension modules: all different mudules been used in the application - ''' + """ if mpiio_nb_reads == 0: - issue = 'Application could benefit from non-blocking (asynchronous) reads' + issue = "Application could benefit from non-blocking (asynchronous) reads" recommendation = [] - if 'H5F' in modules or has_hdf5_extension: + if "H5F" in modules or has_hdf5_extension: recommendation.append( { - 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-read.c'), line_numbers=True, background_color='default') + "message": "Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/hdf5-vol-async-read.c"), + line_numbers=True, + background_color="default", + ), } ) - if 'MPI-IO' in modules: + if "MPI-IO" in modules: recommendation.append( { - 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iread.c'), line_numbers=True, background_color='default') + "message": "Since you use MPI-IO, consider non-blocking/asynchronous I/O operations", # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())', + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-iread.c"), + line_numbers=True, + background_color="default", + ), } ) insights_operation.append( - message(INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + message( + INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, + TARGET_DEVELOPER, + WARN, + issue, + recommendation, + ) ) if mpiio_nb_writes == 0: - issue = 'Application could benefit from non-blocking (asynchronous) writes' + issue = "Application could benefit from non-blocking (asynchronous) writes" 
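        # The recommendations below point at the HDF5 ASYNC I/O VOL and MPI-IO
        # non-blocking snippets; as a rough sketch (an assumption, not taken from the
        # shipped snippet), a blocking MPI_File_write_all() call would become
        # MPI_File_iwrite_all() followed later by MPI_Wait() on the returned request,
        # overlapping the pending write with computation.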
recommendation = [] - if 'H5F' in modules or has_hdf5_extension: + if "H5F" in modules or has_hdf5_extension: recommendation.append( { - 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-write.c'), line_numbers=True, background_color='default') + "message": "Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/hdf5-vol-async-write.c"), + line_numbers=True, + background_color="default", + ), } ) - if 'MPI-IO' in modules: + if "MPI-IO" in modules: recommendation.append( { - 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iwrite.c'), line_numbers=True, background_color='default') + "message": "Since you use MPI-IO, consider non-blocking/asynchronous I/O operations", # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())', + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-iwrite.c"), + line_numbers=True, + background_color="default", + ), } ) insights_operation.append( - message(INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + message( + INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, + TARGET_DEVELOPER, + WARN, + issue, + recommendation, + ) ) def check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES): - ''' + """ Check whether application has used inter-node aggregators Parameters: - cb_nodes: + cb_nodes: NUMBER_OF_COMPUTE_NODES: - ''' + """ if cb_nodes > NUMBER_OF_COMPUTE_NODES: - issue = 'Application is using inter-node aggregators (which require network communication)' + issue = "Application is using inter-node aggregators (which require network communication)" recommendation = [ { - 'message': 'Set the MPI hints for the number of aggregators as one per compute node (e.g., cb_nodes={})'.format( + "message": "Set the MPI hints for the number of aggregators as one per compute node (e.g., cb_nodes={})".format( NUMBER_OF_COMPUTE_NODES ), - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-hints.bash'), line_numbers=True, background_color='default') + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-hints.bash"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_MPI_IO_AGGREGATORS_INTER, TARGET_USER, HIGH, issue, recommendation) + message( + INSIGHTS_MPI_IO_AGGREGATORS_INTER, + TARGET_USER, + HIGH, + issue, + recommendation, + ) ) if cb_nodes < NUMBER_OF_COMPUTE_NODES: - issue = 'Application is using intra-node aggregators' + issue = "Application is using intra-node aggregators" insights_operation.append( message(INSIGHTS_MPI_IO_AGGREGATORS_INTRA, TARGET_USER, OK, issue) ) if cb_nodes == NUMBER_OF_COMPUTE_NODES: - issue = 'Application is using one aggregator per compute node' + issue = "Application is using one aggregator per compute node" insights_operation.append( message(INSIGHTS_MPI_IO_AGGREGATORS_OK, TARGET_USER, OK, issue) @@ -1731,65 +2362,75 @@ def check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES): # Layout and export + def display_content(console): if insights_metadata: console.print( Panel( - Padding( - Group( - *insights_metadata - ), - (1, 1) - ), - 
title='METADATA', - title_align='left' + Padding(Group(*insights_metadata), (1, 1)), + title="METADATA", + title_align="left", ) ) if insights_operation: console.print( Panel( - Padding( - Group( - *insights_operation - ), - (1, 1) - ), - title='OPERATIONS', - title_align='left' + Padding(Group(*insights_operation), (1, 1)), + title="OPERATIONS", + title_align="left", ) ) if insights_dxt: console.print( Panel( - Padding( - Group( - *insights_dxt - ), - (1, 1) - ), - title='DXT', - title_align='left' + Padding(Group(*insights_dxt), (1, 1)), title="DXT", title_align="left" ) ) def display_thresholds(console): tholdMessage = { - 'imbalance_operations': 'Minimum imbalance requests ratio: [white]{}%[/white]'.format(thresholds['imbalance_operations'][0] * 100), - 'small_bytes': 'Minimum size of a small request: [white]{} bytes[/white]'.format(thresholds['small_bytes'][0]), - 'small_requests': 'Maximum small requests ratio: [white]{}%[/white]'.format(thresholds['small_requests'][0] * 100), - 'small_requests_absolute': 'Maximum small requests: [white]{}[/white]'.format(thresholds['small_requests_absolute'][0]), - 'misaligned_requests': 'Maximum misaligned requests ratio: [white]{}%[/white]'.format(thresholds['misaligned_requests'][0] * 100), - 'random_operations': 'Maximum random request ratio: [white]{}%[/white]'.format(thresholds['random_operations'][0] * 100), - 'random_operations_absolute': 'Maximum random requests: [white]{}[/white]'.format(thresholds['random_operations_absolute'][0]), - 'metadata_time_rank': 'Maximum metadata process time per rank: [white]{} seconds[/white]'.format(thresholds['metadata_time_rank'][0]), - 'imbalance_size': 'Maximum read/write size difference ratio: [white]{}%[/white]'.format(thresholds['imbalance_size'][0] * 100), - 'imbalance_stragglers': 'Maximum ratio difference among ranks: [white]{}%[/white]'.format(thresholds['imbalance_stragglers'][0] * 100), - 'interface_stdio': 'Maximum STDIO usage ratio: [white]{}%[/white]'.format(thresholds['interface_stdio'][0] * 100), - 'collective_operations': 'Minimum MPI collective operation usage ratio: [white]{}%[/white]'.format(thresholds['collective_operations'][0] * 100), - 'collective_operations_absolute': 'Minimum MPI collective operations: [white]{}[/white]'.format(thresholds['collective_operations_absolute'][0]), + "imbalance_operations": "Minimum imbalance requests ratio: [white]{}%[/white]".format( + thresholds["imbalance_operations"][0] * 100 + ), + "small_bytes": "Minimum size of a small request: [white]{} bytes[/white]".format( + thresholds["small_bytes"][0] + ), + "small_requests": "Maximum small requests ratio: [white]{}%[/white]".format( + thresholds["small_requests"][0] * 100 + ), + "small_requests_absolute": "Maximum small requests: [white]{}[/white]".format( + thresholds["small_requests_absolute"][0] + ), + "misaligned_requests": "Maximum misaligned requests ratio: [white]{}%[/white]".format( + thresholds["misaligned_requests"][0] * 100 + ), + "random_operations": "Maximum random request ratio: [white]{}%[/white]".format( + thresholds["random_operations"][0] * 100 + ), + "random_operations_absolute": "Maximum random requests: [white]{}[/white]".format( + thresholds["random_operations_absolute"][0] + ), + "metadata_time_rank": "Maximum metadata process time per rank: [white]{} seconds[/white]".format( + thresholds["metadata_time_rank"][0] + ), + "imbalance_size": "Maximum read/write size difference ratio: [white]{}%[/white]".format( + thresholds["imbalance_size"][0] * 100 + ), + "imbalance_stragglers": 
"Maximum ratio difference among ranks: [white]{}%[/white]".format( + thresholds["imbalance_stragglers"][0] * 100 + ), + "interface_stdio": "Maximum STDIO usage ratio: [white]{}%[/white]".format( + thresholds["interface_stdio"][0] * 100 + ), + "collective_operations": "Minimum MPI collective operation usage ratio: [white]{}%[/white]".format( + thresholds["collective_operations"][0] * 100 + ), + "collective_operations_absolute": "Minimum MPI collective operations: [white]{}[/white]".format( + thresholds["collective_operations_absolute"][0] + ), } toBeAppend = [] @@ -1802,24 +2443,19 @@ def display_thresholds(console): toBeAppend.append(message) console.print( - Panel( - '\n'.join(toBeAppend), - title='THRESHOLDS', - title_align='left', - padding=1 - ) + Panel("\n".join(toBeAppend), title="THRESHOLDS", title_align="left", padding=1) ) def display_footer(console, insights_start_time, insights_end_time): console.print( Panel( - ' {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds'.format( + " {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds".format( datetime.datetime.now().year, datetime.datetime.now(), - insights_end_time - insights_start_time + insights_end_time - insights_start_time, ), - box=box.SIMPLE + box=box.SIMPLE, ) ) @@ -1828,37 +2464,28 @@ def export_html(console, export_dir, trace_name): if not args.export_html: return - os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists + os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists filepath = os.path.join(export_dir, f"{trace_name}.html") - console.save_html( - filepath, - theme=set_export_theme(), - clear=False - ) + console.save_html(filepath, theme=set_export_theme(), clear=False) def export_svg(console, export_dir, trace_name): if not args.export_svg: return - - os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists + + os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists filepath = os.path.join(export_dir, f"{trace_name}.svg") - console.save_svg( - filepath, - title='Drishti', - theme=set_export_theme(), - clear=False - ) + console.save_svg(filepath, title="Drishti", theme=set_export_theme(), clear=False) def export_csv(export_dir, trace_name, jobid=None): if not args.export_csv: return - + issues = [ - 'JOB', + "JOB", INSIGHTS_STDIO_HIGH_USAGE, INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, INSIGHTS_POSIX_READ_COUNT_INTENSIVE, @@ -1890,23 +2517,21 @@ def export_csv(export_dir, trace_name, jobid=None): INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, INSIGHTS_MPI_IO_AGGREGATORS_INTRA, INSIGHTS_MPI_IO_AGGREGATORS_INTER, - INSIGHTS_MPI_IO_AGGREGATORS_OK + INSIGHTS_MPI_IO_AGGREGATORS_OK, ] if codes: issues.extend(codes) detected_issues = dict.fromkeys(issues, False) - detected_issues['JOB'] = jobid + detected_issues["JOB"] = jobid for report in csv_report: detected_issues[report] = True - - os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists + os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists filepath = os.path.join(export_dir, f"{trace_name}.csv") - with open(filepath, 'w') as f: + with open(filepath, "w") as f: w = csv.writer(f) w.writerow(detected_issues.keys()) w.writerow(detected_issues.values()) - From 478612a3ed3a9ea0c4368f5d8bdb6190e07587a7 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 17 Mar 2025 23:22:20 -0600 Subject: [PATCH 02/43] chore: Explicitly define imports from `drishti.includes.config` --- drishti/includes/module.py | 27 
++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drishti/includes/module.py b/drishti/includes/module.py index c0d91ef..e7f70d6 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -1,12 +1,37 @@ #!/usr/bin/env python3 -import datetime import csv +import datetime +import os import time + import pandas as pd from rich import box +from rich.console import Group +from rich.padding import Padding +from rich.panel import Panel from rich.syntax import Syntax + from drishti.includes.config import * +from drishti.includes.config import ( + HIGH, + INFO, + OK, + ROOT, + TARGET_DEVELOPER, + TARGET_USER, + WARN, + codes, + convert_bytes, + csv_report, + insights_dxt, + insights_metadata, + insights_operation, + message, + set_export_theme, + thresholds, +) +from drishti.includes.parser import args """ Before calling the functions below From 23e64e20a796d28ecef27ea2b699aed14a8b5e57 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 17 Mar 2025 23:27:01 -0600 Subject: [PATCH 03/43] chore: Add PyCharm configuration files --- .idea/.gitignore | 8 ++++++++ .idea/drishti-io.iml | 14 ++++++++++++++ .idea/inspectionProfiles/profiles_settings.xml | 6 ++++++ .idea/misc.xml | 4 ++++ .idea/modules.xml | 8 ++++++++ .idea/vcs.xml | 6 ++++++ 6 files changed, 46 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/drishti-io.iml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/drishti-io.iml b/.idea/drishti-io.iml new file mode 100644 index 0000000..7b26d7f --- /dev/null +++ b/.idea/drishti-io.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..1d40550 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..c4fcf4c --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file From 4a83149f5006d5bae38f8b3821e0dd0a47db0d6c Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 24 Mar 2025 11:51:19 -0600 Subject: [PATCH 04/43] feat: Update log path argument to support multiple inputs and enhance log type checking --- drishti/includes/parser.py | 120 +++++++++++++++++-------------------- drishti/reporter.py | 62 +++++++++++++------ 2 files changed, 101 insertions(+), 81 deletions(-) diff --git a/drishti/includes/parser.py b/drishti/includes/parser.py index 28dcd63..ed58b1d 100644 --- a/drishti/includes/parser.py +++ b/drishti/includes/parser.py @@ -1,128 +1,120 @@ import argparse 
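# A single positional argument collects one or more inputs (Darshan .darshan
# files or Recorder folders); the remaining flags only toggle report content
# and export formats.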
-parser = argparse.ArgumentParser( - description='Drishti: ' -) +parser = argparse.ArgumentParser(description="Drishti: ") parser.add_argument( - 'log_path', - help='Input .darshan file or recorder folder' + "log_paths", nargs="+", help="Input .darshan file or recorder folder" ) parser.add_argument( - '--issues', + "--issues", default=False, - action='store_true', - dest='only_issues', - help='Only displays the detected issues and hides the recommendations' + action="store_true", + dest="only_issues", + help="Only displays the detected issues and hides the recommendations", ) parser.add_argument( - '--html', + "--html", default=False, - action='store_true', - dest='export_html', - help='Export the report as an HTML page' + action="store_true", + dest="export_html", + help="Export the report as an HTML page", ) parser.add_argument( - '--svg', + "--svg", default=False, - action='store_true', - dest='export_svg', - help='Export the report as an SVG image' + action="store_true", + dest="export_svg", + help="Export the report as an SVG image", ) parser.add_argument( - '--light', + "--light", default=False, - action='store_true', - dest='export_theme_light', - help='Use a light theme for the report when generating files' + action="store_true", + dest="export_theme_light", + help="Use a light theme for the report when generating files", ) parser.add_argument( - '--size', + "--size", default=False, - dest='export_size', - help='Console width used for the report and generated files' + dest="export_size", + help="Console width used for the report and generated files", ) parser.add_argument( - '--verbose', + "--verbose", default=False, - action='store_true', - dest='verbose', - help='Display extended details for the recommendations' + action="store_true", + dest="verbose", + help="Display extended details for the recommendations", ) parser.add_argument( - '--threshold', + "--threshold", default=False, - action='store_true', - dest='thold', - help='Display all thresholds used for the report' + action="store_true", + dest="thold", + help="Display all thresholds used for the report", ) parser.add_argument( - '--code', + "--code", default=False, - action='store_true', - dest='code', - help='Display insights identification code' + action="store_true", + dest="code", + help="Display insights identification code", ) parser.add_argument( - '--backtrace', + "--backtrace", default=False, - action='store_true', - dest='backtrace', - help='Enable DXT insights and backtrace' + action="store_true", + dest="backtrace", + help="Enable DXT insights and backtrace", ) parser.add_argument( - '--path', + "--path", default=False, - action='store_true', - dest='full_path', - help='Display the full file path for the files that triggered the issue' + action="store_true", + dest="full_path", + help="Display the full file path for the files that triggered the issue", ) parser.add_argument( - '--csv', + "--csv", default=False, - action='store_true', - dest='export_csv', - help='Export a CSV with the code of all issues that were triggered' + action="store_true", + dest="export_csv", + help="Export a CSV with the code of all issues that were triggered", ) parser.add_argument( - '--export_dir', + "--export_dir", default="", - dest='export_dir', - help='Specify the directory prefix for the output files (if any)' + dest="export_dir", + help="Specify the directory prefix for the output files (if any)", ) -parser.add_argument( - '--json', - default=False, - dest='json', - help=argparse.SUPPRESS -) +parser.add_argument("--json", 
default=False, dest="json", help=argparse.SUPPRESS) parser.add_argument( - '--split', + "--split", default=False, - action='store_true', - dest='split_files', - help='Split the files and generate report for each file' + action="store_true", + dest="split_files", + help="Split the files and generate report for each file", ) parser.add_argument( - '--config', + "--config", default=False, - dest='config', - help='Enable thresholds read from json file' + dest="config", + help="Enable thresholds read from json file", ) args = parser.parse_args() diff --git a/drishti/reporter.py b/drishti/reporter.py index 8455040..a6a8401 100644 --- a/drishti/reporter.py +++ b/drishti/reporter.py @@ -3,10 +3,12 @@ import os import sys from subprocess import call -from drishti.includes.parser import * +from typing import List, Optional +# from includes.parser import * # imports {'parser', 'args', 'argparse'} +from drishti.includes.parser import args -''' +""" |- handler_darshan -| | | reporter -> /handlers -> |- handler_recorder -| -| @@ -15,8 +17,7 @@ ________________________________________________| | |-----> /includes -> module -> config -> parser -''' - +""" LOG_TYPE_DARSHAN = 0 LOG_TYPE_RECORDER = 1 @@ -26,30 +27,57 @@ def clear(): """ Clear the screen with the comment call based on the operating system. """ - _ = call('clear' if os.name == 'posix' else 'cls') + _ = call("clear" if os.name == "posix" else "cls") + + +def check_log_type(paths: List[str]) -> Optional[int]: + is_darshan = True + is_recorder = True + multiple_logs = len(paths) > 1 + for path in paths: + if path.endswith(".darshan"): + if not os.path.isfile(path): + print("Unable to open .darshan file.") + sys.exit(os.EX_NOINPUT) + else: + is_darshan = True and is_darshan + is_recorder = False and is_recorder + else: # check whether is a valid recorder log + if not os.path.isdir(path): + print("Unable to open recorder folder.") + sys.exit(os.EX_NOINPUT) + else: + is_recorder = True and is_recorder + is_darshan = False and is_darshan -def check_log_type(path): - if path.endswith('.darshan'): - if not os.path.isfile(path): - print('Unable to open .darshan file.') + if multiple_logs: + if is_darshan: + return LOG_TYPE_DARSHAN + else: + print("Only .darshan files are supported for multiple logs.") sys.exit(os.EX_NOINPUT) - else: return LOG_TYPE_DARSHAN - else: # check whether is a valid recorder log - if not os.path.isdir(path): - print('Unable to open recorder folder.') + else: + if is_darshan and not is_recorder: + return LOG_TYPE_DARSHAN + elif is_recorder and not is_darshan: + return LOG_TYPE_RECORDER + else: + print("Unable to reliably determine the log type.") sys.exit(os.EX_NOINPUT) - else: return LOG_TYPE_RECORDER def main(): - log_type = check_log_type(args.log_path) - + log_type = check_log_type(args.log_paths) + if log_type == LOG_TYPE_DARSHAN: from drishti.handlers.handle_darshan import handler elif log_type == LOG_TYPE_RECORDER: from drishti.handlers.handle_recorder import handler - + handler() + +if __name__ == "__main__": + main() From 2940cfeeca792fb0477c922019f51a11bd5ed18e Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 24 Mar 2025 12:06:47 -0600 Subject: [PATCH 05/43] chore: Add Black component configuration for Python 3.13 SDK --- .idea/misc.xml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.idea/misc.xml b/.idea/misc.xml index 1d40550..a366115 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,7 @@ + + \ No newline at end of file From 4b1a58f0cc44d0155780159620b3597544264429 Mon Sep 17 00:00:00 2001 From: 
Joel Tony Date: Mon, 24 Mar 2025 12:14:55 -0600 Subject: [PATCH 06/43] chore: Refactor imports in handle_darshan.py for clarity and organization --- drishti/handlers/handle_darshan.py | 55 ++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index ea690f3..8a16b71 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -1,18 +1,59 @@ #!/usr/bin/env python3 +import csv +import datetime import io -import sys -import time +import os import shlex import shutil import subprocess -import pandas as pd +import sys +import time + import darshan import darshan.backend.cffi_backend as darshanll - -from rich import print +import pandas as pd from packaging import version -from drishti.includes.module import * +from rich import print +from rich.padding import Padding +from rich.panel import Panel + +from drishti.includes.config import ( + HIGH, + RECOMMENDATIONS, + WARN, + init_console, + insights_total, + thresholds, +) +# from drishti.includes.module import * +from drishti.includes.module import ( + check_individual_read_imbalance, + check_individual_write_imbalance, + check_long_metadata, + check_misaligned, + check_mpi_aggregator, + check_mpi_collective_read_operation, + check_mpi_collective_write_operation, + check_mpi_none_block_operation, + check_mpiio, + check_operation_intensive, + check_random_operation, + check_shared_data_imblance, + check_shared_small_operation, + check_shared_time_imbalance, + check_size_intensive, + check_small_operation, + check_stdio, + check_traffic, + display_content, + display_footer, + display_thresholds, + export_csv, + export_html, + export_svg, +) +from drishti.includes.parser import args def is_available(name): @@ -494,7 +535,7 @@ def handler(): detected_files = [] stragglers_count = 0 - stragglers_imbalance = {} + # stragglers_imbalance = {} shared_files_times = shared_files_times.assign(id=lambda d: d['id'].astype(str)) From b09300dde91e3fcefc5cd8acc7525e816f79bba4 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 24 Mar 2025 12:19:24 -0600 Subject: [PATCH 07/43] hotfix: Update log path handling to support multiple log paths and ensure consistency --- drishti/handlers/handle_darshan.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 8a16b71..4fc3c3a 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -116,7 +116,8 @@ def handler(): insights_start_time = time.time() - log = darshanll.log_open(args.log_path) + darshan_log_path = args.log_paths[0] + log = darshanll.log_open(darshan_log_path) modules = darshanll.log_get_modules(log) @@ -129,8 +130,8 @@ def handler(): library_version = darshanll.get_lib_version() # Make sure log format is of the same version - filename = args.log_path - # check_log_version(console, args.log_path, log_version, library_version) + filename = darshan_log_path + # check_log_version(console, darshan_log_path, log_version, library_version) darshanll.log_close(log) @@ -752,7 +753,7 @@ def handler(): job['exe'].split()[0] ), ' [b]DARSHAN[/b]: [white]{}[/white]'.format( - os.path.basename(args.log_path) + os.path.basename(darshan_log_path) ), ' [b]EXECUTION TIME[/b]: [white]{} to {} ({:.2f} hours)[/white]'.format( job_start, @@ -794,7 +795,7 @@ def handler(): display_footer(console, insights_start_time, insights_end_time) # Export to HTML, SVG, and 
CSV - trace_name = os.path.basename(args.log_path).replace('.darshan', '') + trace_name = os.path.basename(darshan_log_path).replace('.darshan', '') out_dir = args.export_dir if args.export_dir != "" else os.getcwd() export_html(console, out_dir, trace_name) From b6bfec2c7e628bd587b3025823771a163819cee5 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sat, 29 Mar 2025 13:56:23 -0600 Subject: [PATCH 08/43] chore: Update project SDK name from Python 3.13 to uv in IDE configuration files --- .idea/drishti-io.iml | 2 +- .idea/misc.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.idea/drishti-io.iml b/.idea/drishti-io.iml index 7b26d7f..c645b1e 100644 --- a/.idea/drishti-io.iml +++ b/.idea/drishti-io.iml @@ -4,7 +4,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index a366115..90404e0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file From fcf1c2519124be546ccd95e75efbc597553c18a9 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sat, 29 Mar 2025 14:00:10 -0600 Subject: [PATCH 09/43] chore: Add Mypy configuration and update setup.py for development dependencies --- .idea/MypyPlugin.xml | 8 ++++++++ setup.py | 7 +++++++ 2 files changed, 15 insertions(+) create mode 100644 .idea/MypyPlugin.xml diff --git a/.idea/MypyPlugin.xml b/.idea/MypyPlugin.xml new file mode 100644 index 0000000..ac4cd76 --- /dev/null +++ b/.idea/MypyPlugin.xml @@ -0,0 +1,8 @@ + + + + + \ No newline at end of file diff --git a/setup.py b/setup.py index a93a8ce..c3b9d6c 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,13 @@ 'rich==12.5.1', 'recorder-utils', ], + extras_require={ + 'dev': [ + 'ruff', + 'isort', + 'mypy' + ], + }, packages=find_packages(), package_data={ 'drishti.includes': [ From 611bfa177c89e39d25ad1ac847a4e8e7963e1c3e Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 1 Apr 2025 22:14:38 -0600 Subject: [PATCH 10/43] chore: Exclude .history folder from module content in IDE configuration --- .idea/drishti-io.iml | 1 + 1 file changed, 1 insertion(+) diff --git a/.idea/drishti-io.iml b/.idea/drishti-io.iml index c645b1e..eb59906 100644 --- a/.idea/drishti-io.iml +++ b/.idea/drishti-io.iml @@ -2,6 +2,7 @@ + From f2cdd50eb03f381aedd5bd6343125a9c8fea9498 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 1 Apr 2025 22:16:16 -0600 Subject: [PATCH 11/43] chore: Exclude sample/tensorflow_unet3d_darshan_per_rank_workload from project configuration --- .gitignore | 2 ++ .idea/drishti-io.iml | 1 + 2 files changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index d3c0162..74cfd33 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +sample/tensorflow_unet3d_darshan_per_rank_workload + # Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,pycharm,visualstudiocode # Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,pycharm,visualstudiocode diff --git a/.idea/drishti-io.iml b/.idea/drishti-io.iml index eb59906..883789c 100644 --- a/.idea/drishti-io.iml +++ b/.idea/drishti-io.iml @@ -4,6 +4,7 @@ + From fa2cd1969bae6206c2db57ba30b7caf86d3f05c9 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 1 Apr 2025 22:17:38 -0600 Subject: [PATCH 12/43] refactor: Update argument access to use parser module for consistency --- drishti/handlers/handle_darshan.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 4fc3c3a..c687d4c 100644 --- a/drishti/handlers/handle_darshan.py 
+++ b/drishti/handlers/handle_darshan.py @@ -53,7 +53,8 @@ export_html, export_svg, ) -from drishti.includes.parser import args +import drishti.includes.parser as parser +# from drishti.includes.parser import args def is_available(name): @@ -116,7 +117,7 @@ def handler(): insights_start_time = time.time() - darshan_log_path = args.log_paths[0] + darshan_log_path = parser.args.log_paths[0] log = darshanll.log_open(darshan_log_path) modules = darshanll.log_get_modules(log) @@ -199,12 +200,12 @@ def handler(): if "LUSTRE" in report.records: df_lustre = report.records['LUSTRE'].to_df() - if args.backtrace: + if parser.args.backtrace: if "DXT_POSIX" in report.records: dxt_posix = report.records["DXT_POSIX"].to_df() dxt_posix = pd.DataFrame(dxt_posix) if "address_line_mapping" not in dxt_posix: - args.backtrace = False + parser.args.backtrace = False else: read_id = [] read_rank = [] @@ -349,7 +350,7 @@ def handler(): # Get total number of I/O operations total_operations = total_writes + total_reads - # To check whether the application is write-intersive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance + # To check whether the application is write-intensive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance check_operation_intensive(total_operations, total_reads, total_writes) total_read_size = df['counters']['POSIX_BYTES_READ'].sum() @@ -796,7 +797,7 @@ def handler(): # Export to HTML, SVG, and CSV trace_name = os.path.basename(darshan_log_path).replace('.darshan', '') - out_dir = args.export_dir if args.export_dir != "" else os.getcwd() + out_dir = parser.args.export_dir if parser.args.export_dir != "" else os.getcwd() export_html(console, out_dir, trace_name) export_svg(console, out_dir, trace_name) From 148ee9a74b81c61ade7e47e71511e7fbf121b6db Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 1 Apr 2025 22:18:20 -0600 Subject: [PATCH 13/43] fix: Add type ignore comments for darshan imports to resolve type checking issues --- drishti/handlers/handle_darshan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index c687d4c..a8453c9 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -10,8 +10,8 @@ import sys import time -import darshan -import darshan.backend.cffi_backend as darshanll +import darshan # type: ignore +import darshan.backend.cffi_backend as darshanll # type: ignore import pandas as pd from packaging import version from rich import print From 783e3b42d4c4a741b0ac2712c79edca9acb70017 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 1 Apr 2025 22:18:55 -0600 Subject: [PATCH 14/43] feat: Import DarshanFile from darshan_util for enhanced functionality --- drishti/handlers/darshan_util.py | 290 +++++++++++++++++++++++++++++ drishti/handlers/handle_darshan.py | 2 + 2 files changed, 292 insertions(+) create mode 100644 drishti/handlers/darshan_util.py diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py new file mode 100644 index 0000000..f44bb50 --- /dev/null +++ b/drishti/handlers/darshan_util.py @@ -0,0 +1,290 @@ +import datetime +import typing +from dataclasses import dataclass, field +from enum import Enum +from functools import cached_property +from typing import Dict, Final, 
Optional, Union + +import pandas as pd + + +class ModuleType(str, Enum): + """Enum for standard I/O module types""" + POSIX = "posix" + STDIO = "stdio" + MPIIO = "mpiio" + + def __str__(self) -> str: + return self.value + + +@dataclass +class TimeSpan: + start: datetime.datetime + end: datetime.datetime + + def __post_init__(self): + if self.start > self.end: + raise ValueError(f"TimeSpan start ({self.start}) must be <= end ({self.end})") + +@dataclass +class IOCounter: + """Base class for I/O metrics with read/write counts""" + read: Final[int] = field(init=True) + write: Final[int] = field(init=True) + _total: Optional[int] = None + + @cached_property + def total(self) -> int: + """Total count, calculated once on first access""" + if self._total is not None: + return self._total + return self.read + self.write + +@dataclass +class IOSize(IOCounter): + """Represents I/O size statistics in bytes""" + pass + +@dataclass +class IOOperation(IOCounter): + """Represents I/O operation count statistics""" + pass + + +@dataclass +class IOStatistics: + """Tracks both I/O sizes and operations by module with aggregated metrics""" + # Use dicts to store module-specific data + sizes: Dict[Union[ModuleType, str], IOSize] = field(default_factory=dict) + operations: Dict[Union[ModuleType, str], IOOperation] = field(default_factory=dict) + + def __post_init__(self): + # Initialize standard modules if not present + for module in ModuleType: + # Ensure that the module is either in both sizes and operations or in neither + assert (module in self.sizes) == (module in self.operations), f"Module {module} should be in both sizes and operations or in neither" + + if module not in self.sizes: + self.sizes[module] = IOSize(read=0, write=0) + if module not in self.operations: + self.operations[module] = IOOperation(read=0, write=0) + + # Convenience properties for standard modules + @cached_property + def posix_size(self) -> int: + return self.sizes[ModuleType.POSIX].total + + @cached_property + def stdio_size(self) -> int: + return self.sizes[ModuleType.STDIO].total + + @cached_property + def mpiio_size(self) -> int: + return self.sizes[ModuleType.MPIIO].total + + @cached_property + def posix_ops(self) -> int: + return self.operations[ModuleType.POSIX].total + + @cached_property + def stdio_ops(self) -> int: + return self.operations[ModuleType.STDIO].total + + @cached_property + def mpiio_ops(self) -> int: + return self.operations[ModuleType.MPIIO].total + + # Aggregated size properties + @cached_property + def read_bytes(self) -> int: + """Total bytes read across all modules.""" + return sum(size.read for size in self.sizes.values()) + + @cached_property + def written_bytes(self) -> int: + """Total bytes written across all modules.""" + return sum(size.write for size in self.sizes.values()) + + @cached_property + def total_bytes(self) -> int: + """Total bytes transferred across all modules.""" + return self.read_bytes + self.written_bytes + + # Aggregated operation properties + @cached_property + def reads(self) -> int: + """Total read operations across all modules.""" + return sum(op.read for op in self.operations.values()) + + @cached_property + def writes(self) -> int: + """Total write operations across all modules.""" + return sum(op.write for op in self.operations.values()) + + @cached_property + def total_ops(self) -> int: + """Total operations across all modules.""" + return self.reads + self.writes + + # Methods to get stats for specific modules + def get_module_size(self, module: Optional[Union[ModuleType, 
str]] = None, data_type: Optional[str] = "total") -> int: + """Get size statistics for a specific module or all modules if not specified.""" + if module is None and data_type is None: + raise ValueError("Both module and data_type cannot be None") + + if module: + if module not in self.sizes: + raise ValueError(f"Module {module} not found in sizes") + size = self.sizes[module] + if data_type == "read": + return size.read + elif data_type == "write": + return size.write + else: # data_type is None or "total" + return size.total + else: + if data_type == "read": + return self.read_bytes + elif data_type == "write": + return self.written_bytes + else: # data_type is None or "total" + return self.total_bytes + + def get_module_ops(self, module: Optional[Union[ModuleType, str]] = None, data_type: Optional[str] = "total") -> int: + """Get operation statistics for a specific module or all modules if not specified.""" + if module is None and data_type is None: + raise ValueError("Both module and data_type cannot be None") + + if module: + if module not in self.operations: + raise ValueError(f"Module {module} not found in operations") + ops = self.operations[module] + if data_type == "read": + return ops.read + elif data_type == "write": + return ops.write + else: # data_type is None or "total" + return ops.total + else: + if data_type == "read": + return self.reads + elif data_type == "write": + return self.writes + else: # data_type is None or "total" + return self.total_ops + +@dataclass +class SmallIOStats(IOCounter): + """Statistics for small I/O operations""" + pass # Inherits read/write/total from IOCounter + +@dataclass +class SharedOpsStats(IOCounter): + """Statistics for shared file operations""" + pass # Inherits read/write/total from IOCounter + +@dataclass +class SharedSmallOpsStats(IOCounter): + """Statistics for small shared file operations""" + pass # Inherits read/write/total from IOCounter + +@dataclass +class ConsecutiveIOStats(IOCounter): + """Statistics for consecutive I/O operations""" + pass # Inherits read/write/total from IOCounter + +@dataclass +class SequentialIOStats(IOCounter): + """Statistics for sequential I/O operations""" + pass # Inherits read/write/total from IOCounter + +@dataclass +class RandomIOStats(IOCounter): + """Statistics for random I/O operations""" + pass # Inherits read/write/total from IOCounter + +@dataclass +class MPIIONonBlockingStats(IOCounter): + """Statistics for non-blocking MPI I/O operations""" + pass + +@dataclass +class MPICollectiveIOStats(IOCounter): + """Statistics for collective MPI I/O operations""" + pass + +@dataclass +class MPIIndependentIOStats(IOCounter): + """Statistics for independent MPI I/O operations""" + pass + +@dataclass +class AccessPatternStats: + """Statistics for I/O access patterns by pattern type""" + consecutive: ConsecutiveIOStats = field(default_factory=lambda: ConsecutiveIOStats(read=0, write=0)) + sequential: SequentialIOStats = field(default_factory=lambda: SequentialIOStats(read=0, write=0)) + random: RandomIOStats = field(default_factory=lambda: RandomIOStats(read=0, write=0)) + +@dataclass +class DarshanFile: + # TODO: All fields which are not calculated should be instantly populated and not optional + # TODO: Explore using typeddicts instead of dicts + job_id: Optional[str] = None + log_ver: Optional[str] = None + time: Optional[TimeSpan] = None + exe: Optional[str] = None + modules: Optional[typing.Iterable[str]] = None + name_records: Optional[typing.Dict[str, str]] = None + max_read_offset: Optional[int] 
= None + max_write_offset: Optional[int] = None + total_files_stdio: Optional[int] = None + total_files_posix: Optional[int] = None + total_files_mpiio: Optional[int] = None + files: Optional[typing.Dict[str, str]] = None + + # Replace individual I/O stats with IOStatistics class + io_stats: Optional[IOStatistics] = None + + # File counts + total_files: Optional[int] = 0 + + # Additional I/O statistics organized by category + small_io: Optional[SmallIOStats] = None + + # Direct alignment fields instead of a class + mem_not_aligned: Optional[int] = None + file_not_aligned: Optional[int] = None + + access_pattern: Optional[AccessPatternStats] = None + + # Use separate classes for shared operations + shared_ops: Optional[SharedOpsStats] = None + shared_small_ops: Optional[SharedSmallOpsStats] = None + + count_long_metadata: Optional[int] = None + posix_shared_data_imbalance_stragglers_count: Optional[int] = None + + has_hdf5_extension: Optional[bool] = None + + mpiio_nb_ops: Optional[MPIIONonBlockingStats] = None + + cb_nodes: Optional[int] = None + number_of_compute_nodes: Optional[int] = None + hints: Optional[list[str]] = None + + timestamp: Optional[TimeSpan] = None + + aggregated: Optional[pd.DataFrame] = None + + mpi_coll_ops: Optional[MPICollectiveIOStats] = None + mpi_indep_ops: Optional[MPIIndependentIOStats] = None + + detected_files_mpi_coll_reads: Optional[pd.DataFrame] = None + detected_files_mpi_coll_writes: Optional[pd.DataFrame] = None + + imbalance_count_posix_shared_time: Optional[int] = None + posix_shared_time_imbalance_detected_files: Optional[tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]] = None + + + diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index a8453c9..ce9e4e4 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -18,6 +18,8 @@ from rich.padding import Padding from rich.panel import Panel +from drishti.handlers.darshan_util import DarshanFile + from drishti.includes.config import ( HIGH, RECOMMENDATIONS, From 90d23e8f1de8eb61fb13eba7029ae067ab6dd7c5 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 1 Apr 2025 23:00:41 -0600 Subject: [PATCH 15/43] fix: Update default_factory arguments in AccessPatternStats for proper initialization --- drishti/handlers/darshan_util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index f44bb50..175e453 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -222,9 +222,9 @@ class MPIIndependentIOStats(IOCounter): @dataclass class AccessPatternStats: """Statistics for I/O access patterns by pattern type""" - consecutive: ConsecutiveIOStats = field(default_factory=lambda: ConsecutiveIOStats(read=0, write=0)) - sequential: SequentialIOStats = field(default_factory=lambda: SequentialIOStats(read=0, write=0)) - random: RandomIOStats = field(default_factory=lambda: RandomIOStats(read=0, write=0)) + consecutive: ConsecutiveIOStats = field(default_factory=lambda: ConsecutiveIOStats(read=0, write=0), init=True) + sequential: SequentialIOStats = field(default_factory=lambda: SequentialIOStats(read=0, write=0), init=True) + random: RandomIOStats = field(default_factory=lambda: RandomIOStats(read=0, write=0), init=True) @dataclass class DarshanFile: From ba3381d813032e9964f137ce6076b91a2e4093a9 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Thu, 3 Apr 2025 11:46:42 -0600 Subject: [PATCH 16/43] fix: Update type hints in for Python 3.8 
compatability --- drishti/handlers/darshan_util.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 175e453..6c9090e 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field from enum import Enum from functools import cached_property -from typing import Dict, Final, Optional, Union +from typing import Dict, Final, Optional, Union, List, Tuple, Iterable import pandas as pd @@ -234,14 +234,14 @@ class DarshanFile: log_ver: Optional[str] = None time: Optional[TimeSpan] = None exe: Optional[str] = None - modules: Optional[typing.Iterable[str]] = None - name_records: Optional[typing.Dict[str, str]] = None + modules: Optional[Iterable[str]] = None + name_records: Optional[Dict[str, str]] = None max_read_offset: Optional[int] = None max_write_offset: Optional[int] = None total_files_stdio: Optional[int] = None total_files_posix: Optional[int] = None total_files_mpiio: Optional[int] = None - files: Optional[typing.Dict[str, str]] = None + files: Optional[Dict[str, str]] = None # Replace individual I/O stats with IOStatistics class io_stats: Optional[IOStatistics] = None @@ -271,7 +271,7 @@ class DarshanFile: cb_nodes: Optional[int] = None number_of_compute_nodes: Optional[int] = None - hints: Optional[list[str]] = None + hints: Optional[List[str]] = None timestamp: Optional[TimeSpan] = None @@ -284,7 +284,7 @@ class DarshanFile: detected_files_mpi_coll_writes: Optional[pd.DataFrame] = None imbalance_count_posix_shared_time: Optional[int] = None - posix_shared_time_imbalance_detected_files: Optional[tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]] = None + posix_shared_time_imbalance_detected_files: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]] = None From efdd6f6c835968ab2ec6ad4c0e93199b588abe83 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 14 Apr 2025 17:08:30 -0600 Subject: [PATCH 17/43] refactor: Consolidate module function calls under the 'module' namespace for improved organization --- drishti/handlers/handle_darshan.py | 106 +++++++++++++++-------------- 1 file changed, 55 insertions(+), 51 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index ce9e4e4..633238e 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -12,13 +12,14 @@ import darshan # type: ignore import darshan.backend.cffi_backend as darshanll # type: ignore +import numpy as np import pandas as pd from packaging import version from rich import print from rich.padding import Padding from rich.panel import Panel -from drishti.handlers.darshan_util import DarshanFile +from drishti.handlers.darshan_util import DarshanFile, ModuleType from drishti.includes.config import ( HIGH, @@ -28,33 +29,36 @@ insights_total, thresholds, ) + # from drishti.includes.module import * -from drishti.includes.module import ( - check_individual_read_imbalance, - check_individual_write_imbalance, - check_long_metadata, - check_misaligned, - check_mpi_aggregator, - check_mpi_collective_read_operation, - check_mpi_collective_write_operation, - check_mpi_none_block_operation, - check_mpiio, - check_operation_intensive, - check_random_operation, - check_shared_data_imblance, - check_shared_small_operation, - check_shared_time_imbalance, - check_size_intensive, - check_small_operation, - check_stdio, - check_traffic, - display_content, - display_footer, - 
display_thresholds, - export_csv, - export_html, - export_svg, -) +import drishti.includes.module as module + +# from drishti.includes.module import ( +# check_individual_read_imbalance, +# check_individual_write_imbalance, +# check_long_metadata, +# check_misaligned, +# check_mpi_aggregator, +# check_mpi_collective_read_operation, +# check_mpi_collective_write_operation, +# check_mpi_none_block_operation, +# check_mpiio, +# check_operation_intensive, +# check_random_operation, +# check_shared_data_imblance, +# check_shared_small_operation, +# check_shared_time_imbalance, +# check_size_intensive, +# check_small_operation, +# check_stdio, +# check_traffic, +# display_content, +# display_footer, +# display_thresholds, +# export_csv, +# export_html, +# export_svg, +# ) import drishti.includes.parser as parser # from drishti.includes.parser import args @@ -335,8 +339,8 @@ def handler(): 'mpiio': uses_mpiio } - check_stdio(total_size, total_size_stdio) - check_mpiio(modules) + module.check_stdio(total_size, total_size_stdio) + module.check_mpiio(modules) ######################################################################################################################################################################### @@ -353,14 +357,14 @@ def handler(): total_operations = total_writes + total_reads # To check whether the application is write-intensive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance - check_operation_intensive(total_operations, total_reads, total_writes) + module.check_operation_intensive(total_operations, total_reads, total_writes) total_read_size = df['counters']['POSIX_BYTES_READ'].sum() total_written_size = df['counters']['POSIX_BYTES_WRITTEN'].sum() total_size = total_written_size + total_read_size - check_size_intensive(total_size, total_read_size, total_written_size) + module.check_size_intensive(total_size, total_read_size, total_written_size) ######################################################################################################################################################################### @@ -404,7 +408,7 @@ def handler(): detected_files.columns = ['id', 'total_reads', 'total_writes'] detected_files.loc[:, 'id'] = detected_files.loc[:, 'id'].astype(str) - check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) ######################################################################################################################################################################### @@ -413,7 +417,7 @@ def handler(): total_mem_not_aligned = df['counters']['POSIX_MEM_NOT_ALIGNED'].sum() total_file_not_aligned = df['counters']['POSIX_FILE_NOT_ALIGNED'].sum() - check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, dxt_posix, dxt_posix_read_data) + module.check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, dxt_posix, dxt_posix_read_data) ######################################################################################################################################################################### @@ -422,7 
+426,7 @@ def handler(): max_read_offset = df['counters']['POSIX_MAX_BYTE_READ'].max() max_write_offset = df['counters']['POSIX_MAX_BYTE_WRITTEN'].max() - check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) ######################################################################################################################################################################### @@ -447,7 +451,7 @@ def handler(): write_random = total_writes - write_consecutive - write_sequential #print('WRITE Random: {} ({:.2f}%)'.format(write_random, write_random / total_writes * 100)) - check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) ######################################################################################################################################################################### @@ -493,13 +497,13 @@ def handler(): shared_files['POSIX_SIZE_WRITE_100K_1M'] ) - check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map) + module.check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map) ######################################################################################################################################################################### count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > thresholds['metadata_time_rank'][0])]) - check_long_metadata(count_long_metadata, modules) + module.check_long_metadata(count_long_metadata, modules) # We already have a single line for each shared-file access # To check for stragglers, we can check the difference between the @@ -527,7 +531,7 @@ def handler(): column_names = ['id', 'data_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) # POSIX_F_FASTEST_RANK_TIME # POSIX_F_SLOWEST_RANK_TIME @@ -555,7 +559,7 @@ def handler(): column_names = ['id', 'time_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_shared_time_imbalance(stragglers_count, detected_files, file_map) + module.check_shared_time_imbalance(stragglers_count, detected_files, file_map) aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] @@ -584,7 +588,7 @@ def handler(): column_names = ['id', 'write_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_write_data) + module.check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix, 
dxt_posix_write_data) imbalance_count = 0 @@ -600,7 +604,7 @@ def handler(): column_names = ['id', 'read_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_read_data) + module.check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_read_data) ######################################################################################################################################################################### @@ -635,7 +639,7 @@ def handler(): column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio) + module.check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio) df_mpiio_collective_writes = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)] @@ -660,7 +664,7 @@ def handler(): column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio) + module.check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio) ######################################################################################################################################################################### @@ -677,7 +681,7 @@ def handler(): mpiio_nb_reads = df_mpiio['counters']['MPIIO_NB_READS'].sum() mpiio_nb_writes = df_mpiio['counters']['MPIIO_NB_WRITES'].sum() - check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules) + module.check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules) ######################################################################################################################################################################### @@ -726,7 +730,7 @@ def handler(): NUMBER_OF_COMPUTE_NODES = first['NNodes'] # Do we have one MPI-IO aggregator per node? 
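# A minimal sketch of the idea behind the aggregator question above: ROMIO's
# "cb_nodes" hint sets how many collective-buffering aggregators MPI-IO uses,
# and the insight compares that hint against the number of compute nodes in
# the job. The helper below is hypothetical and only illustrates the comparison.
def uses_one_aggregator_per_node(cb_nodes: int, number_of_compute_nodes: int) -> bool:
    """True when collective buffering runs exactly one aggregator per compute node."""
    return cb_nodes == number_of_compute_nodes

# e.g. a 128-node job launched with cb_nodes=16 would not satisfy the
# one-aggregator-per-node recommendation.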
- check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES) + module.check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES) except StopIteration: pass except FileNotFoundError: @@ -793,14 +797,14 @@ def handler(): console.print() - display_content(console) - display_thresholds(console) - display_footer(console, insights_start_time, insights_end_time) + module.display_content(console) + module.display_thresholds(console) + module.display_footer(console, insights_start_time, insights_end_time) # Export to HTML, SVG, and CSV trace_name = os.path.basename(darshan_log_path).replace('.darshan', '') out_dir = parser.args.export_dir if parser.args.export_dir != "" else os.getcwd() - export_html(console, out_dir, trace_name) - export_svg(console, out_dir, trace_name) - export_csv(out_dir, trace_name, job['job']['jobid']) + module.export_html(console, out_dir, trace_name) + module.export_svg(console, out_dir, trace_name) + module.export_csv(out_dir, trace_name, job['job']['jobid']) From c2dcad74f9375f56dc70bb0fe8ea3b4e1e94fa2d Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 14 Apr 2025 17:19:41 -0600 Subject: [PATCH 18/43] fix: Update enum values in ModuleType for consistency with Darshan naming conventions --- drishti/handlers/darshan_util.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 6c9090e..c9e62f8 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -10,10 +10,11 @@ class ModuleType(str, Enum): """Enum for standard I/O module types""" - POSIX = "posix" - STDIO = "stdio" - MPIIO = "mpiio" - + + POSIX = "POSIX" + STDIO = "STDIO" + MPIIO = "MPI-IO" + def __str__(self) -> str: return self.value From 8e7c36d752621537bab95eead5e64a523f0e187e Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 14 Apr 2025 17:20:27 -0600 Subject: [PATCH 19/43] fix: Update IOStatistics to use ModuleType for sizes and operations dictionaries --- drishti/handlers/darshan_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index c9e62f8..19a02ae 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -57,8 +57,8 @@ class IOOperation(IOCounter): class IOStatistics: """Tracks both I/O sizes and operations by module with aggregated metrics""" # Use dicts to store module-specific data - sizes: Dict[Union[ModuleType, str], IOSize] = field(default_factory=dict) - operations: Dict[Union[ModuleType, str], IOOperation] = field(default_factory=dict) + sizes: Dict[ModuleType, IOSize] = field(init=True) + operations: Dict[ModuleType, IOOperation] = field(init=True) def __post_init__(self): # Initialize standard modules if not present From 7eb8211fa5cdf06b555fb71d3c7489516fcfe448 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 16 Apr 2025 15:24:33 -0600 Subject: [PATCH 20/43] refactor: Enhance DarshanFile class with cached properties for improved I/O statistics and module management --- drishti/handlers/darshan_util.py | 278 ++++++++++++++++++++++++++++- drishti/handlers/handle_darshan.py | 50 +++++- drishti/includes/module.py | 3 +- 3 files changed, 313 insertions(+), 18 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 19a02ae..f0cb5ff 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -3,9 +3,13 @@ from dataclasses import dataclass, field from enum import Enum from functools 
import cached_property +from os import write from typing import Dict, Final, Optional, Union, List, Tuple, Iterable +import numpy as np import pandas as pd +from darshan import DarshanReport # type: ignore +import drishti.includes.parser as parser class ModuleType(str, Enum): @@ -231,12 +235,14 @@ class AccessPatternStats: class DarshanFile: # TODO: All fields which are not calculated should be instantly populated and not optional # TODO: Explore using typeddicts instead of dicts + file_path: str + _darshan_report: Optional[DarshanReport] = None job_id: Optional[str] = None log_ver: Optional[str] = None time: Optional[TimeSpan] = None exe: Optional[str] = None - modules: Optional[Iterable[str]] = None - name_records: Optional[Dict[str, str]] = None + _modules: Optional[Iterable[str]] = None + _name_records: Optional[Dict[int, str]] = None # Keys are uint64 max_read_offset: Optional[int] = None max_write_offset: Optional[int] = None total_files_stdio: Optional[int] = None @@ -245,20 +251,22 @@ class DarshanFile: files: Optional[Dict[str, str]] = None # Replace individual I/O stats with IOStatistics class - io_stats: Optional[IOStatistics] = None - + _io_stats: Optional[IOStatistics] = None + # File counts total_files: Optional[int] = 0 # Additional I/O statistics organized by category - small_io: Optional[SmallIOStats] = None - + _posix_small_io: Optional[SmallIOStats] = None + + _posix_detected_small_files: Optional[pd.DataFrame] = None + # Direct alignment fields instead of a class mem_not_aligned: Optional[int] = None file_not_aligned: Optional[int] = None - + access_pattern: Optional[AccessPatternStats] = None - + # Use separate classes for shared operations shared_ops: Optional[SharedOpsStats] = None shared_small_ops: Optional[SharedSmallOpsStats] = None @@ -287,5 +295,259 @@ class DarshanFile: imbalance_count_posix_shared_time: Optional[int] = None posix_shared_time_imbalance_detected_files: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]] = None + @cached_property + def report(self) -> DarshanReport: + if self._darshan_report is None: + self._darshan_report = DarshanReport(self.file_path) + return self._darshan_report + + @cached_property + def modules(self) -> Iterable[str]: + if self._modules is None: + self._modules = set(self.report.records.keys()) + return self._modules + + @cached_property + def io_stats(self) -> IOStatistics: + if self._io_stats is None: + # Calculate I/O sizes + sizes: Dict[ModuleType, IOSize] = {} + ops: Dict[ModuleType, IOOperation] = {} + if ModuleType.STDIO in self.modules: + df = self.report.records[ModuleType.STDIO].to_df() + counters = df["counters"] + assert df, "STDIO module data frame is empty" + + stdio_read_size = counters["STDIO_BYTES_READ"].sum() + stdio_write_size = counters["STDIO_BYTES_WRITTEN"].sum() + sizes[ModuleType.STDIO] = IOSize( + read=stdio_read_size, write=stdio_write_size + ) + + stdio_read_ops = counters["STDIO_READS"].sum() + stdio_write_ops = counters["STDIO_WRITES"].sum() + ops[ModuleType.STDIO] = IOOperation( + read=stdio_read_ops, write=stdio_write_ops + ) + + if ModuleType.POSIX in self.modules: + df = self.report.records[ModuleType.POSIX].to_df() + counters = df["counters"] + assert df, "POSIX module data frame is empty" + + posix_write_size = counters["POSIX_BYTES_WRITTEN"].sum() + posix_read_size = counters["POSIX_BYTES_READ"].sum() + sizes[ModuleType.POSIX] = IOSize( + read=posix_read_size, write=posix_write_size + ) + + posix_read_ops = counters["POSIX_READS"].sum() + posix_write_ops = 
counters["POSIX_WRITES"].sum() + ops[ModuleType.POSIX] = IOOperation( + read=posix_read_ops, write=posix_write_ops + ) + + if ModuleType.MPIIO in self.modules: + df = self.report.records[ModuleType.MPIIO].to_df() + counters = df["counters"] + assert df, "MPIIO module data frame is empty" + + mpiio_write_size = counters["MPIIO_BYTES_WRITTEN"].sum() + mpiio_read_size = counters["MPIIO_BYTES_READ"].sum() + sizes[ModuleType.MPIIO] = IOSize( + read=mpiio_read_size, write=mpiio_write_size + ) + + mpiio_read_ops = -1 + mpiio_write_ops = -1 + ops[ModuleType.MPIIO] = IOOperation( + read=mpiio_read_ops, write=mpiio_write_ops + ) + + self._io_stats = IOStatistics(sizes=sizes, operations=ops) + return self._io_stats + @cached_property + def posix_small_io(self) -> SmallIOStats: + if self._posix_small_io is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + posix_reads_small = ( + posix_counters["POSIX_SIZE_READ_0_100"].sum() + + posix_counters["POSIX_SIZE_READ_100_1K"].sum() + + posix_counters["POSIX_SIZE_READ_1K_10K"].sum() + + posix_counters["POSIX_SIZE_READ_10K_100K"].sum() + + posix_counters["POSIX_SIZE_READ_100K_1M"].sum() + ) + posix_writes_small = ( + posix_counters["POSIX_SIZE_WRITE_0_100"].sum() + + posix_counters["POSIX_SIZE_WRITE_100_1K"].sum() + + posix_counters["POSIX_SIZE_WRITE_1K_10K"].sum() + + posix_counters["POSIX_SIZE_WRITE_10K_100K"].sum() + + posix_counters["POSIX_SIZE_WRITE_100K_1M"].sum() + ) + self._posix_small_io = SmallIOStats( + read=posix_reads_small, write=posix_writes_small + ) + return self._posix_small_io + + @property + def posix_detected_small_files(self) -> pd.DataFrame: + if self._posix_detected_small_files is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + posix_counters["INSIGHTS_POSIX_SMALL_READ"] = ( + posix_counters["POSIX_SIZE_READ_0_100"] + + posix_counters["POSIX_SIZE_READ_100_1K"] + + posix_counters["POSIX_SIZE_READ_1K_10K"] + + posix_counters["POSIX_SIZE_READ_10K_100K"] + + posix_counters["POSIX_SIZE_READ_100K_1M"] + ) + posix_counters["INSIGHTS_POSIX_SMALL_WRITE"] = ( + posix_counters["POSIX_SIZE_WRITE_0_100"] + + posix_counters["POSIX_SIZE_WRITE_100_1K"] + + posix_counters["POSIX_SIZE_WRITE_1K_10K"] + + posix_counters["POSIX_SIZE_WRITE_10K_100K"] + + posix_counters["POSIX_SIZE_WRITE_100K_1M"] + ) + detected_files = pd.DataFrame( + posix_counters.groupby("id")[ + ["INSIGHTS_POSIX_SMALL_READ", "INSIGHTS_POSIX_SMALL_WRITE"] + ].sum() + ).reset_index() + detected_files.columns = pd.Index(["id", "total_reads", "total_writes"]) + detected_files.loc[:, "id"] = detected_files.loc[:, "id"].astype(str) + self._posix_detected_small_files = detected_files + return self._posix_detected_small_files + + @property + def file_map(self) -> Dict[int, str]: + return self.name_records + @cached_property + def name_records(self) -> Dict[int, str]: + if self._name_records is None: + self._name_records = self.report.name_records + return self._name_records + + @property + def dxt_posix_df(self) -> Optional[pd.DataFrame]: + # TODO + # if parser.args.backtrace is False: + # return None + assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module" + dxt_posix_df = pd.DataFrame(self.report.records["DXT_POSIX"].to_df()) + return dxt_posix_df + + @property + def dxt_posix_read_df(self) -> Optional[pd.DataFrame]: + if parser.args.backtrace is False: + return None + assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module" + df = self.dxt_posix_df + assert df is 
not None, "Should be handled by parser.args.backtrace check" + + # TODO + # if "address_line_mapping" not in df: + # parser.args.backtrace = False + # return None + + read_id = [] + read_rank = [] + read_length = [] + read_offsets = [] + read_end_time = [] + read_start_time = [] + read_operation = [] + + for r in zip(df["rank"], df["read_segments"], df["write_segments"], df["id"]): + if not r[1].empty: + read_id.append([r[3]] * len((r[1]["length"].to_list()))) + read_rank.append([r[0]] * len((r[1]["length"].to_list()))) + read_length.append(r[1]["length"].to_list()) + read_end_time.append(r[1]["end_time"].to_list()) + read_start_time.append(r[1]["start_time"].to_list()) + read_operation.append(["read"] * len((r[1]["length"].to_list()))) + read_offsets.append(r[1]["offset"].to_list()) + + read_id = [element for nestedlist in read_id for element in nestedlist] + read_rank = [element for nestedlist in read_rank for element in nestedlist] + read_length = [element for nestedlist in read_length for element in nestedlist] + read_offsets = [ + element for nestedlist in read_offsets for element in nestedlist + ] + read_end_time = [ + element for nestedlist in read_end_time for element in nestedlist + ] + read_operation = [ + element for nestedlist in read_operation for element in nestedlist + ] + read_start_time = [ + element for nestedlist in read_start_time for element in nestedlist + ] + + dxt_posix_read_data = { + "id": read_id, + "rank": read_rank, + "length": read_length, + "end_time": read_end_time, + "start_time": read_start_time, + "operation": read_operation, + "offsets": read_offsets, + } + + return pd.DataFrame(dxt_posix_read_data) + + @property + def dxt_posix_write_df(self) -> Optional[pd.DataFrame]: + if parser.args.backtrace is False: + return None + assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module" + df = self.dxt_posix_df + assert df is not None, "Should be handled by parser.args.backtrace check" + + # TODO + # if "address_line_mapping" not in df: + # parser.args.backtrace = False + # return None + + write_id = [] + write_rank = [] + write_length = [] + write_offsets = [] + write_end_time = [] + write_start_time = [] + write_operation = [] + + for r in zip(df['rank'], df['read_segments'], df['write_segments'], df['id']): + if not r[2].empty: + write_id.append([r[3]] * len((r[2]['length'].to_list()))) + write_rank.append([r[0]] * len((r[2]['length'].to_list()))) + write_length.append(r[2]['length'].to_list()) + write_end_time.append(r[2]['end_time'].to_list()) + write_start_time.append(r[2]['start_time'].to_list()) + write_operation.append(['write'] * len((r[2]['length'].to_list()))) + write_offsets.append(r[2]['offset'].to_list()) + + + write_id = [element for nestedlist in write_id for element in nestedlist] + write_rank = [element for nestedlist in write_rank for element in nestedlist] + write_length = [element for nestedlist in write_length for element in nestedlist] + write_offsets = [element for nestedlist in write_offsets for element in nestedlist] + write_end_time = [element for nestedlist in write_end_time for element in nestedlist] + write_operation = [element for nestedlist in write_operation for element in nestedlist] + write_start_time = [element for nestedlist in write_start_time for element in nestedlist] + + + dxt_posix_write_data = pd.DataFrame( + { + 'id': write_id, + 'rank': write_rank, + 'length': write_length, + 'end_time': write_end_time, + 'start_time': write_start_time, + 'operation': write_operation, + 'offsets': write_offsets, + }) + + return 
pd.DataFrame(dxt_posix_write_data) \ No newline at end of file diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 633238e..d58a34b 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -148,6 +148,9 @@ def handler(): job = report.metadata + ######################################################################################################################################################################### + darshan_file_obj = DarshanFile(file_path=darshan_log_path) + ######################################################################################################################################################################### # Check usage of STDIO, POSIX, and MPI-IO per file @@ -205,14 +208,17 @@ def handler(): df_lustre = None if "LUSTRE" in report.records: df_lustre = report.records['LUSTRE'].to_df() - if parser.args.backtrace: if "DXT_POSIX" in report.records: dxt_posix = report.records["DXT_POSIX"].to_df() dxt_posix = pd.DataFrame(dxt_posix) - if "address_line_mapping" not in dxt_posix: - parser.args.backtrace = False + if False: + # if "address_line_mapping" not in dxt_posix: + # parser.args.backtrace = False # TODO + print("Upper") + pass else: + print("ENTERED") read_id = [] read_rank = [] read_length = [] @@ -339,8 +345,10 @@ def handler(): 'mpiio': uses_mpiio } - module.check_stdio(total_size, total_size_stdio) - module.check_mpiio(modules) + # module.check_stdio(total_size, total_size_stdio) + module.check_stdio(total_size=darshan_file_obj.io_stats.total_bytes, total_size_stdio=darshan_file_obj.io_stats.stdio_size) + # module.check_mpiio(modules) + module.check_mpiio(modules=darshan_file_obj.modules) ######################################################################################################################################################################### @@ -357,14 +365,24 @@ def handler(): total_operations = total_writes + total_reads # To check whether the application is write-intensive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance - module.check_operation_intensive(total_operations, total_reads, total_writes) + # module.check_operation_intensive(total_operations, total_reads, total_writes) + module.check_operation_intensive( + total_operations=darshan_file_obj.io_stats.posix_ops, + total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read"), + total_writes=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "write"), + ) total_read_size = df['counters']['POSIX_BYTES_READ'].sum() total_written_size = df['counters']['POSIX_BYTES_WRITTEN'].sum() total_size = total_written_size + total_read_size - module.check_size_intensive(total_size, total_read_size, total_written_size) + # module.check_size_intensive(total_size, total_read_size, total_written_size) + module.check_size_intensive( + total_size=darshan_file_obj.io_stats.total_bytes, + total_read_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "read"), + total_written_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "write"), + ) ######################################################################################################################################################################### @@ -408,7 +426,21 @@ def handler(): detected_files.columns = ['id', 'total_reads', 'total_writes'] detected_files.loc[:, 'id'] = detected_files.loc[:, 
'id'].astype(str) - module.check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + + # module.check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + parser.args.backtrace = True # TODO + + module.check_small_operation( + total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read"), + total_reads_small=darshan_file_obj.posix_small_io.read, + total_writes=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "write"), + total_writes_small=darshan_file_obj.posix_small_io.write, + detected_files=darshan_file_obj.posix_detected_small_files, modules=darshan_file_obj.modules, + file_map=darshan_file_obj.file_map, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df, + dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df, + ) ######################################################################################################################################################################### @@ -506,7 +538,7 @@ def handler(): module.check_long_metadata(count_long_metadata, modules) # We already have a single line for each shared-file access - # To check for stragglers, we can check the difference between the + # To check for stragglers, we can check the difference between the # POSIX_FASTEST_RANK_BYTES # POSIX_SLOWEST_RANK_BYTES diff --git a/drishti/includes/module.py b/drishti/includes/module.py index e7f70d6..52fac10 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -4,6 +4,7 @@ import datetime import os import time +import typing import pandas as pd from rich import box @@ -73,7 +74,7 @@ def check_stdio(total_size, total_size_stdio): ) -def check_mpiio(modules): +def check_mpiio(modules: typing.Iterable[str]): """ Check whether the application has used MPI-IO or not From 0ff644b56c29e86ed204065d3448d2b88f46657b Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 16 Apr 2025 15:33:35 -0600 Subject: [PATCH 21/43] hotfix: Remove TODO comments and implement backtrace checks for address line mapping in DXT_POSIX data handling --- drishti/handlers/darshan_util.py | 19 ++++++++----------- drishti/handlers/handle_darshan.py | 10 ++-------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index f0cb5ff..4b1d3b2 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -432,9 +432,8 @@ def name_records(self) -> Dict[int, str]: @property def dxt_posix_df(self) -> Optional[pd.DataFrame]: - # TODO - # if parser.args.backtrace is False: - # return None + if parser.args.backtrace is False: + return None assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module" dxt_posix_df = pd.DataFrame(self.report.records["DXT_POSIX"].to_df()) return dxt_posix_df @@ -447,10 +446,9 @@ def dxt_posix_read_df(self) -> Optional[pd.DataFrame]: df = self.dxt_posix_df assert df is not None, "Should be handled by parser.args.backtrace check" - # TODO - # if "address_line_mapping" not in df: - # parser.args.backtrace = False - # return None + if "address_line_mapping" not in df: + parser.args.backtrace = False + return None read_id = [] read_rank = [] @@ -506,10 +504,9 @@ def dxt_posix_write_df(self) -> Optional[pd.DataFrame]: df = self.dxt_posix_df assert df is not None, 
"Should be handled by parser.args.backtrace check" - # TODO - # if "address_line_mapping" not in df: - # parser.args.backtrace = False - # return None + if "address_line_mapping" not in df: + parser.args.backtrace = False + return None write_id = [] write_rank = [] diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index d58a34b..22ac827 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -212,13 +212,9 @@ def handler(): if "DXT_POSIX" in report.records: dxt_posix = report.records["DXT_POSIX"].to_df() dxt_posix = pd.DataFrame(dxt_posix) - if False: - # if "address_line_mapping" not in dxt_posix: - # parser.args.backtrace = False # TODO - print("Upper") - pass + if "address_line_mapping" not in dxt_posix: + parser.args.backtrace = False else: - print("ENTERED") read_id = [] read_rank = [] read_length = [] @@ -428,8 +424,6 @@ def handler(): # module.check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) - parser.args.backtrace = True # TODO - module.check_small_operation( total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read"), total_reads_small=darshan_file_obj.posix_small_io.read, From 9951500f82c3f3527853b73edf6b94190a23e503 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 16 Apr 2025 15:40:41 -0600 Subject: [PATCH 22/43] refactor: Introduce cached properties for memory and file alignment checks in Darshan data handling --- drishti/handlers/darshan_util.py | 28 +++++++++++++++++++++++++--- drishti/handlers/handle_darshan.py | 14 ++++++++++++-- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 4b1d3b2..831713b 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -262,8 +262,8 @@ class DarshanFile: _posix_detected_small_files: Optional[pd.DataFrame] = None # Direct alignment fields instead of a class - mem_not_aligned: Optional[int] = None - file_not_aligned: Optional[int] = None + _mem_not_aligned: Optional[int] = None + _file_not_aligned: Optional[int] = None access_pattern: Optional[AccessPatternStats] = None @@ -547,4 +547,26 @@ def dxt_posix_write_df(self) -> Optional[pd.DataFrame]: 'offsets': write_offsets, }) - return pd.DataFrame(dxt_posix_write_data) \ No newline at end of file + return pd.DataFrame(dxt_posix_write_data) + + @cached_property + def mem_not_aligned(self) -> int: + if self._mem_not_aligned is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._mem_not_aligned = posix_counters['POSIX_MEM_NOT_ALIGNED'].sum() + return self._mem_not_aligned + + @cached_property + def file_not_aligned(self) -> int: + if self._file_not_aligned is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._file_not_aligned = posix_counters['POSIX_FILE_NOT_ALIGNED'].sum() + return self._file_not_aligned + + @property + def lustre_df(self) -> Optional[pd.DataFrame]: + if "LUSTRE" not in self.modules: + return None + return pd.DataFrame(self.report.records["LUSTRE"].to_df()) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 22ac827..aa803d9 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -358,7 +358,7 @@ def handler(): total_writes = 
df['counters']['POSIX_WRITES'].sum() # Get total number of I/O operations - total_operations = total_writes + total_reads + total_operations = total_writes + total_reads # To check whether the application is write-intensive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance # module.check_operation_intensive(total_operations, total_reads, total_writes) @@ -443,7 +443,17 @@ def handler(): total_mem_not_aligned = df['counters']['POSIX_MEM_NOT_ALIGNED'].sum() total_file_not_aligned = df['counters']['POSIX_FILE_NOT_ALIGNED'].sum() - module.check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, dxt_posix, dxt_posix_read_data) + # module.check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, dxt_posix, dxt_posix_read_data) + module.check_misaligned( + total_operations=darshan_file_obj.io_stats.posix_ops, + total_mem_not_aligned=darshan_file_obj.mem_not_aligned, + total_file_not_aligned=darshan_file_obj.file_not_aligned, + modules=darshan_file_obj.modules, + file_map=darshan_file_obj.file_map, + df_lustre=darshan_file_obj.lustre_df, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df, + ) ######################################################################################################################################################################### From 58f4301842b872872f861395590ca898a7c24ff0 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 16 Apr 2025 15:48:17 -0600 Subject: [PATCH 23/43] refactor: Implement cached properties for max read and write offsets in Darshan data handling --- drishti/handlers/darshan_util.py | 20 ++++++++++++++++++-- drishti/handlers/handle_darshan.py | 11 ++++++++++- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 831713b..7b90609 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -243,8 +243,8 @@ class DarshanFile: exe: Optional[str] = None _modules: Optional[Iterable[str]] = None _name_records: Optional[Dict[int, str]] = None # Keys are uint64 - max_read_offset: Optional[int] = None - max_write_offset: Optional[int] = None + _max_read_offset: Optional[int] = None + _max_write_offset: Optional[int] = None total_files_stdio: Optional[int] = None total_files_posix: Optional[int] = None total_files_mpiio: Optional[int] = None @@ -570,3 +570,19 @@ def lustre_df(self) -> Optional[pd.DataFrame]: if "LUSTRE" not in self.modules: return None return pd.DataFrame(self.report.records["LUSTRE"].to_df()) + + @cached_property + def max_read_offset(self) -> int: + if self._max_read_offset is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._max_read_offset = posix_counters['POSIX_MAX_BYTE_READ'].max() + return self._max_read_offset + + @cached_property + def max_write_offset(self) -> int: + if self._max_write_offset is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._max_write_offset = posix_counters['POSIX_MAX_BYTE_WRITTEN'].max() + return self._max_write_offset \ No newline at end of file diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index aa803d9..de6d7ee 100644 --- a/drishti/handlers/handle_darshan.py 
+++ b/drishti/handlers/handle_darshan.py @@ -462,7 +462,16 @@ def handler(): max_read_offset = df['counters']['POSIX_MAX_BYTE_READ'].max() max_write_offset = df['counters']['POSIX_MAX_BYTE_WRITTEN'].max() - module.check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + # module.check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_traffic( + max_read_offset=darshan_file_obj.max_read_offset, + total_read_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "read"), + max_write_offset=darshan_file_obj.max_write_offset, + total_written_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "write"), + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df, + dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df, + ) ######################################################################################################################################################################### From d567b72e35f0068cf95f70a01ab1ebf236e5deee Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Fri, 18 Apr 2025 12:50:54 -0600 Subject: [PATCH 24/43] refactor: Add cached properties for POSIX read and write operation statistics in Darshan data handling --- drishti/handlers/darshan_util.py | 57 +++++++++++++++++++++++++++++- drishti/handlers/handle_darshan.py | 25 ++++++++++++- 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 7b90609..2654d93 100644 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -265,6 +265,13 @@ class DarshanFile: _mem_not_aligned: Optional[int] = None _file_not_aligned: Optional[int] = None + _posix_read_consecutive: Optional[int] = None + _posix_write_consecutive: Optional[int] = None + _posix_read_sequential: Optional[int] = None + _posix_write_sequential: Optional[int] = None + _posix_read_random: Optional[int] = None + _posix_write_random: Optional[int] = None + access_pattern: Optional[AccessPatternStats] = None # Use separate classes for shared operations @@ -585,4 +592,52 @@ def max_write_offset(self) -> int: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] self._max_write_offset = posix_counters['POSIX_MAX_BYTE_WRITTEN'].max() - return self._max_write_offset \ No newline at end of file + return self._max_write_offset + + @cached_property + def posix_read_consecutive(self) -> int: + if self._posix_read_consecutive is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_read_consecutive = posix_counters['POSIX_CONSEC_READS'].sum() + return self._posix_read_consecutive + + @cached_property + def posix_write_consecutive(self) -> int: + if self._posix_write_consecutive is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_write_consecutive = posix_counters['POSIX_CONSEC_WRITES'].sum() + return self._posix_write_consecutive + + @cached_property + def posix_read_sequential(self) -> int: + if self._posix_read_sequential is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_read_sequential = posix_counters['POSIX_SEQ_READS'].sum() - self.posix_read_consecutive + return 
self._posix_read_sequential + + @cached_property + def posix_write_sequential(self) -> int: + if self._posix_write_sequential is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_write_sequential = posix_counters['POSIX_SEQ_WRITES'].sum() - self.posix_write_consecutive + return self._posix_write_sequential + + @cached_property + def posix_read_random(self) -> int: + if self._posix_read_random is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_read_random = self.io_stats.get_module_ops(ModuleType.POSIX, "read") - self.posix_read_consecutive - self.posix_read_sequential + return self._posix_read_random + + @cached_property + def posix_write_random(self) -> int: + if self._posix_write_random is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_write_random = self.io_stats.get_module_ops(ModuleType.POSIX, "write") - self.posix_write_consecutive - self.posix_write_sequential + return self._posix_write_random \ No newline at end of file diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index de6d7ee..4c6bc6b 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -496,7 +496,30 @@ def handler(): write_random = total_writes - write_consecutive - write_sequential #print('WRITE Random: {} ({:.2f}%)'.format(write_random, write_random / total_writes * 100)) - module.check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + + assert read_consecutive == darshan_file_obj.posix_read_consecutive + assert read_sequential == darshan_file_obj.posix_read_sequential + assert read_random == darshan_file_obj.posix_read_random, f"{read_random} != {darshan_file_obj.posix_read_random}" + assert total_reads == darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"read"), f"{total_reads} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read")}" + assert write_consecutive == darshan_file_obj.posix_write_consecutive + assert write_sequential == darshan_file_obj.posix_write_sequential + assert write_random == darshan_file_obj.posix_write_random + assert total_writes == darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"write") + + # module.check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_random_operation( + read_consecutive=darshan_file_obj.posix_read_consecutive, + read_sequential=darshan_file_obj.posix_read_sequential, + read_random=darshan_file_obj.posix_read_random, + total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"read"), + write_consecutive=darshan_file_obj.posix_write_consecutive, + write_sequential=darshan_file_obj.posix_write_sequential, + write_random=darshan_file_obj.posix_write_random, + total_writes=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"write"), + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df, + dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df, + ) 
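The hunk above illustrates the migration pattern this patch series repeats: derived POSIX counters move behind cached properties on the DarshanFile object (as the @cached_property decorators in the diff show), and the legacy module.check_* calls are replaced by keyword-argument calls against those properties, with asserts verifying parity against the old locals. A minimal, self-contained sketch of the caching idea is below; CountersView and its column names are illustrative only and are not part of the Drishti or darshan APIs.

from functools import cached_property

import pandas as pd


class CountersView:
    """Toy stand-in for DarshanFile: parse once, memoize derived counters."""

    def __init__(self, counters: pd.DataFrame) -> None:
        self._counters = counters

    @cached_property
    def read_consecutive(self) -> int:
        # Computed on first access, then served from the per-instance cache.
        return int(self._counters["CONSEC_READS"].sum())

    @cached_property
    def read_sequential(self) -> int:
        # Sequential-but-not-consecutive reads, mirroring the POSIX_SEQ_READS logic above.
        return int(self._counters["SEQ_READS"].sum()) - self.read_consecutive


view = CountersView(pd.DataFrame({"CONSEC_READS": [3, 2], "SEQ_READS": [7, 4]}))
assert view.read_sequential == 6  # (7 + 4) - (3 + 2)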
######################################################################################################################################################################### From 97e049c74b8bcae9ab0bc54271b54bfb65a31a49 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sun, 27 Apr 2025 15:13:03 -0600 Subject: [PATCH 25/43] refactor: Add cached property for long metadata count in POSIX data handling --- drishti/handlers/darshan_util.py | 13 ++++++++++++- drishti/handlers/handle_darshan.py | 6 +++++- 2 files changed, 17 insertions(+), 2 deletions(-) mode change 100644 => 100755 drishti/handlers/darshan_util.py diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py old mode 100644 new mode 100755 index 2654d93..176c7ea --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -10,6 +10,7 @@ import pandas as pd from darshan import DarshanReport # type: ignore import drishti.includes.parser as parser +import drishti.includes.config as config class ModuleType(str, Enum): @@ -272,6 +273,8 @@ class DarshanFile: _posix_read_random: Optional[int] = None _posix_write_random: Optional[int] = None + _posix_long_metadata_count: Optional[int] = None + access_pattern: Optional[AccessPatternStats] = None # Use separate classes for shared operations @@ -640,4 +643,12 @@ def posix_write_random(self) -> int: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] self._posix_write_random = self.io_stats.get_module_ops(ModuleType.POSIX, "write") - self.posix_write_consecutive - self.posix_write_sequential - return self._posix_write_random \ No newline at end of file + return self._posix_write_random + + @cached_property + def posix_long_metadata_count(self) -> int: + if self._posix_long_metadata_count is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_long_metadata_rows = posix_df['fcounters'][(posix_df['fcounters']['POSIX_F_META_TIME'] > config.thresholds['metadata_time_rank'][0])] + self._posix_long_metadata_count = len(posix_long_metadata_rows) + return self._posix_long_metadata_count diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 4c6bc6b..579b9f9 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -530,6 +530,7 @@ def handler(): shared_files = shared_files.assign(id=lambda d: d['id'].astype(str)) if not shared_files.empty: + # TODO: This entire conditional total_shared_reads = shared_files['POSIX_READS'].sum() total_shared_reads_small = ( shared_files['POSIX_SIZE_READ_0_100'].sum() + @@ -571,7 +572,10 @@ def handler(): count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > thresholds['metadata_time_rank'][0])]) - module.check_long_metadata(count_long_metadata, modules) + assert darshan_file_obj.posix_long_metadata_count == count_long_metadata + assert darshan_file_obj.modules == modules.keys(), f"{darshan_file_obj.modules} != {modules.keys()}" + # module.check_long_metadata(count_long_metadata, modules) + module.check_long_metadata(count_long_metadata=darshan_file_obj.posix_long_metadata_count, modules=darshan_file_obj.modules) # We already have a single line for each shared-file access # To check for stragglers, we can check the difference between the From 88c1f54ed5b520ea5d8bd2a8c79365debfa42a4b Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sun, 27 Apr 2025 15:27:16 -0600 Subject: [PATCH 26/43] refactor: Add cached properties for shared read and write operations and stragglers count in 
Darshan data handling --- drishti/handlers/darshan_util.py | 62 +++++++++++++++++++++++++++++- drishti/handlers/handle_darshan.py | 20 +++++++++- 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 176c7ea..599405d 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -274,11 +274,12 @@ class DarshanFile: _posix_write_random: Optional[int] = None _posix_long_metadata_count: Optional[int] = None + _posix_stragglers_count: Optional[int] = None access_pattern: Optional[AccessPatternStats] = None # Use separate classes for shared operations - shared_ops: Optional[SharedOpsStats] = None + _shared_ops: Optional[SharedOpsStats] = None shared_small_ops: Optional[SharedSmallOpsStats] = None count_long_metadata: Optional[int] = None @@ -645,6 +646,36 @@ def posix_write_random(self) -> int: self._posix_write_random = self.io_stats.get_module_ops(ModuleType.POSIX, "write") - self.posix_write_consecutive - self.posix_write_sequential return self._posix_write_random + @property + def posix_shared_files_df(self) -> pd.DataFrame: + assert "POSIX" in self.modules, "Missing POSIX module" + posix_df = self.report.records[ModuleType.POSIX].to_df() + shared_files_df = posix_df['counters'].loc[(posix_df['counters']['rank'] == -1)] + shared_files_df = shared_files_df.assign(id=lambda d: d['id'].astype(str)) + return shared_files_df + + @cached_property + def posix_shared_reads(self) -> int: + if self._shared_ops is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._shared_ops = SharedOpsStats( + read=posix_counters["POSIX_SHARED_READS"].sum(), + write=posix_counters["POSIX_SHARED_WRITES"].sum(), + ) + return self._shared_ops.read + + @cached_property + def posix_shared_writes(self) -> int: + if self._shared_ops is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._shared_ops = SharedOpsStats( + read=posix_counters["POSIX_SHARED_READS"].sum(), + write=posix_counters["POSIX_SHARED_WRITES"].sum(), + ) + return self._shared_ops.write + @cached_property def posix_long_metadata_count(self) -> int: if self._posix_long_metadata_count is None: @@ -652,3 +683,32 @@ def posix_long_metadata_count(self) -> int: posix_long_metadata_rows = posix_df['fcounters'][(posix_df['fcounters']['POSIX_F_META_TIME'] > config.thresholds['metadata_time_rank'][0])] self._posix_long_metadata_count = len(posix_long_metadata_rows) return self._posix_long_metadata_count + + @property + def posix_stragglers_df(self) -> pd.DataFrame: + shared_files = self.posix_shared_files_df + + detected_files = [] + + for index, row in shared_files.iterrows(): + total_transfer_size = row['POSIX_BYTES_WRITTEN'] + row['POSIX_BYTES_READ'] + + if total_transfer_size and abs( + row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size > \ + config.thresholds['imbalance_stragglers'][0]: + # stragglers_count += 1 + + detected_files.append([ + row['id'], + abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size * 100 + ]) + + column_names = ['id', 'data_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + return detected_files + + @cached_property + def posix_stragglers_count(self) -> int: + if self._posix_stragglers_count is None: + self._posix_stragglers_count = len(self.posix_stragglers_df) + return self._posix_stragglers_count \ 
No newline at end of file diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 579b9f9..ec54012 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -566,6 +566,9 @@ def handler(): shared_files['POSIX_SIZE_WRITE_100K_1M'] ) + # module.check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map) + assert total_shared_reads == darshan_file_obj.posix_shared_reads + sys.exit(2) module.check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map) ######################################################################################################################################################################### @@ -603,7 +606,22 @@ def handler(): column_names = ['id', 'data_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - module.check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + assert stragglers_count == darshan_file_obj.posix_stragglers_count, f"{stragglers_count} != {darshan_file_obj.posix_stragglers_count}" + assert detected_files.equals(darshan_file_obj.posix_stragglers_df), f"{detected_files} != {darshan_file_obj.posix_stragglers_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + assert dxt_posix == darshan_file_obj.dxt_posix_df, f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}" + assert dxt_posix_read_data == darshan_file_obj.dxt_posix_read_df, f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}" + assert dxt_posix_write_data == darshan_file_obj.dxt_posix_write_df, f"{dxt_posix_write_data} != {darshan_file_obj.dxt_posix_write_df}" + # module.check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_shared_data_imblance( + stragglers_count=darshan_file_obj.posix_stragglers_count, + detected_files=darshan_file_obj.posix_stragglers_df, + file_map=darshan_file_obj.file_map, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data = darshan_file_obj.dxt_posix_read_df, + dxt_posix_write_data = darshan_file_obj.dxt_posix_write_df + ) + sys.exit(2) # POSIX_F_FASTEST_RANK_TIME # POSIX_F_SLOWEST_RANK_TIME From 489ee4d54cf8acb0f208f0905a3357bd45aa8acc Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sun, 27 Apr 2025 15:28:39 -0600 Subject: [PATCH 27/43] fmt: `ruff format drishti/handlers/darshan_util.py` --- drishti/handlers/darshan_util.py | 204 ++++++++++++++++++++++--------- 1 file changed, 145 insertions(+), 59 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 599405d..3cfb5d4 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -31,11 +31,15 @@ class TimeSpan: def __post_init__(self): if self.start > self.end: - raise ValueError(f"TimeSpan start ({self.start}) must be <= end ({self.end})") + raise ValueError( + f"TimeSpan start ({self.start}) must be <= end ({self.end})" + ) + @dataclass class IOCounter: """Base class for I/O metrics with read/write counts""" + read: Final[int] = field(init=True) write: Final[int] = field(init=True) _total: Optional[int] = None @@ -47,20 +51,25 @@ def total(self) -> int: return self._total return self.read + self.write + @dataclass class IOSize(IOCounter): """Represents I/O size 
statistics in bytes""" + pass + @dataclass class IOOperation(IOCounter): """Represents I/O operation count statistics""" + pass @dataclass class IOStatistics: """Tracks both I/O sizes and operations by module with aggregated metrics""" + # Use dicts to store module-specific data sizes: Dict[ModuleType, IOSize] = field(init=True) operations: Dict[ModuleType, IOOperation] = field(init=True) @@ -69,7 +78,9 @@ def __post_init__(self): # Initialize standard modules if not present for module in ModuleType: # Ensure that the module is either in both sizes and operations or in neither - assert (module in self.sizes) == (module in self.operations), f"Module {module} should be in both sizes and operations or in neither" + assert (module in self.sizes) == (module in self.operations), ( + f"Module {module} should be in both sizes and operations or in neither" + ) if module not in self.sizes: self.sizes[module] = IOSize(read=0, write=0) @@ -134,11 +145,15 @@ def total_ops(self) -> int: return self.reads + self.writes # Methods to get stats for specific modules - def get_module_size(self, module: Optional[Union[ModuleType, str]] = None, data_type: Optional[str] = "total") -> int: + def get_module_size( + self, + module: Optional[Union[ModuleType, str]] = None, + data_type: Optional[str] = "total", + ) -> int: """Get size statistics for a specific module or all modules if not specified.""" if module is None and data_type is None: raise ValueError("Both module and data_type cannot be None") - + if module: if module not in self.sizes: raise ValueError(f"Module {module} not found in sizes") @@ -157,11 +172,15 @@ def get_module_size(self, module: Optional[Union[ModuleType, str]] = None, data_ else: # data_type is None or "total" return self.total_bytes - def get_module_ops(self, module: Optional[Union[ModuleType, str]] = None, data_type: Optional[str] = "total") -> int: + def get_module_ops( + self, + module: Optional[Union[ModuleType, str]] = None, + data_type: Optional[str] = "total", + ) -> int: """Get operation statistics for a specific module or all modules if not specified.""" if module is None and data_type is None: raise ValueError("Both module and data_type cannot be None") - + if module: if module not in self.operations: raise ValueError(f"Module {module} not found in operations") @@ -180,57 +199,84 @@ def get_module_ops(self, module: Optional[Union[ModuleType, str]] = None, data_t else: # data_type is None or "total" return self.total_ops + @dataclass class SmallIOStats(IOCounter): """Statistics for small I/O operations""" + pass # Inherits read/write/total from IOCounter + @dataclass class SharedOpsStats(IOCounter): """Statistics for shared file operations""" + pass # Inherits read/write/total from IOCounter + @dataclass class SharedSmallOpsStats(IOCounter): """Statistics for small shared file operations""" + pass # Inherits read/write/total from IOCounter + @dataclass class ConsecutiveIOStats(IOCounter): """Statistics for consecutive I/O operations""" + pass # Inherits read/write/total from IOCounter + @dataclass class SequentialIOStats(IOCounter): """Statistics for sequential I/O operations""" + pass # Inherits read/write/total from IOCounter + @dataclass class RandomIOStats(IOCounter): """Statistics for random I/O operations""" + pass # Inherits read/write/total from IOCounter + @dataclass class MPIIONonBlockingStats(IOCounter): """Statistics for non-blocking MPI I/O operations""" + pass + @dataclass class MPICollectiveIOStats(IOCounter): """Statistics for collective MPI I/O operations""" + 
pass + @dataclass class MPIIndependentIOStats(IOCounter): """Statistics for independent MPI I/O operations""" + pass + @dataclass class AccessPatternStats: """Statistics for I/O access patterns by pattern type""" - consecutive: ConsecutiveIOStats = field(default_factory=lambda: ConsecutiveIOStats(read=0, write=0), init=True) - sequential: SequentialIOStats = field(default_factory=lambda: SequentialIOStats(read=0, write=0), init=True) - random: RandomIOStats = field(default_factory=lambda: RandomIOStats(read=0, write=0), init=True) + + consecutive: ConsecutiveIOStats = field( + default_factory=lambda: ConsecutiveIOStats(read=0, write=0), init=True + ) + sequential: SequentialIOStats = field( + default_factory=lambda: SequentialIOStats(read=0, write=0), init=True + ) + random: RandomIOStats = field( + default_factory=lambda: RandomIOStats(read=0, write=0), init=True + ) + @dataclass class DarshanFile: @@ -250,13 +296,13 @@ class DarshanFile: total_files_posix: Optional[int] = None total_files_mpiio: Optional[int] = None files: Optional[Dict[str, str]] = None - + # Replace individual I/O stats with IOStatistics class _io_stats: Optional[IOStatistics] = None # File counts total_files: Optional[int] = 0 - + # Additional I/O statistics organized by category _posix_small_io: Optional[SmallIOStats] = None @@ -304,7 +350,9 @@ class DarshanFile: detected_files_mpi_coll_writes: Optional[pd.DataFrame] = None imbalance_count_posix_shared_time: Optional[int] = None - posix_shared_time_imbalance_detected_files: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]] = None + posix_shared_time_imbalance_detected_files: Optional[ + Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame] + ] = None @cached_property def report(self) -> DarshanReport: @@ -527,36 +575,45 @@ def dxt_posix_write_df(self) -> Optional[pd.DataFrame]: write_start_time = [] write_operation = [] - for r in zip(df['rank'], df['read_segments'], df['write_segments'], df['id']): + for r in zip(df["rank"], df["read_segments"], df["write_segments"], df["id"]): if not r[2].empty: - write_id.append([r[3]] * len((r[2]['length'].to_list()))) - write_rank.append([r[0]] * len((r[2]['length'].to_list()))) - write_length.append(r[2]['length'].to_list()) - write_end_time.append(r[2]['end_time'].to_list()) - write_start_time.append(r[2]['start_time'].to_list()) - write_operation.append(['write'] * len((r[2]['length'].to_list()))) - write_offsets.append(r[2]['offset'].to_list()) - + write_id.append([r[3]] * len((r[2]["length"].to_list()))) + write_rank.append([r[0]] * len((r[2]["length"].to_list()))) + write_length.append(r[2]["length"].to_list()) + write_end_time.append(r[2]["end_time"].to_list()) + write_start_time.append(r[2]["start_time"].to_list()) + write_operation.append(["write"] * len((r[2]["length"].to_list()))) + write_offsets.append(r[2]["offset"].to_list()) write_id = [element for nestedlist in write_id for element in nestedlist] write_rank = [element for nestedlist in write_rank for element in nestedlist] - write_length = [element for nestedlist in write_length for element in nestedlist] - write_offsets = [element for nestedlist in write_offsets for element in nestedlist] - write_end_time = [element for nestedlist in write_end_time for element in nestedlist] - write_operation = [element for nestedlist in write_operation for element in nestedlist] - write_start_time = [element for nestedlist in write_start_time for element in nestedlist] - + write_length = [ + element for nestedlist in write_length for element in nestedlist + ] + write_offsets 
= [ + element for nestedlist in write_offsets for element in nestedlist + ] + write_end_time = [ + element for nestedlist in write_end_time for element in nestedlist + ] + write_operation = [ + element for nestedlist in write_operation for element in nestedlist + ] + write_start_time = [ + element for nestedlist in write_start_time for element in nestedlist + ] dxt_posix_write_data = pd.DataFrame( { - 'id': write_id, - 'rank': write_rank, - 'length': write_length, - 'end_time': write_end_time, - 'start_time': write_start_time, - 'operation': write_operation, - 'offsets': write_offsets, - }) + "id": write_id, + "rank": write_rank, + "length": write_length, + "end_time": write_end_time, + "start_time": write_start_time, + "operation": write_operation, + "offsets": write_offsets, + } + ) return pd.DataFrame(dxt_posix_write_data) @@ -565,7 +622,7 @@ def mem_not_aligned(self) -> int: if self._mem_not_aligned is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._mem_not_aligned = posix_counters['POSIX_MEM_NOT_ALIGNED'].sum() + self._mem_not_aligned = posix_counters["POSIX_MEM_NOT_ALIGNED"].sum() return self._mem_not_aligned @cached_property @@ -573,7 +630,7 @@ def file_not_aligned(self) -> int: if self._file_not_aligned is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._file_not_aligned = posix_counters['POSIX_FILE_NOT_ALIGNED'].sum() + self._file_not_aligned = posix_counters["POSIX_FILE_NOT_ALIGNED"].sum() return self._file_not_aligned @property @@ -587,7 +644,7 @@ def max_read_offset(self) -> int: if self._max_read_offset is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._max_read_offset = posix_counters['POSIX_MAX_BYTE_READ'].max() + self._max_read_offset = posix_counters["POSIX_MAX_BYTE_READ"].max() return self._max_read_offset @cached_property @@ -595,7 +652,7 @@ def max_write_offset(self) -> int: if self._max_write_offset is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._max_write_offset = posix_counters['POSIX_MAX_BYTE_WRITTEN'].max() + self._max_write_offset = posix_counters["POSIX_MAX_BYTE_WRITTEN"].max() return self._max_write_offset @cached_property @@ -603,7 +660,7 @@ def posix_read_consecutive(self) -> int: if self._posix_read_consecutive is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._posix_read_consecutive = posix_counters['POSIX_CONSEC_READS'].sum() + self._posix_read_consecutive = posix_counters["POSIX_CONSEC_READS"].sum() return self._posix_read_consecutive @cached_property @@ -611,7 +668,7 @@ def posix_write_consecutive(self) -> int: if self._posix_write_consecutive is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._posix_write_consecutive = posix_counters['POSIX_CONSEC_WRITES'].sum() + self._posix_write_consecutive = posix_counters["POSIX_CONSEC_WRITES"].sum() return self._posix_write_consecutive @cached_property @@ -619,7 +676,9 @@ def posix_read_sequential(self) -> int: if self._posix_read_sequential is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._posix_read_sequential = posix_counters['POSIX_SEQ_READS'].sum() - self.posix_read_consecutive + self._posix_read_sequential = ( + posix_counters["POSIX_SEQ_READS"].sum() - self.posix_read_consecutive + ) 
return self._posix_read_sequential @cached_property @@ -627,7 +686,9 @@ def posix_write_sequential(self) -> int: if self._posix_write_sequential is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._posix_write_sequential = posix_counters['POSIX_SEQ_WRITES'].sum() - self.posix_write_consecutive + self._posix_write_sequential = ( + posix_counters["POSIX_SEQ_WRITES"].sum() - self.posix_write_consecutive + ) return self._posix_write_sequential @cached_property @@ -635,7 +696,11 @@ def posix_read_random(self) -> int: if self._posix_read_random is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._posix_read_random = self.io_stats.get_module_ops(ModuleType.POSIX, "read") - self.posix_read_consecutive - self.posix_read_sequential + self._posix_read_random = ( + self.io_stats.get_module_ops(ModuleType.POSIX, "read") + - self.posix_read_consecutive + - self.posix_read_sequential + ) return self._posix_read_random @cached_property @@ -643,15 +708,19 @@ def posix_write_random(self) -> int: if self._posix_write_random is None: posix_df = self.report.records[ModuleType.POSIX].to_df() posix_counters = posix_df["counters"] - self._posix_write_random = self.io_stats.get_module_ops(ModuleType.POSIX, "write") - self.posix_write_consecutive - self.posix_write_sequential + self._posix_write_random = ( + self.io_stats.get_module_ops(ModuleType.POSIX, "write") + - self.posix_write_consecutive + - self.posix_write_sequential + ) return self._posix_write_random @property def posix_shared_files_df(self) -> pd.DataFrame: assert "POSIX" in self.modules, "Missing POSIX module" posix_df = self.report.records[ModuleType.POSIX].to_df() - shared_files_df = posix_df['counters'].loc[(posix_df['counters']['rank'] == -1)] - shared_files_df = shared_files_df.assign(id=lambda d: d['id'].astype(str)) + shared_files_df = posix_df["counters"].loc[(posix_df["counters"]["rank"] == -1)] + shared_files_df = shared_files_df.assign(id=lambda d: d["id"].astype(str)) return shared_files_df @cached_property @@ -680,7 +749,12 @@ def posix_shared_writes(self) -> int: def posix_long_metadata_count(self) -> int: if self._posix_long_metadata_count is None: posix_df = self.report.records[ModuleType.POSIX].to_df() - posix_long_metadata_rows = posix_df['fcounters'][(posix_df['fcounters']['POSIX_F_META_TIME'] > config.thresholds['metadata_time_rank'][0])] + posix_long_metadata_rows = posix_df["fcounters"][ + ( + posix_df["fcounters"]["POSIX_F_META_TIME"] + > config.thresholds["metadata_time_rank"][0] + ) + ] self._posix_long_metadata_count = len(posix_long_metadata_rows) return self._posix_long_metadata_count @@ -691,24 +765,36 @@ def posix_stragglers_df(self) -> pd.DataFrame: detected_files = [] for index, row in shared_files.iterrows(): - total_transfer_size = row['POSIX_BYTES_WRITTEN'] + row['POSIX_BYTES_READ'] + total_transfer_size = row["POSIX_BYTES_WRITTEN"] + row["POSIX_BYTES_READ"] - if total_transfer_size and abs( - row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size > \ - config.thresholds['imbalance_stragglers'][0]: + if ( + total_transfer_size + and abs( + row["POSIX_SLOWEST_RANK_BYTES"] - row["POSIX_FASTEST_RANK_BYTES"] + ) + / total_transfer_size + > config.thresholds["imbalance_stragglers"][0] + ): # stragglers_count += 1 - detected_files.append([ - row['id'], - abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size * 100 - ]) + 
detected_files.append( + [ + row["id"], + abs( + row["POSIX_SLOWEST_RANK_BYTES"] + - row["POSIX_FASTEST_RANK_BYTES"] + ) + / total_transfer_size + * 100, + ] + ) - column_names = ['id', 'data_imbalance'] + column_names = ["id", "data_imbalance"] detected_files = pd.DataFrame(detected_files, columns=column_names) - return detected_files + return detected_files @cached_property def posix_stragglers_count(self) -> int: if self._posix_stragglers_count is None: self._posix_stragglers_count = len(self.posix_stragglers_df) - return self._posix_stragglers_count \ No newline at end of file + return self._posix_stragglers_count From 719693edcc3607eda625f35d884d00cd63aeb2e2 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sun, 27 Apr 2025 15:40:25 -0600 Subject: [PATCH 28/43] hotfix: Rename posix_stragglers to posix_data_stragglers --- drishti/handlers/darshan_util.py | 12 ++++++------ drishti/handlers/handle_darshan.py | 9 ++++----- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 3cfb5d4..ee71997 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -320,7 +320,7 @@ class DarshanFile: _posix_write_random: Optional[int] = None _posix_long_metadata_count: Optional[int] = None - _posix_stragglers_count: Optional[int] = None + _posix_data_stragglers_count: Optional[int] = None access_pattern: Optional[AccessPatternStats] = None @@ -759,7 +759,7 @@ def posix_long_metadata_count(self) -> int: return self._posix_long_metadata_count @property - def posix_stragglers_df(self) -> pd.DataFrame: + def posix_data_stragglers_df(self) -> pd.DataFrame: shared_files = self.posix_shared_files_df detected_files = [] @@ -794,7 +794,7 @@ def posix_stragglers_df(self) -> pd.DataFrame: return detected_files @cached_property - def posix_stragglers_count(self) -> int: - if self._posix_stragglers_count is None: - self._posix_stragglers_count = len(self.posix_stragglers_df) - return self._posix_stragglers_count + def posix_data_stragglers_count(self) -> int: + if self._posix_data_stragglers_count is None: + self._posix_data_stragglers_count = len(self.posix_data_stragglers_df) + return self._posix_data_stragglers_count diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index ec54012..67deb1b 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -606,22 +606,21 @@ def handler(): column_names = ['id', 'data_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - assert stragglers_count == darshan_file_obj.posix_stragglers_count, f"{stragglers_count} != {darshan_file_obj.posix_stragglers_count}" - assert detected_files.equals(darshan_file_obj.posix_stragglers_df), f"{detected_files} != {darshan_file_obj.posix_stragglers_df}" + assert stragglers_count == darshan_file_obj.posix_data_stragglers_count, f"{stragglers_count} != {darshan_file_obj.posix_data_stragglers_count}" + assert detected_files.equals(darshan_file_obj.posix_data_stragglers_df), f"{detected_files} != {darshan_file_obj.posix_data_stragglers_df}" assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" assert dxt_posix == darshan_file_obj.dxt_posix_df, f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}" assert dxt_posix_read_data == darshan_file_obj.dxt_posix_read_df, f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}" assert dxt_posix_write_data == darshan_file_obj.dxt_posix_write_df, 
f"{dxt_posix_write_data} != {darshan_file_obj.dxt_posix_write_df}" # module.check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) module.check_shared_data_imblance( - stragglers_count=darshan_file_obj.posix_stragglers_count, - detected_files=darshan_file_obj.posix_stragglers_df, + stragglers_count=darshan_file_obj.posix_data_stragglers_count, + detected_files=darshan_file_obj.posix_data_stragglers_df, file_map=darshan_file_obj.file_map, dxt_posix=darshan_file_obj.dxt_posix_df, dxt_posix_read_data = darshan_file_obj.dxt_posix_read_df, dxt_posix_write_data = darshan_file_obj.dxt_posix_write_df ) - sys.exit(2) # POSIX_F_FASTEST_RANK_TIME # POSIX_F_SLOWEST_RANK_TIME From 668b1e123171596ff525a2d62f1fb7e6c2d126da Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sun, 27 Apr 2025 15:50:24 -0600 Subject: [PATCH 29/43] refactor: Add cached properties for POSIX time stragglers count and DataFrame --- drishti/handlers/darshan_util.py | 39 ++++++++++++++++++++++++++++++ drishti/handlers/handle_darshan.py | 13 +++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index ee71997..bca2800 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -321,6 +321,7 @@ class DarshanFile: _posix_long_metadata_count: Optional[int] = None _posix_data_stragglers_count: Optional[int] = None + _posix_time_stragglers_count: Optional[int] = None access_pattern: Optional[AccessPatternStats] = None @@ -798,3 +799,41 @@ def posix_data_stragglers_count(self) -> int: if self._posix_data_stragglers_count is None: self._posix_data_stragglers_count = len(self.posix_data_stragglers_df) return self._posix_data_stragglers_count + + @property + def posix_time_stragglers_df(self) -> pd.DataFrame: + df = self.report.records[ModuleType.POSIX].to_df() + + shared_files_times = df['fcounters'].loc[(df['fcounters']['rank'] == -1)] + + # Get the files responsible + detected_files = [] + + # stragglers_count = 0 + # stragglers_imbalance = {} + + shared_files_times = shared_files_times.assign(id=lambda d: d['id'].astype(str)) + + for index, row in shared_files_times.iterrows(): + total_transfer_time = row['POSIX_F_WRITE_TIME'] + row['POSIX_F_READ_TIME'] + row['POSIX_F_META_TIME'] + + if total_transfer_time and abs( + row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > \ + config.thresholds['imbalance_stragglers'][0]: + # stragglers_count += 1 + + detected_files.append([ + row['id'], + abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time * 100 + ]) + + column_names = ['id', 'time_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files + + @cached_property + def posix_time_stragglers_count(self) -> int: + if self._posix_time_stragglers_count is None: + self._posix_time_stragglers_count = len(self.posix_time_stragglers_df) + return self._posix_time_stragglers_count diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 67deb1b..e036e88 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -648,7 +648,18 @@ def handler(): column_names = ['id', 'time_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - module.check_shared_time_imbalance(stragglers_count, detected_files, file_map) + + assert stragglers_count == 
darshan_file_obj.posix_time_stragglers_count, f"{stragglers_count} != {darshan_file_obj.posix_time_stragglers_count}" + assert detected_files.equals(darshan_file_obj.posix_time_stragglers_df), f"{detected_files} != {darshan_file_obj.posix_time_stragglers_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + + # module.check_shared_time_imbalance(stragglers_count, detected_files, file_map) + module.check_shared_time_imbalance( + stragglers_count=darshan_file_obj.posix_time_stragglers_count, + detected_files=darshan_file_obj.posix_time_stragglers_df, + file_map=darshan_file_obj.file_map, + ) + sys.exit(2) aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] From f7fe2c41c12889c696259c1f25630150cd30dedd Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sun, 27 Apr 2025 15:59:22 -0600 Subject: [PATCH 30/43] refactor: Add cached property for write imbalance count and corresponding DataFrame --- drishti/handlers/darshan_util.py | 43 ++++++++++++++++++++++++++++++ drishti/handlers/handle_darshan.py | 19 +++++++++++-- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index bca2800..2f9a2ad 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -322,6 +322,7 @@ class DarshanFile: _posix_long_metadata_count: Optional[int] = None _posix_data_stragglers_count: Optional[int] = None _posix_time_stragglers_count: Optional[int] = None + _posix_write_imbalance_count: Optional[int] = None access_pattern: Optional[AccessPatternStats] = None @@ -837,3 +838,45 @@ def posix_time_stragglers_count(self) -> int: if self._posix_time_stragglers_count is None: self._posix_time_stragglers_count = len(self.posix_time_stragglers_df) return self._posix_time_stragglers_count + + @property + def posix_write_imbalance_df(self) -> pd.DataFrame: + df = self.report.records[ModuleType.POSIX].to_df() + + aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ + ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] + ].groupby('id', as_index=False).agg({ + 'rank': 'nunique', + 'POSIX_BYTES_WRITTEN': ['sum', 'min', 'max'], + 'POSIX_BYTES_READ': ['sum', 'min', 'max'] + }) + + aggregated.columns = list(map('_'.join, aggregated.columns.values)) + + aggregated = aggregated.assign(id=lambda d: d['id_'].astype(str)) + + # Get the files responsible + imbalance_count = 0 + + detected_files = [] + + for index, row in aggregated.iterrows(): + if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / \ + row['POSIX_BYTES_WRITTEN_max'] > config.thresholds['imbalance_size'][0]: + imbalance_count += 1 + + detected_files.append([ + row['id'], abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row[ + 'POSIX_BYTES_WRITTEN_max'] * 100 + ]) + + column_names = ['id', 'write_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files + + @cached_property + def posix_write_imbalance_count(self) -> int: + if self._posix_write_imbalance_count is None: + self._posix_write_imbalance_count = len(self.posix_write_imbalance_df) + return self._posix_write_imbalance_count diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index e036e88..57a1e03 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -659,7 +659,6 @@ def handler(): 
detected_files=darshan_file_obj.posix_time_stragglers_df, file_map=darshan_file_obj.file_map, ) - sys.exit(2) aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] @@ -688,7 +687,23 @@ def handler(): column_names = ['id', 'write_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - module.check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_write_data) + + assert imbalance_count == darshan_file_obj.posix_write_imbalance_count, f"{imbalance_count} != {darshan_file_obj.posix_write_imbalance_count}" + assert detected_files.equals(darshan_file_obj.posix_write_imbalance_df), f"{detected_files} != {darshan_file_obj.posix_write_imbalance_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + assert dxt_posix == darshan_file_obj.dxt_posix_df, f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}" + assert dxt_posix_read_data == darshan_file_obj.dxt_posix_read_df, f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}" + assert dxt_posix_write_data == darshan_file_obj.dxt_posix_write_df, f"{dxt_posix_write_data} != {darshan_file_obj.dxt_posix_write_df}" + + # module.check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_write_data) + module.check_individual_write_imbalance( + imbalance_count=darshan_file_obj.posix_write_imbalance_count, + detected_files=darshan_file_obj.posix_write_imbalance_df, + file_map=darshan_file_obj.file_map, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df + ) + sys.exit(2) imbalance_count = 0 From 90fcf227326b9338164ba99283e5e11c443e6941 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 28 Apr 2025 14:22:10 -0600 Subject: [PATCH 31/43] refactor: Add cached property for read imbalance count and corresponding DataFrame --- drishti/handlers/darshan_util.py | 43 ++++++++++++++++++++++++++++++ drishti/handlers/handle_darshan.py | 18 +++++++++++-- 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 2f9a2ad..64075ee 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -323,6 +323,7 @@ class DarshanFile: _posix_data_stragglers_count: Optional[int] = None _posix_time_stragglers_count: Optional[int] = None _posix_write_imbalance_count: Optional[int] = None + _posix_read_imbalance_count: Optional[int] = None access_pattern: Optional[AccessPatternStats] = None @@ -880,3 +881,45 @@ def posix_write_imbalance_count(self) -> int: if self._posix_write_imbalance_count is None: self._posix_write_imbalance_count = len(self.posix_write_imbalance_df) return self._posix_write_imbalance_count + + @property + def posix_read_imbalance_df(self) -> pd.DataFrame: + df = self.report.records[ModuleType.POSIX].to_df() + + aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ + ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] + ].groupby('id', as_index=False).agg({ + 'rank': 'nunique', + 'POSIX_BYTES_WRITTEN': ['sum', 'min', 'max'], + 'POSIX_BYTES_READ': ['sum', 'min', 'max'] + }) + + aggregated.columns = list(map('_'.join, aggregated.columns.values)) + + aggregated = aggregated.assign(id=lambda d: d['id_'].astype(str)) + + + imbalance_count = 0 + + detected_files = [] + + for index, row in aggregated.iterrows(): + if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - 
row['POSIX_BYTES_READ_min']) / row[ + 'POSIX_BYTES_READ_max'] > config.thresholds['imbalance_size'][0]: + imbalance_count += 1 + + detected_files.append([ + row['id'], + abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] * 100 + ]) + + column_names = ['id', 'read_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files + + @cached_property + def posix_read_imbalance_count(self) -> int: + if self._posix_read_imbalance_count is None: + self._posix_read_imbalance_count = len(self.posix_read_imbalance_df) + return self._posix_read_imbalance_count \ No newline at end of file diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 57a1e03..b576781 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -703,7 +703,6 @@ def handler(): dxt_posix=darshan_file_obj.dxt_posix_df, dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df ) - sys.exit(2) imbalance_count = 0 @@ -719,7 +718,22 @@ def handler(): column_names = ['id', 'read_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - module.check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_read_data) + + assert imbalance_count == darshan_file_obj.posix_read_imbalance_count, f"{imbalance_count} != {darshan_file_obj.posix_read_imbalance_count}" + assert detected_files.equals(darshan_file_obj.posix_read_imbalance_df), f"{detected_files} != {darshan_file_obj.posix_read_imbalance_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + assert dxt_posix == darshan_file_obj.dxt_posix_df, f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}" + assert dxt_posix_read_data == darshan_file_obj.dxt_posix_read_df, f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}" + + # module.check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_read_data) + module.check_individual_read_imbalance( + imbalance_count=darshan_file_obj.posix_read_imbalance_count, + detected_files=darshan_file_obj.posix_read_imbalance_df, + file_map=darshan_file_obj.file_map, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df + ) + sys.exit(2) ######################################################################################################################################################################### From e8bece15aca7f25611d17cbe9fb866c324e7f9d9 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 28 Apr 2025 15:06:08 -0600 Subject: [PATCH 32/43] hotfix: Store MPI-IO ops in `DarshanFile.io_stats` --- drishti/handlers/darshan_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 64075ee..34c6658 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -420,8 +420,8 @@ def io_stats(self) -> IOStatistics: read=mpiio_read_size, write=mpiio_write_size ) - mpiio_read_ops = -1 - mpiio_write_ops = -1 + mpiio_read_ops = counters['MPIIO_INDEP_READS'].sum() + counters['MPIIO_COLL_READS'].sum() + mpiio_write_ops = counters['MPIIO_INDEP_WRITES'].sum() + counters['MPIIO_COLL_WRITES'].sum() ops[ModuleType.MPIIO] = IOOperation( read=mpiio_read_ops, write=mpiio_write_ops ) From 411ad9f3ede92f1ee3c6ecc0cecb3805691873bd Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 28 Apr 2025 15:38:55 -0600 Subject: 
[PATCH 33/43] refactor: Add cached properties for MPI collective and independent I/O operations --- drishti/handlers/darshan_util.py | 63 ++++++++++++++++++++++++++++-- drishti/handlers/handle_darshan.py | 23 ++++++++++- 2 files changed, 81 insertions(+), 5 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 34c6658..88e1aeb 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -346,8 +346,8 @@ class DarshanFile: aggregated: Optional[pd.DataFrame] = None - mpi_coll_ops: Optional[MPICollectiveIOStats] = None - mpi_indep_ops: Optional[MPIIndependentIOStats] = None + _mpi_coll_ops: Optional[MPICollectiveIOStats] = None + _mpi_indep_ops: Optional[MPIIndependentIOStats] = None detected_files_mpi_coll_reads: Optional[pd.DataFrame] = None detected_files_mpi_coll_writes: Optional[pd.DataFrame] = None @@ -922,4 +922,61 @@ def posix_read_imbalance_df(self) -> pd.DataFrame: def posix_read_imbalance_count(self) -> int: if self._posix_read_imbalance_count is None: self._posix_read_imbalance_count = len(self.posix_read_imbalance_df) - return self._posix_read_imbalance_count \ No newline at end of file + return self._posix_read_imbalance_count + + @cached_property + def mpi_coll_ops(self) -> MPICollectiveIOStats: + if self._mpi_coll_ops is None: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + mpi_coll_reads = mpi_df['counters']['MPIIO_COLL_READS'].sum() + mpiio_coll_writes = mpi_df['counters']['MPIIO_COLL_WRITES'].sum() + self._mpi_coll_ops = MPICollectiveIOStats(read=mpi_coll_reads, write=mpiio_coll_writes) + return self._mpi_coll_ops + + @cached_property + def mpi_indep_ops(self) -> MPIIndependentIOStats: + if self._mpi_indep_ops is None: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + mpi_indep_reads = mpi_df['counters']['MPIIO_INDEP_READS'].sum() + mpi_indep_writes = mpi_df['counters']['MPIIO_INDEP_WRITES'].sum() + self._mpi_indep_ops = MPIIndependentIOStats(read=mpi_indep_reads, write=mpi_indep_writes) + return self._mpi_indep_ops + + @property + def mpi_read_df(self) -> pd.DataFrame: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + counters = mpi_df['counters'] + mpi_coll_reads = self.mpi_coll_ops.read + mpi_total_reads = self.io_stats.get_module_ops(ModuleType.MPIIO, "read") + + detected_files = [] + + if mpi_coll_reads == 0 and mpi_total_reads and mpi_total_reads > \ + config.thresholds['collective_operations_absolute'][0]: + files = pd.DataFrame(counters.groupby('id').sum()).reset_index() + for index, row in counters.iterrows(): + if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and + row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + config.thresholds['collective_operations'][0] and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + config.thresholds['collective_operations_absolute'][0]): + detected_files.append([ + row['id'], row['MPIIO_INDEP_READS'], + row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 + ]) + + column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files + + @property + def dxt_mpi_df(self) -> Optional[pd.DataFrame]: + if not parser.args.backtrace: + return None + if "DXT_MPIIO" not in self.modules: + return None + + dxt_mpiio = self.report.records["DXT_MPIIO"].to_df() + dxt_mpiio = pd.DataFrame(dxt_mpiio) + return dxt_mpiio \ No newline at end of file diff --git 
a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index b576781..f61e8ca 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -733,7 +733,6 @@ def handler(): dxt_posix=darshan_file_obj.dxt_posix_df, dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df ) - sys.exit(2) ######################################################################################################################################################################### @@ -768,7 +767,27 @@ def handler(): column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] detected_files = pd.DataFrame(detected_files, columns=column_names) - module.check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio) + assert mpiio_coll_reads == darshan_file_obj.mpi_coll_ops.read, f"{mpiio_coll_reads} != {darshan_file_obj.mpi_coll_ops.read}" + assert mpiio_indep_reads == darshan_file_obj.mpi_indep_ops.read, f"{mpiio_indep_reads} != {darshan_file_obj.mpi_indep_ops.read}" + assert total_mpiio_read_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"), f"{total_mpiio_read_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read")}" + assert detected_files.equals(darshan_file_obj.mpi_read_df), f"{detected_files} != {darshan_file_obj.mpi_read_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + if dxt_mpiio is None: + assert dxt_mpiio is None, f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" + assert darshan_file_obj.dxt_mpi_df is None, f"{darshan_file_obj.dxt_mpi_df} != {dxt_mpiio}" + else: + assert dxt_mpiio.equals(darshan_file_obj.dxt_mpi_df), f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" + + # module.check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio) + module.check_mpi_collective_read_operation( + mpiio_coll_reads=darshan_file_obj.mpi_coll_ops.read, + mpiio_indep_reads=darshan_file_obj.mpi_indep_ops.read, + total_mpiio_read_operations=darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"), + detected_files=darshan_file_obj.mpi_read_df, + file_map=darshan_file_obj.file_map, + dxt_mpiio=darshan_file_obj.dxt_mpi_df + ) + sys.exit(2) df_mpiio_collective_writes = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)] From 232f049281164e7ebe4c349c075835b7fbfa8723 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 28 Apr 2025 17:47:08 -0600 Subject: [PATCH 34/43] refactor: Add mpi_read_df property and enhance assertions for independent writes --- drishti/handlers/darshan_util.py | 32 ++++++++++++++++++++++++++++- drishti/handlers/handle_darshan.py | 33 +++++++++++++++++++++++++++--- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 88e1aeb..aad3dff 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -979,4 +979,34 @@ def dxt_mpi_df(self) -> Optional[pd.DataFrame]: dxt_mpiio = self.report.records["DXT_MPIIO"].to_df() dxt_mpiio = pd.DataFrame(dxt_mpiio) - return dxt_mpiio \ No newline at end of file + return dxt_mpiio + + @property + def mpi_write_df(self) -> pd.DataFrame: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + counters = mpi_df['counters'] + + mpi_coll_writes = self.mpi_coll_ops.write + total_mpiio_write_operations = 
self.io_stats.get_module_ops(ModuleType.MPIIO, "write") + + + detected_files = [] + if mpi_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > \ + config.thresholds['collective_operations_absolute'][0]: + files = pd.DataFrame(counters.groupby('id').sum()).reset_index() + + for index, row in counters.iterrows(): + if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and + row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + config.thresholds['collective_operations'][0] and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + config.thresholds['collective_operations_absolute'][0]): + detected_files.append([ + row['id'], row['MPIIO_INDEP_WRITES'], + row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 + ]) + + column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files \ No newline at end of file diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index f61e8ca..5381d4f 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -770,7 +770,11 @@ def handler(): assert mpiio_coll_reads == darshan_file_obj.mpi_coll_ops.read, f"{mpiio_coll_reads} != {darshan_file_obj.mpi_coll_ops.read}" assert mpiio_indep_reads == darshan_file_obj.mpi_indep_ops.read, f"{mpiio_indep_reads} != {darshan_file_obj.mpi_indep_ops.read}" assert total_mpiio_read_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"), f"{total_mpiio_read_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read")}" - assert detected_files.equals(darshan_file_obj.mpi_read_df), f"{detected_files} != {darshan_file_obj.mpi_read_df}" + if detected_files.empty: + assert detected_files.empty, f"{detected_files} != {darshan_file_obj.mpi_read_df}" + assert darshan_file_obj.mpi_read_df.empty, f"{darshan_file_obj.mpi_read_df} != {detected_files}" + else: + assert detected_files.equals(darshan_file_obj.mpi_read_df), f"{detected_files} != {darshan_file_obj.mpi_read_df}" assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" if dxt_mpiio is None: assert dxt_mpiio is None, f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" @@ -787,7 +791,6 @@ def handler(): file_map=darshan_file_obj.file_map, dxt_mpiio=darshan_file_obj.dxt_mpi_df ) - sys.exit(2) df_mpiio_collective_writes = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)] @@ -812,7 +815,31 @@ def handler(): column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] detected_files = pd.DataFrame(detected_files, columns=column_names) - module.check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio) + assert mpiio_indep_writes == darshan_file_obj.mpi_indep_ops.write, f"{mpiio_indep_writes} != {darshan_file_obj.mpi_indep_ops.write}" + assert mpiio_coll_writes == darshan_file_obj.mpi_coll_ops.write, f"{mpiio_coll_writes} != {darshan_file_obj.mpi_coll_ops.write}" + assert total_mpiio_write_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write"), f"{total_mpiio_write_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write")}" + if detected_files.empty: + assert detected_files.empty, f"{detected_files} !={darshan_file_obj.mpi_write_df}" + assert 
darshan_file_obj.mpi_write_df.empty, f"{darshan_file_obj.mpi_write_df} != {detected_files}" + else: + assert detected_files.equals(darshan_file_obj.mpi_write_df), f"{detected_files} != {darshan_file_obj.mpi_write_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + if dxt_mpiio is None: + assert dxt_mpiio is None, f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" + assert darshan_file_obj.dxt_mpi_df is None, f"{darshan_file_obj.dxt_mpi_df} != {dxt_mpiio}" + else: + assert dxt_mpiio.equals(darshan_file_obj.dxt_mpi_df), f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" + + # module.check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio) + module.check_mpi_collective_write_operation( + mpiio_coll_writes=darshan_file_obj.mpi_coll_ops.write, + mpiio_indep_writes=darshan_file_obj.mpi_indep_ops.write, + total_mpiio_write_operations=darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write"), + detected_files=darshan_file_obj.mpi_write_df, + file_map=darshan_file_obj.file_map, + dxt_mpiio=darshan_file_obj.dxt_mpi_df, + ) + sys.exit(2) ######################################################################################################################################################################### From 1dfe5e05302b251044cab6f8667a03c7d16d0360 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 30 Apr 2025 18:06:27 -0600 Subject: [PATCH 35/43] refactor: Add cached properties for MPI non-blocking I/O operations and HDF5 extension check --- drishti/handlers/darshan_util.py | 26 +++++++++++++++++++++++--- drishti/handlers/handle_darshan.py | 15 +++++++++++++-- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index aad3dff..3afa740 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -334,9 +334,9 @@ class DarshanFile: count_long_metadata: Optional[int] = None posix_shared_data_imbalance_stragglers_count: Optional[int] = None - has_hdf5_extension: Optional[bool] = None + _has_hdf5_extension: Optional[bool] = None - mpiio_nb_ops: Optional[MPIIONonBlockingStats] = None + _mpiio_nb_ops: Optional[MPIIONonBlockingStats] = None cb_nodes: Optional[int] = None number_of_compute_nodes: Optional[int] = None @@ -1009,4 +1009,24 @@ def mpi_write_df(self) -> pd.DataFrame: column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] detected_files = pd.DataFrame(detected_files, columns=column_names) - return detected_files \ No newline at end of file + return detected_files + + @cached_property + def mpiio_nb_ops(self) -> MPIIONonBlockingStats: + if self._mpiio_nb_ops is None: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + mpi_nb_reads = mpi_df['counters']['MPIIO_NB_READS'].sum() + mpi_nb_writes = mpi_df['counters']['MPIIO_NB_WRITES'].sum() + self._mpiio_nb_ops = MPIIONonBlockingStats(read=mpi_nb_reads, write=mpi_nb_writes) + return self._mpiio_nb_ops + + @cached_property + def has_hdf5_extension(self) -> bool: + if self._has_hdf5_extension is None: + self._has_hdf5_extension = False + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + for index, row in mpi_df['counters'].iterrows(): + if self.file_map[int(row['id'])].endswith('.h5') or self.file_map[int(row['id'])].endswith('.hdf5'): + self._has_hdf5_extension = True + # break + return self._has_hdf5_extension diff --git a/drishti/handlers/handle_darshan.py 
b/drishti/handlers/handle_darshan.py index 5381d4f..2ffcacd 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -839,7 +839,6 @@ def handler(): file_map=darshan_file_obj.file_map, dxt_mpiio=darshan_file_obj.dxt_mpi_df, ) - sys.exit(2) ######################################################################################################################################################################### @@ -856,7 +855,19 @@ def handler(): mpiio_nb_reads = df_mpiio['counters']['MPIIO_NB_READS'].sum() mpiio_nb_writes = df_mpiio['counters']['MPIIO_NB_WRITES'].sum() - module.check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules) + assert mpiio_nb_reads == darshan_file_obj.mpiio_nb_ops.read + assert mpiio_nb_writes == darshan_file_obj.mpiio_nb_ops.write + assert modules.keys() == darshan_file_obj.modules, f"{modules.keys()} != {darshan_file_obj.modules}" + assert has_hdf5_extension == darshan_file_obj.has_hdf5_extension, f"{has_hdf5_extension} != {darshan_file_obj.has_hdf5_extension}" + + # module.check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules) + module.check_mpi_none_block_operation( + mpiio_nb_reads=darshan_file_obj.mpiio_nb_ops.read, + mpiio_nb_writes=darshan_file_obj.mpiio_nb_ops.write, + has_hdf5_extension=darshan_file_obj.has_hdf5_extension, + modules=darshan_file_obj.modules, + ) + sys.exit(2) ######################################################################################################################################################################### From 2d625287b5c0e082cdd9fa90016a4ed9f700f665 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 5 May 2025 16:51:27 -0600 Subject: [PATCH 36/43] chore: Add Python version 3.8 to the project --- .python-version | 1 + 1 file changed, 1 insertion(+) create mode 100644 .python-version diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..cc1923a --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.8 From 0ec5e9f4cbb1797b7224716b903612adf8edd653 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 5 May 2025 16:53:13 -0600 Subject: [PATCH 37/43] hotfix: Fix f-string formatting in assertions for MPI I/O operations Required for Python 3.8 compatibility --- drishti/handlers/handle_darshan.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 2ffcacd..12cdf48 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -500,7 +500,7 @@ def handler(): assert read_consecutive == darshan_file_obj.posix_read_consecutive assert read_sequential == darshan_file_obj.posix_read_sequential assert read_random == darshan_file_obj.posix_read_random, f"{read_random} != {darshan_file_obj.posix_read_random}" - assert total_reads == darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"read"), f"{total_reads} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read")}" + assert total_reads == darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"read"), f"{total_reads} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, 'read')}" assert write_consecutive == darshan_file_obj.posix_write_consecutive assert write_sequential == darshan_file_obj.posix_write_sequential assert write_random == darshan_file_obj.posix_write_random @@ -769,7 +769,7 @@ def handler(): assert mpiio_coll_reads == darshan_file_obj.mpi_coll_ops.read, f"{mpiio_coll_reads} != 
{darshan_file_obj.mpi_coll_ops.read}" assert mpiio_indep_reads == darshan_file_obj.mpi_indep_ops.read, f"{mpiio_indep_reads} != {darshan_file_obj.mpi_indep_ops.read}" - assert total_mpiio_read_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"), f"{total_mpiio_read_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read")}" + assert total_mpiio_read_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"), f"{total_mpiio_read_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, 'read')}" if detected_files.empty: assert detected_files.empty, f"{detected_files} != {darshan_file_obj.mpi_read_df}" assert darshan_file_obj.mpi_read_df.empty, f"{darshan_file_obj.mpi_read_df} != {detected_files}" @@ -817,7 +817,7 @@ def handler(): assert mpiio_indep_writes == darshan_file_obj.mpi_indep_ops.write, f"{mpiio_indep_writes} != {darshan_file_obj.mpi_indep_ops.write}" assert mpiio_coll_writes == darshan_file_obj.mpi_coll_ops.write, f"{mpiio_coll_writes} != {darshan_file_obj.mpi_coll_ops.write}" - assert total_mpiio_write_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write"), f"{total_mpiio_write_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write")}" + assert total_mpiio_write_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write"), f"{total_mpiio_write_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, 'write')}" if detected_files.empty: assert detected_files.empty, f"{detected_files} !={darshan_file_obj.mpi_write_df}" assert darshan_file_obj.mpi_write_df.empty, f"{darshan_file_obj.mpi_write_df} != {detected_files}" From ab905650245782cfff0f4eedb0cd91653c31fa20 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 5 May 2025 16:54:16 -0600 Subject: [PATCH 38/43] refactor: Add cached properties for compute nodes and enhance assertions in Darshan handling --- drishti/handlers/darshan_util.py | 57 ++++++++++++++++++++++++++++-- drishti/handlers/handle_darshan.py | 9 +++-- 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 3afa740..5b90891 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -1,9 +1,14 @@ +import csv import datetime +import io +import subprocess +import sys import typing from dataclasses import dataclass, field from enum import Enum from functools import cached_property from os import write +from shlex import shlex from typing import Dict, Final, Optional, Union, List, Tuple, Iterable import numpy as np @@ -338,8 +343,8 @@ class DarshanFile: _mpiio_nb_ops: Optional[MPIIONonBlockingStats] = None - cb_nodes: Optional[int] = None - number_of_compute_nodes: Optional[int] = None + _cb_nodes: Optional[int] = None + _number_of_compute_nodes: Optional[int] = None hints: Optional[List[str]] = None timestamp: Optional[TimeSpan] = None @@ -1030,3 +1035,51 @@ def has_hdf5_extension(self) -> bool: self._has_hdf5_extension = True # break return self._has_hdf5_extension + + @cached_property + def cb_nodes(self) -> int: + if self._cb_nodes is None: + assert ModuleType.MPIIO in self.modules, "Missing MPIIO module" + hints = "" + if 'h' in self.report.metadata['job']['metadata']: + hints = self.report.metadata['job']['metadata']['h'] + if hints: + hints = hints.split(';') + + cb_nodes = None + + for hint in hints: + if hint != 'no': + (key, value) = hint.split('=') + + if key == 
'cb_nodes': + cb_nodes = value + return self._cb_nodes + + @cached_property + def number_of_compute_nodes(self) -> int: + if self._number_of_compute_nodes is None: + assert ModuleType.MPIIO in self.modules, "Missing MPIIO module" + command = 'sacct --job {} --format=JobID,JobIDRaw,NNodes,NCPUs --parsable2 --delimiter ","'.format( + self.report.metadata['job']['jobid'] + ) + arguments = shlex.split(command) + + try: + result = subprocess.run(arguments, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if result.returncode == 0: + # We have successfully fetched the information from SLURM + db = csv.DictReader(io.StringIO(result.stdout.decode('utf-8'))) + + try: + first = next(db) + + if 'NNodes' in first: + self._number_of_compute_nodes = first['NNodes'] + + except StopIteration: + pass + except FileNotFoundError: + pass + return self._number_of_compute_nodes diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 12cdf48..b5d166f 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -867,7 +867,6 @@ def handler(): has_hdf5_extension=darshan_file_obj.has_hdf5_extension, modules=darshan_file_obj.modules, ) - sys.exit(2) ######################################################################################################################################################################### @@ -915,8 +914,14 @@ def handler(): if 'NNodes' in first: NUMBER_OF_COMPUTE_NODES = first['NNodes'] + assert cb_nodes == darshan_file_obj.cb_nodes, f"{cb_nodes} != {darshan_file_obj.cb_nodes}" + assert NUMBER_OF_COMPUTE_NODES == darshan_file_obj.number_of_compute_nodes, f"{NUMBER_OF_COMPUTE_NODES} != {darshan_file_obj.number_of_compute_nodes}" # Do we have one MPI-IO aggregator per node? - module.check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES) + # module.check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES) + module.check_mpi_aggregator( + cb_nodes=darshan_file_obj.cb_nodes, + NUMBER_OF_COMPUTE_NODES=darshan_file_obj.number_of_compute_nodes + ) except StopIteration: pass except FileNotFoundError: From a92723ac5adb920873b44f7eaa7ad07e69324ab9 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Mon, 5 May 2025 16:54:55 -0600 Subject: [PATCH 39/43] hotfix: Enhance lustre_df method to assert single data frame and return components --- drishti/handlers/darshan_util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index 5b90891..f3ec671 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -645,7 +645,10 @@ def file_not_aligned(self) -> int: def lustre_df(self) -> Optional[pd.DataFrame]: if "LUSTRE" not in self.modules: return None - return pd.DataFrame(self.report.records["LUSTRE"].to_df()) + lustre_dict = self.report.records["LUSTRE"].to_df() + assert len(lustre_dict) == 1, f"Expected 1 data frame for LUSTRE, got {len(self.report.records['LUSTRE'].to_df())}" + lustre_df = lustre_dict["components"] + return lustre_df @cached_property def max_read_offset(self) -> int: From d611255d48b7120d5e2a670a989aee0f1ceb6321 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Thu, 8 May 2025 10:36:11 -0600 Subject: [PATCH 40/43] hotfix: Handle KeyError in lustre_df method for backward compatibility with older PyDarshan versions --- drishti/handlers/darshan_util.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index f3ec671..e5b6ca4 100755 
--- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -647,7 +647,11 @@ def lustre_df(self) -> Optional[pd.DataFrame]: return None lustre_dict = self.report.records["LUSTRE"].to_df() assert len(lustre_dict) == 1, f"Expected 1 data frame for LUSTRE, got {len(self.report.records['LUSTRE'].to_df())}" - lustre_df = lustre_dict["components"] + try: + lustre_df = lustre_dict["components"] + except KeyError: + # Using an older PyDarshan version + lustre_df = lustre_dict["counters"] return lustre_df @cached_property From 3142d26a68df5990fa816fb4d8b8b42199f3d3b1 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Thu, 8 May 2025 10:43:16 -0600 Subject: [PATCH 41/43] fix: preserve uint64 ID type in HDF5 extension check Avoids using `iterrows()` which implicitly casts Darshan record IDs to float64, potentially breaking file_map lookups for large IDs. Replaced with `itertuples()` to maintain original dtype and re-enabled `break` for early exit. --- drishti/handlers/darshan_util.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py index e5b6ca4..43d9cd3 100755 --- a/drishti/handlers/darshan_util.py +++ b/drishti/handlers/darshan_util.py @@ -1037,10 +1037,12 @@ def has_hdf5_extension(self) -> bool: if self._has_hdf5_extension is None: self._has_hdf5_extension = False mpi_df = self.report.records[ModuleType.MPIIO].to_df() - for index, row in mpi_df['counters'].iterrows(): - if self.file_map[int(row['id'])].endswith('.h5') or self.file_map[int(row['id'])].endswith('.hdf5'): + # for index, row in mpi_df['counters'].iterrows(): # Implicitly converts all data to np.float64. Problematic for id (np.uint64) + for row in mpi_df['counters'].itertuples(index=False): + # if self.file_map[int(row['id'])].endswith('.h5') or self.file_map[int(row['id'])].endswith('.hdf5'): + if self.file_map[row.id].endswith('.h5') or self.file_map[row.id].endswith('.hdf5'): self._has_hdf5_extension = True - # break + break return self._has_hdf5_extension @cached_property From 4afa525f45bf1720755c084f1c363e343d168b30 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Thu, 8 May 2025 10:47:33 -0600 Subject: [PATCH 42/43] chore(ide): add PyCharm run configurations for reporter.py with sample logs Adds Sample_1 and Sample_2 run configs to easily test `reporter.py` on benchmark Darshan logs using the drishti-io module environment. 
--- .idea/runConfigurations/Sample_1.xml | 26 ++++++++++++++++++++++++++ .idea/runConfigurations/Sample_2.xml | 26 ++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 .idea/runConfigurations/Sample_1.xml create mode 100644 .idea/runConfigurations/Sample_2.xml diff --git a/.idea/runConfigurations/Sample_1.xml b/.idea/runConfigurations/Sample_1.xml new file mode 100644 index 0000000..0bc3377 --- /dev/null +++ b/.idea/runConfigurations/Sample_1.xml @@ -0,0 +1,26 @@ + + + + + \ No newline at end of file diff --git a/.idea/runConfigurations/Sample_2.xml b/.idea/runConfigurations/Sample_2.xml new file mode 100644 index 0000000..3c03139 --- /dev/null +++ b/.idea/runConfigurations/Sample_2.xml @@ -0,0 +1,26 @@ + + + + + \ No newline at end of file From fb97e28b8f3305c043a285c2890f6bbc9f88b554 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Thu, 15 May 2025 15:25:14 -0600 Subject: [PATCH 43/43] hotfix: update total size calculation in check_size_intensive for POSIX module --- drishti/handlers/handle_darshan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index b5d166f..86dcf6f 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -375,7 +375,7 @@ def handler(): # module.check_size_intensive(total_size, total_read_size, total_written_size) module.check_size_intensive( - total_size=darshan_file_obj.io_stats.total_bytes, + total_size=darshan_file_obj.io_stats.posix_size, total_read_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "read"), total_written_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "write"), )
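The refactoring pattern that patches 33 through 38 apply is the same each time: the legacy handler keeps computing a value from the raw counters DataFrame, an assertion verifies that the new cached property reproduces it, and the module check is then invoked through keyword arguments backed by the property. Below is a minimal, self-contained sketch of that parity-check migration; the names FakeDarshanFile, CollectiveIOStats, and run_check are illustrative stand-ins, not the actual Drishti or PyDarshan API.

# Minimal sketch of the parity-check migration pattern (assumed names, not the real API).
from dataclasses import dataclass
from functools import cached_property

import pandas as pd


@dataclass(frozen=True)
class CollectiveIOStats:
    read: int
    write: int


class FakeDarshanFile:
    """Stand-in for the DarshanFile wrapper; holds a counters DataFrame."""

    def __init__(self, counters: pd.DataFrame):
        self._counters = counters

    @cached_property
    def coll_ops(self) -> CollectiveIOStats:
        # Computed once on first access, then memoized by cached_property.
        return CollectiveIOStats(
            read=int(self._counters["MPIIO_COLL_READS"].sum()),
            write=int(self._counters["MPIIO_COLL_WRITES"].sum()),
        )


def run_check(coll_reads: int, coll_writes: int) -> None:
    print(f"collective reads={coll_reads}, writes={coll_writes}")


counters = pd.DataFrame({"MPIIO_COLL_READS": [4, 0], "MPIIO_COLL_WRITES": [2, 2]})
darshan_file = FakeDarshanFile(counters)

# Legacy path: compute directly from the DataFrame, as the old handler did.
legacy_coll_reads = counters["MPIIO_COLL_READS"].sum()
legacy_coll_writes = counters["MPIIO_COLL_WRITES"].sum()

# Parity check before routing the call through the new property.
assert legacy_coll_reads == darshan_file.coll_ops.read
assert legacy_coll_writes == darshan_file.coll_ops.write

run_check(coll_reads=darshan_file.coll_ops.read, coll_writes=darshan_file.coll_ops.write)

The sys.exit(2) call that keeps moving downward in handle_darshan.py appears to serve as a moving checkpoint: everything above the exit has already been migrated and verified against the new properties, everything below still runs only the legacy path.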
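Patch 40's backward-compatibility fallback for the LUSTRE records is a plain try/except over the dictionary returned by to_df(): newer PyDarshan releases expose the data under a "components" key, older ones under "counters". A small sketch under that assumption, with a hand-built dictionary standing in for report.records["LUSTRE"].to_df():

import pandas as pd

def pick_lustre_frame(lustre_dict: dict) -> pd.DataFrame:
    # Prefer the newer PyDarshan layout, fall back to the older one.
    try:
        return lustre_dict["components"]
    except KeyError:
        return lustre_dict["counters"]

old_style = {"counters": pd.DataFrame({"id": [1], "LUSTRE_STRIPE_WIDTH": [4]})}
new_style = {"components": pd.DataFrame({"id": [1], "LUSTRE_STRIPE_WIDTH": [4]})}
print(pick_lustre_frame(old_style).equals(pick_lustre_frame(new_style)))  # True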
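The dtype hazard that patch 41 removes can be reproduced with a synthetic DataFrame: iterrows() upcasts each row to a single dtype, so a uint64 Darshan record ID above 2^53 loses precision as soon as a float column sits next to it, while itertuples() keeps each column's dtype. The column names and values below are made up and only mimic the MPIIO counters layout.

import numpy as np
import pandas as pd

record_id = np.uint64(2**63 + 105)  # too large for float64 to represent exactly
df = pd.DataFrame(
    {"id": np.array([record_id], dtype=np.uint64), "MPIIO_BYTES_READ": [1024.0]}
)
file_map = {int(record_id): "output.h5"}

for _, row in df.iterrows():
    # The row Series is promoted to float64, so the id is rounded.
    print(int(row["id"]) in file_map)  # False: the lookup key no longer matches

for row in df.itertuples(index=False):
    # Per-column dtypes survive, so the original uint64 id is intact.
    print(int(row.id) in file_map)  # True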