From 14706c235e5cc85dbf11219b29991064dd60dbc4 Mon Sep 17 00:00:00 2001 From: mohdsaid497566 <145478595+mohdsaid497566@users.noreply.github.com> Date: Wed, 11 Jun 2025 11:03:39 -0400 Subject: [PATCH 01/15] added flag --rdma-mpi --- toolchain/mfc/args.py | 17 +++++++++-------- toolchain/mfc/test/case.py | 6 ++++-- toolchain/mfc/test/test.py | 13 ++++++++++++- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py index e476e0a209..6f2c885bc3 100644 --- a/toolchain/mfc/args.py +++ b/toolchain/mfc/args.py @@ -76,14 +76,15 @@ def add_common_arguments(p, mask = None): test.add_argument("-l", "--list", action="store_true", help="List all available tests.") test.add_argument("-f", "--from", default=test_cases[0].get_uuid(), type=str, help="First test UUID to run.") test.add_argument("-t", "--to", default=test_cases[-1].get_uuid(), type=str, help="Last test UUID to run.") - test.add_argument("-o", "--only", nargs="+", type=str, default=[], metavar="L", help="Only run tests with specified properties.") - test.add_argument("-a", "--test-all", action="store_true", default=False, help="Run the Post Process Tests too.") - test.add_argument("-%", "--percent", type=int, default=100, help="Percentage of tests to run.") - test.add_argument("-m", "--max-attempts", type=int, default=1, help="Maximum number of attempts to run a test.") - test.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.") - test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases." ) - test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.") - test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.") + test.add_argument("-o", "--only", nargs="+", type=str, default=[], metavar="L", help="Only run tests with specified properties.") + test.add_argument("-a", "--test-all", action="store_true", default=False, help="Run the Post Process Tests too.") + test.add_argument("-%", "--percent", type=int, default=100, help="Percentage of tests to run.") + test.add_argument("-m", "--max-attempts", type=int, default=1, help="Maximum number of attempts to run a test.") + test.add_argument("-r", "--rdma-mpi", action="store_true", default=False, help="Enable RDMA MPI for tests") + test.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.") + test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases." ) + test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.") + test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.") test_meg = test.add_mutually_exclusive_group() test_meg.add_argument("--generate", action="store_true", default=False, help="(Test Generation) Generate golden files.") diff --git a/toolchain/mfc/test/case.py b/toolchain/mfc/test/case.py index 97d4fd121a..61d70772ab 100644 --- a/toolchain/mfc/test/case.py +++ b/toolchain/mfc/test/case.py @@ -133,7 +133,9 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces filepath = f'{self.get_dirpath()}/case.py' tasks = ["-n", str(self.ppn)] jobs = ["-j", str(ARG("jobs"))] if ARG("case_optimization") else [] - case_optimization = ["--case-optimization"] if ARG("case_optimization") else [] + case_optimization = ["--case-optimization"] if ARG("case_optimization") else [] + rdma_mpi_args = ["--rdma-mpi"] if ARG("rdma_mpi") else [] + if self.params.get("bubbles_lagrange", 'F') == 'T': input_bubbles_lagrange(self) @@ -144,7 +146,7 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces command = [ mfc_script, "run", filepath, "--no-build", *tasks, *case_optimization, - *jobs, "-t", *target_names, *gpus_select, *ARG("--") + *jobs, "-t", *target_names, *gpus_select, *rdma_mpi_args, *ARG("--") ] return common.system(command, print_cmd=False, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 65cd2803d2..123af36f1d 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -64,6 +64,10 @@ def __filter(cases_) -> typing.List[TestCase]: if any(label in case.trace for label in skip): cases.remove(case) + for case in cases[:]: + if ARG("rdma_mpi") and case.ppn <= 1: + cases.remove(case) + skipped_cases.append(case) if ARG("no_examples"): cases = [case for case in cases if not "Example" in case.trace] @@ -180,7 +184,10 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}") return - cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) + if ARG("rdma_mpi"): + cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=True) + else: + cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt") common.file_write(out_filepath, cmd.stdout) @@ -224,6 +231,10 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if ARG("test_all"): case.delete_output() cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) + if ARG("rdma_mpi"): + cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices, rdma_mpi=True) + else: + cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) out_filepath = os.path.join(case.get_dirpath(), "out_post.txt") common.file_write(out_filepath, cmd.stdout) From 0dbef83595d4bef6f15fdd1840029fe084c8f75c Mon Sep 17 00:00:00 2001 From: mohdsaid497566 <145478595+mohdsaid497566@users.noreply.github.com> Date: Wed, 11 Jun 2025 15:23:39 -0400 Subject: [PATCH 02/15] modifications on the flag use --- toolchain/mfc/test/case.py | 4 +--- toolchain/mfc/test/cases.py | 4 +++- toolchain/mfc/test/test.py | 12 +++++++----- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/toolchain/mfc/test/case.py b/toolchain/mfc/test/case.py index 61d70772ab..12bbc8fb07 100644 --- a/toolchain/mfc/test/case.py +++ b/toolchain/mfc/test/case.py @@ -134,8 +134,6 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces tasks = ["-n", str(self.ppn)] jobs = ["-j", str(ARG("jobs"))] if ARG("case_optimization") else [] case_optimization = ["--case-optimization"] if ARG("case_optimization") else [] - rdma_mpi_args = ["--rdma-mpi"] if ARG("rdma_mpi") else [] - if self.params.get("bubbles_lagrange", 'F') == 'T': input_bubbles_lagrange(self) @@ -146,7 +144,7 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces command = [ mfc_script, "run", filepath, "--no-build", *tasks, *case_optimization, - *jobs, "-t", *target_names, *gpus_select, *rdma_mpi_args, *ARG("--") + *jobs, "-t", *target_names, *gpus_select, *ARG("--") ] return common.system(command, print_cmd=False, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) diff --git a/toolchain/mfc/test/cases.py b/toolchain/mfc/test/cases.py index 77659d7da6..11a9830ff3 100644 --- a/toolchain/mfc/test/cases.py +++ b/toolchain/mfc/test/cases.py @@ -3,6 +3,7 @@ from mfc import common from .case import Nt, define_case_d, define_case_f, CaseGeneratorStack, TestCaseBuilder +from ..state import ARG def get_bc_mods(bc: int, dimInfo): params = {} @@ -320,9 +321,10 @@ def alter_3d(): def alter_ppn(dimInfo): if len(dimInfo[0]) == 3: cases.append(define_case_d(stack, '2 MPI Ranks', {'m': 29, 'n': 29, 'p': 49}, ppn=2)) + cases.append(define_case_d(stack, '2 MPI Ranks -> RDMA MPI', {'m': 29, 'n': 29, 'p': 49, 'rdma_mpi': 'T'}, ppn=2)) else: cases.append(define_case_d(stack, '2 MPI Ranks', {}, ppn=2)) - + cases.append(define_case_d(stack, '2 MPI Ranks -> RDMA MPI', {'rdma_mpi': 'T'}, ppn=2)) def alter_ib(dimInfo, six_eqn_model=False): for slip in [True, False]: diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 123af36f1d..734083e2ad 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -56,7 +56,12 @@ def __filter(cases_) -> typing.List[TestCase]: if case.ppn > 1 and not ARG("mpi"): cases.remove(case) skipped_cases.append(case) - + + for case in cases[:]: + if "RDMA MPI" in case.trace and not ARG("rdma_mpi"): + cases.remove(case) + skipped_cases.append(case) + for case in cases[:]: if ARG("single"): skip = ['low_Mach', 'Hypoelasticity', 'teno', 'Chemistry', 'Phase Change model 6' @@ -184,10 +189,7 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}") return - if ARG("rdma_mpi"): - cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=True) - else: - cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) + cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt") common.file_write(out_filepath, cmd.stdout) From 3c492d421dfab07e4be796ad922f5116410a0abf Mon Sep 17 00:00:00 2001 From: malmahrouqi3 Date: Wed, 11 Jun 2025 16:22:41 -0400 Subject: [PATCH 03/15] flag use changes --- toolchain/mfc/test/case.py | 5 +++-- toolchain/mfc/test/cases.py | 1 - toolchain/mfc/test/test.py | 7 ++++++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/toolchain/mfc/test/case.py b/toolchain/mfc/test/case.py index 12bbc8fb07..f8120f37bc 100644 --- a/toolchain/mfc/test/case.py +++ b/toolchain/mfc/test/case.py @@ -134,7 +134,8 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces tasks = ["-n", str(self.ppn)] jobs = ["-j", str(ARG("jobs"))] if ARG("case_optimization") else [] case_optimization = ["--case-optimization"] if ARG("case_optimization") else [] - + rdma_mpi_args = ["--rdma-mpi"] if ARG("rdma_mpi") else [] + if self.params.get("bubbles_lagrange", 'F') == 'T': input_bubbles_lagrange(self) @@ -144,7 +145,7 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces command = [ mfc_script, "run", filepath, "--no-build", *tasks, *case_optimization, - *jobs, "-t", *target_names, *gpus_select, *ARG("--") + *jobs, "-t", *target_names, *gpus_select, *rdma_mpi_args, *ARG("--") ] return common.system(command, print_cmd=False, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) diff --git a/toolchain/mfc/test/cases.py b/toolchain/mfc/test/cases.py index 11a9830ff3..26e082567a 100644 --- a/toolchain/mfc/test/cases.py +++ b/toolchain/mfc/test/cases.py @@ -3,7 +3,6 @@ from mfc import common from .case import Nt, define_case_d, define_case_f, CaseGeneratorStack, TestCaseBuilder -from ..state import ARG def get_bc_mods(bc: int, dimInfo): params = {} diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 734083e2ad..331ae63a25 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -188,8 +188,13 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if ARG("dry_run"): cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}") return + + if ARG("rdma_mpi"): + cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) + cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=True) + else: + cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) - cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt") common.file_write(out_filepath, cmd.stdout) From 8e27e4181245f53ea1f14c7dba6b25b8d9063037 Mon Sep 17 00:00:00 2001 From: malmahrouqi3 Date: Wed, 11 Jun 2025 16:27:59 -0400 Subject: [PATCH 04/15] lint --- toolchain/mfc/test/case.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toolchain/mfc/test/case.py b/toolchain/mfc/test/case.py index f8120f37bc..2a9171586d 100644 --- a/toolchain/mfc/test/case.py +++ b/toolchain/mfc/test/case.py @@ -135,7 +135,7 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces jobs = ["-j", str(ARG("jobs"))] if ARG("case_optimization") else [] case_optimization = ["--case-optimization"] if ARG("case_optimization") else [] rdma_mpi_args = ["--rdma-mpi"] if ARG("rdma_mpi") else [] - + if self.params.get("bubbles_lagrange", 'F') == 'T': input_bubbles_lagrange(self) From 7fe5f32c2e35e672b758cffec0b50e8506822374 Mon Sep 17 00:00:00 2001 From: mohdsaid497566 <145478595+mohdsaid497566@users.noreply.github.com> Date: Wed, 11 Jun 2025 16:42:11 -0400 Subject: [PATCH 05/15] updated testing docs --- docs/documentation/testing.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/documentation/testing.md b/docs/documentation/testing.md index 15b1ca3618..ea294ffbca 100644 --- a/docs/documentation/testing.md +++ b/docs/documentation/testing.md @@ -16,6 +16,7 @@ A test is considered passing when our error tolerances are met in order to maint - `--percent` (`%`) to specify a percentage of the test suite to select at random and test - `--max-attempts` (`-m`) the maximum number of attempts to make on a test before considering it failed - `--no-examples` skips the testing of cases in the examples folder +- `--rdma-mpi` (`-r`) runs additional tests where RDMA MPI is enabled. To specify a computer, pass the `-c` flag to `./mfc.sh run` like so: ```shell From f788dfede915b0c3771c7fee95bb5495fea107eb Mon Sep 17 00:00:00 2001 From: mohdsaid497566 <145478595+mohdsaid497566@users.noreply.github.com> Date: Wed, 11 Jun 2025 16:46:50 -0400 Subject: [PATCH 06/15] RDMAP MPI added to Frontier CI --- .github/workflows/frontier/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh index 539166e055..4064c33c22 100644 --- a/.github/workflows/frontier/test.sh +++ b/.github/workflows/frontier/test.sh @@ -4,7 +4,7 @@ gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c` if [ "$job_device" == "gpu" ]; then - ./mfc.sh test --max-attempts 3 -j $ngpus -- -c frontier + ./mfc.sh test -a --rdma-mpi --max-attempts 3 -j $ngpus -- -c frontier else - ./mfc.sh test --max-attempts 3 -j 32 -- -c frontier + ./mfc.sh test -a --rdma-mpi --max-attempts 3 -j 32 -- -c frontier fi From 6aac583f077e027ca31fb1bd8086d3b77341d068 Mon Sep 17 00:00:00 2001 From: Mohammed Said Hamed Humaid Al-Mahrouqi Date: Sun, 15 Jun 2025 14:27:16 -0400 Subject: [PATCH 07/15] fixed stuff --- docs/documentation/testing.md | 2 +- toolchain/mfc/args.py | 2 +- toolchain/mfc/test/case.py | 7 +++---- toolchain/mfc/test/test.py | 8 +------- 4 files changed, 6 insertions(+), 13 deletions(-) diff --git a/docs/documentation/testing.md b/docs/documentation/testing.md index ea294ffbca..e139c5b201 100644 --- a/docs/documentation/testing.md +++ b/docs/documentation/testing.md @@ -16,7 +16,7 @@ A test is considered passing when our error tolerances are met in order to maint - `--percent` (`%`) to specify a percentage of the test suite to select at random and test - `--max-attempts` (`-m`) the maximum number of attempts to make on a test before considering it failed - `--no-examples` skips the testing of cases in the examples folder -- `--rdma-mpi` (`-r`) runs additional tests where RDMA MPI is enabled. +- `--rdma-mpi` runs additional tests where RDMA MPI is enabled. To specify a computer, pass the `-c` flag to `./mfc.sh run` like so: ```shell diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py index 6f2c885bc3..11f766894e 100644 --- a/toolchain/mfc/args.py +++ b/toolchain/mfc/args.py @@ -80,7 +80,7 @@ def add_common_arguments(p, mask = None): test.add_argument("-a", "--test-all", action="store_true", default=False, help="Run the Post Process Tests too.") test.add_argument("-%", "--percent", type=int, default=100, help="Percentage of tests to run.") test.add_argument("-m", "--max-attempts", type=int, default=1, help="Maximum number of attempts to run a test.") - test.add_argument("-r", "--rdma-mpi", action="store_true", default=False, help="Enable RDMA MPI for tests") + test.add_argument( "--rdma-mpi", action="store_true", default=False, help="Run tests with RDMA MPI enabled") test.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.") test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases." ) test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.") diff --git a/toolchain/mfc/test/case.py b/toolchain/mfc/test/case.py index 2a9171586d..51705e2492 100644 --- a/toolchain/mfc/test/case.py +++ b/toolchain/mfc/test/case.py @@ -124,7 +124,7 @@ def __init__(self, trace: str, mods: dict, ppn: int = None, override_tol: float self.override_tol = override_tol super().__init__({**BASE_CFG.copy(), **mods}) - def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subprocess.CompletedProcess: + def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int], rdma_mpi: bool = False) -> subprocess.CompletedProcess: if gpus is not None and len(gpus) != 0: gpus_select = ["--gpus"] + [str(_) for _ in gpus] else: @@ -134,7 +134,6 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces tasks = ["-n", str(self.ppn)] jobs = ["-j", str(ARG("jobs"))] if ARG("case_optimization") else [] case_optimization = ["--case-optimization"] if ARG("case_optimization") else [] - rdma_mpi_args = ["--rdma-mpi"] if ARG("rdma_mpi") else [] if self.params.get("bubbles_lagrange", 'F') == 'T': input_bubbles_lagrange(self) @@ -145,9 +144,9 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces command = [ mfc_script, "run", filepath, "--no-build", *tasks, *case_optimization, - *jobs, "-t", *target_names, *gpus_select, *rdma_mpi_args, *ARG("--") + *jobs, "-t", *target_names, *gpus_select, *ARG("--") ] - + return common.system(command, print_cmd=False, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) def get_trace(self) -> str: diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 331ae63a25..288c5a2b39 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -69,11 +69,6 @@ def __filter(cases_) -> typing.List[TestCase]: if any(label in case.trace for label in skip): cases.remove(case) - for case in cases[:]: - if ARG("rdma_mpi") and case.ppn <= 1: - cases.remove(case) - skipped_cases.append(case) - if ARG("no_examples"): cases = [case for case in cases if not "Example" in case.trace] @@ -189,8 +184,7 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}") return - if ARG("rdma_mpi"): - cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) + if "RDMA MPI" in case.trace: cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=True) else: cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) From 902b1ea34f0afcf906a7fcb29597069e02b3fd63 Mon Sep 17 00:00:00 2001 From: mohdsaid497566 Date: Sun, 15 Jun 2025 16:32:16 -0400 Subject: [PATCH 08/15] re-run with lint/format --- toolchain/mfc/test/case.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/toolchain/mfc/test/case.py b/toolchain/mfc/test/case.py index 51705e2492..12bbc8fb07 100644 --- a/toolchain/mfc/test/case.py +++ b/toolchain/mfc/test/case.py @@ -124,7 +124,7 @@ def __init__(self, trace: str, mods: dict, ppn: int = None, override_tol: float self.override_tol = override_tol super().__init__({**BASE_CFG.copy(), **mods}) - def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int], rdma_mpi: bool = False) -> subprocess.CompletedProcess: + def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subprocess.CompletedProcess: if gpus is not None and len(gpus) != 0: gpus_select = ["--gpus"] + [str(_) for _ in gpus] else: @@ -146,7 +146,7 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int], rdma_mpi: bo mfc_script, "run", filepath, "--no-build", *tasks, *case_optimization, *jobs, "-t", *target_names, *gpus_select, *ARG("--") ] - + return common.system(command, print_cmd=False, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) def get_trace(self) -> str: From c9b159913e6e687183df49423e5a15693aee32f4 Mon Sep 17 00:00:00 2001 From: mohdsaid497566 <145478595+mohdsaid497566@users.noreply.github.com> Date: Sun, 15 Jun 2025 20:47:21 -0400 Subject: [PATCH 09/15] removed duplicate assignment of CMD --- toolchain/mfc/test/test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 288c5a2b39..b788fec69b 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -183,7 +183,7 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if ARG("dry_run"): cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}") return - + if "RDMA MPI" in case.trace: cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=True) else: @@ -231,7 +231,6 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if ARG("test_all"): case.delete_output() - cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) if ARG("rdma_mpi"): cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices, rdma_mpi=True) else: From 783991674c7d93f6d798900ecd3bd4706fb003fe Mon Sep 17 00:00:00 2001 From: malmahrouqi3 Date: Mon, 16 Jun 2025 10:13:45 -0400 Subject: [PATCH 10/15] redundant code --- toolchain/mfc/test/test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index b788fec69b..28c4445462 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -231,10 +231,10 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if ARG("test_all"): case.delete_output() - if ARG("rdma_mpi"): - cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices, rdma_mpi=True) - else: - cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) + # if ARG("rdma_mpi"): + # cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices, rdma_mpi=True) + # else: + cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) out_filepath = os.path.join(case.get_dirpath(), "out_post.txt") common.file_write(out_filepath, cmd.stdout) From 76b1060cbca729ed427ea873a187a17be6f175d2 Mon Sep 17 00:00:00 2001 From: malmahrouqi3 Date: Sat, 28 Jun 2025 03:57:05 -0400 Subject: [PATCH 11/15] added rdma flag to frontier dry run --- .github/workflows/frontier/build.sh | 2 +- toolchain/mfc/test/test.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 4aa0ffe64e..1f442a70b3 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -6,4 +6,4 @@ if [ "$1" == "gpu" ]; then fi . ./mfc.sh load -c f -m g -./mfc.sh test --dry-run -j 8 $build_opts +./mfc.sh test --dry-run --rdma-mpi -j 8 $build_opts diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 28c4445462..5d59db1b06 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -231,9 +231,6 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if ARG("test_all"): case.delete_output() - # if ARG("rdma_mpi"): - # cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices, rdma_mpi=True) - # else: cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) out_filepath = os.path.join(case.get_dirpath(), "out_post.txt") common.file_write(out_filepath, cmd.stdout) From ed07060e704e36b77016bcd6750472652653c586 Mon Sep 17 00:00:00 2001 From: Malmahrouqi3 Date: Sun, 29 Jun 2025 00:55:28 -0400 Subject: [PATCH 12/15] dissociate forcefully rdma_mpi & test_all --- .github/workflows/frontier/test.sh | 2 +- toolchain/mfc/test/test.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh index 4064c33c22..0f86f6df32 100644 --- a/.github/workflows/frontier/test.sh +++ b/.github/workflows/frontier/test.sh @@ -6,5 +6,5 @@ ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c` if [ "$job_device" == "gpu" ]; then ./mfc.sh test -a --rdma-mpi --max-attempts 3 -j $ngpus -- -c frontier else - ./mfc.sh test -a --rdma-mpi --max-attempts 3 -j 32 -- -c frontier + ./mfc.sh test -a --max-attempts 3 -j 32 -- -c frontier fi diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 5d59db1b06..0a7067de2c 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -5,7 +5,7 @@ from ..printer import cons from .. import common -from ..state import ARG +from ..state import ARG, ARGS from .case import TestCase from .cases import list_cases from .. import sched @@ -88,7 +88,7 @@ def test(): global errors cases = list_cases() - + # Delete UUIDs that are not in the list of cases from tests/ if ARG("remove_old_tests"): dir_uuids = set(os.listdir(common.MFC_TEST_DIR)) @@ -119,6 +119,9 @@ def test(): # Some cases require a specific build of MFC for features like Chemistry, # Analytically defined patches, and --case-optimization. Here, we build all # the unique versions of MFC we need to run cases. + if ARG("rdma_mpi") == True: + ARGS()["test_all"] = False + codes = [PRE_PROCESS, SIMULATION] + ([POST_PROCESS] if ARG('test_all') else []) unique_builds = set() for case, code in itertools.product(cases, codes): From de73a43d339b025c19e468eab222bbecfc89649e Mon Sep 17 00:00:00 2001 From: Malmahrouqi3 Date: Sun, 29 Jun 2025 00:58:30 -0400 Subject: [PATCH 13/15] dissociate forcefully rdma_mpi from test_all --- .github/workflows/frontier/test.sh | 4 ++-- toolchain/mfc/run/run.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh index 0f86f6df32..9962090a3c 100644 --- a/.github/workflows/frontier/test.sh +++ b/.github/workflows/frontier/test.sh @@ -4,7 +4,7 @@ gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c` if [ "$job_device" == "gpu" ]; then - ./mfc.sh test -a --rdma-mpi --max-attempts 3 -j $ngpus -- -c frontier + ./mfc.sh test --rdma-mpi --max-attempts 3 -j $ngpus -- -c frontier else - ./mfc.sh test -a --max-attempts 3 -j 32 -- -c frontier + ./mfc.sh test --max-attempts 3 -j 32 -- -c frontier fi diff --git a/toolchain/mfc/run/run.py b/toolchain/mfc/run/run.py index fb7d528ff9..00a977fcfe 100644 --- a/toolchain/mfc/run/run.py +++ b/toolchain/mfc/run/run.py @@ -13,8 +13,7 @@ from ..common import format_list_to_string, file_dump_yaml from . import queues, input - - + def __validate_job_options() -> None: if not ARG("mpi") and any({ARG("nodes") > 1, ARG("tasks_per_node") > 1}): raise MFCException("RUN: Cannot run on more than one rank with --no-mpi.") From a201637352139a68571dfe77b0bfc66a40f04841 Mon Sep 17 00:00:00 2001 From: Malmahrouqi3 Date: Sun, 29 Jun 2025 01:29:46 -0400 Subject: [PATCH 14/15] lint and quick fix --- toolchain/mfc/run/run.py | 3 ++- toolchain/mfc/test/test.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/toolchain/mfc/run/run.py b/toolchain/mfc/run/run.py index 00a977fcfe..fb7d528ff9 100644 --- a/toolchain/mfc/run/run.py +++ b/toolchain/mfc/run/run.py @@ -13,7 +13,8 @@ from ..common import format_list_to_string, file_dump_yaml from . import queues, input - + + def __validate_job_options() -> None: if not ARG("mpi") and any({ARG("nodes") > 1, ARG("tasks_per_node") > 1}): raise MFCException("RUN: Cannot run on more than one rank with --no-mpi.") diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 0a7067de2c..aa0baf21cd 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -119,7 +119,7 @@ def test(): # Some cases require a specific build of MFC for features like Chemistry, # Analytically defined patches, and --case-optimization. Here, we build all # the unique versions of MFC we need to run cases. - if ARG("rdma_mpi") == True: + if ARG("rdma_mpi") and ARG("dry_run"): ARGS()["test_all"] = False codes = [PRE_PROCESS, SIMULATION] + ([POST_PROCESS] if ARG('test_all') else []) From fd71293b6ddb6f3e9bb22715db07f281c2eed569 Mon Sep 17 00:00:00 2001 From: mohdsaid497566 Date: Thu, 10 Jul 2025 21:15:22 -0400 Subject: [PATCH 15/15] quick fix for rdma_mpi not recognized --- toolchain/mfc/test/test.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 4e202e01bd..5e605be84d 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -192,10 +192,7 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}") return - if "RDMA MPI" in case.trace: - cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=True) - else: - cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) + cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt")