diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index c2e189342..eb182d69a 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -7,11 +7,11 @@ fi . ./mfc.sh load -c f -m g -if [ "$2" == "bench" ]; then +if [ "$2" = "bench" ]; then for dir in benchmarks/*/; do dirname=$(basename "$dir") ./mfc.sh run "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts done else - ./mfc.sh test --dry-run -j 8 $build_opts -fi + ./mfc.sh test --dry-run --rdma-mpi -j 8 $build_opts +fi \ No newline at end of file diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh index 57481fa94..8eca65320 100644 --- a/.github/workflows/frontier/test.sh +++ b/.github/workflows/frontier/test.sh @@ -4,7 +4,7 @@ gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c` if [ "$job_device" = "gpu" ]; then - ./mfc.sh test --max-attempts 3 -j $ngpus -- -c frontier + ./mfc.sh test --rdma-mpi --max-attempts 3 -j $ngpus -- -c frontier else ./mfc.sh test --max-attempts 3 -j 32 -- -c frontier fi diff --git a/docs/documentation/testing.md b/docs/documentation/testing.md index 15b1ca361..e139c5b20 100644 --- a/docs/documentation/testing.md +++ b/docs/documentation/testing.md @@ -16,6 +16,7 @@ A test is considered passing when our error tolerances are met in order to maint - `--percent` (`%`) to specify a percentage of the test suite to select at random and test - `--max-attempts` (`-m`) the maximum number of attempts to make on a test before considering it failed - `--no-examples` skips the testing of cases in the examples folder +- `--rdma-mpi` runs additional tests where RDMA MPI is enabled. To specify a computer, pass the `-c` flag to `./mfc.sh run` like so: ```shell diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py index e476e0a20..11f766894 100644 --- a/toolchain/mfc/args.py +++ b/toolchain/mfc/args.py @@ -76,14 +76,15 @@ def add_common_arguments(p, mask = None): test.add_argument("-l", "--list", action="store_true", help="List all available tests.") test.add_argument("-f", "--from", default=test_cases[0].get_uuid(), type=str, help="First test UUID to run.") test.add_argument("-t", "--to", default=test_cases[-1].get_uuid(), type=str, help="Last test UUID to run.") - test.add_argument("-o", "--only", nargs="+", type=str, default=[], metavar="L", help="Only run tests with specified properties.") - test.add_argument("-a", "--test-all", action="store_true", default=False, help="Run the Post Process Tests too.") - test.add_argument("-%", "--percent", type=int, default=100, help="Percentage of tests to run.") - test.add_argument("-m", "--max-attempts", type=int, default=1, help="Maximum number of attempts to run a test.") - test.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.") - test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases." ) - test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.") - test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.") + test.add_argument("-o", "--only", nargs="+", type=str, default=[], metavar="L", help="Only run tests with specified properties.") + test.add_argument("-a", "--test-all", action="store_true", default=False, help="Run the Post Process Tests too.") + test.add_argument("-%", "--percent", type=int, default=100, help="Percentage of tests to run.") + test.add_argument("-m", "--max-attempts", type=int, default=1, help="Maximum number of attempts to run a test.") + test.add_argument( "--rdma-mpi", action="store_true", default=False, help="Run tests with RDMA MPI enabled") + test.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.") + test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases." ) + test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.") + test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.") test_meg = test.add_mutually_exclusive_group() test_meg.add_argument("--generate", action="store_true", default=False, help="(Test Generation) Generate golden files.") diff --git a/toolchain/mfc/test/case.py b/toolchain/mfc/test/case.py index 97d4fd121..12bbc8fb0 100644 --- a/toolchain/mfc/test/case.py +++ b/toolchain/mfc/test/case.py @@ -133,7 +133,7 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces filepath = f'{self.get_dirpath()}/case.py' tasks = ["-n", str(self.ppn)] jobs = ["-j", str(ARG("jobs"))] if ARG("case_optimization") else [] - case_optimization = ["--case-optimization"] if ARG("case_optimization") else [] + case_optimization = ["--case-optimization"] if ARG("case_optimization") else [] if self.params.get("bubbles_lagrange", 'F') == 'T': input_bubbles_lagrange(self) diff --git a/toolchain/mfc/test/cases.py b/toolchain/mfc/test/cases.py index 6e1027d9c..1402bba5e 100644 --- a/toolchain/mfc/test/cases.py +++ b/toolchain/mfc/test/cases.py @@ -320,9 +320,10 @@ def alter_3d(): def alter_ppn(dimInfo): if len(dimInfo[0]) == 3: cases.append(define_case_d(stack, '2 MPI Ranks', {'m': 29, 'n': 29, 'p': 49}, ppn=2)) + cases.append(define_case_d(stack, '2 MPI Ranks -> RDMA MPI', {'m': 29, 'n': 29, 'p': 49, 'rdma_mpi': 'T'}, ppn=2)) else: cases.append(define_case_d(stack, '2 MPI Ranks', {}, ppn=2)) - + cases.append(define_case_d(stack, '2 MPI Ranks -> RDMA MPI', {'rdma_mpi': 'T'}, ppn=2)) def alter_ib(dimInfo, six_eqn_model=False): for slip in [True, False]: diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 28c3cfb53..5e605be84 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -5,7 +5,7 @@ from ..printer import cons from .. import common -from ..state import ARG +from ..state import ARG, ARGS from .case import TestCase from .cases import list_cases from .. import sched @@ -58,7 +58,12 @@ def __filter(cases_) -> typing.List[TestCase]: if case.ppn > 1 and not ARG("mpi"): cases.remove(case) skipped_cases.append(case) - + + for case in cases[:]: + if "RDMA MPI" in case.trace and not ARG("rdma_mpi"): + cases.remove(case) + skipped_cases.append(case) + for case in cases[:]: if ARG("single"): skip = ['low_Mach', 'Hypoelasticity', 'teno', 'Chemistry', 'Phase Change model 6' @@ -67,7 +72,6 @@ def __filter(cases_) -> typing.List[TestCase]: cases.remove(case) skipped_cases.append(case) - if ARG("no_examples"): example_cases = [case for case in cases if "Example" in case.trace] skipped_cases += example_cases @@ -89,7 +93,7 @@ def test(): global errors cases = list_cases() - + # Delete UUIDs that are not in the list of cases from tests/ if ARG("remove_old_tests"): dir_uuids = set(os.listdir(common.MFC_TEST_DIR)) @@ -120,6 +124,9 @@ def test(): # Some cases require a specific build of MFC for features like Chemistry, # Analytically defined patches, and --case-optimization. Here, we build all # the unique versions of MFC we need to run cases. + if ARG("rdma_mpi") and ARG("dry_run"): + ARGS()["test_all"] = False + codes = [PRE_PROCESS, SIMULATION] + ([POST_PROCESS] if ARG('test_all') else []) unique_builds = set() for case, code in itertools.product(cases, codes): @@ -186,6 +193,7 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): return cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) + out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt") common.file_write(out_filepath, cmd.stdout)