Skip to content

Test Suite Flag --rdma-mpi Implemented (#598) #878

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 33 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
14706c2
added flag --rdma-mpi
Malmahrouqi3 Jun 11, 2025
0dbef83
modifications on the flag use
Malmahrouqi3 Jun 11, 2025
3c492d4
flag use changes
Malmahrouqi3 Jun 11, 2025
8e27e41
lint
Malmahrouqi3 Jun 11, 2025
7fe5f32
updated testing docs
Malmahrouqi3 Jun 11, 2025
f788dfe
RDMA MPI added to Frontier CI
Malmahrouqi3 Jun 11, 2025
db9d7a0
Merge branch 'rdma_mpi' of https://github.com/mohdsaid497566/MFC-mo2 …
Malmahrouqi3 Jun 11, 2025
8a665b0
Merge branch 'master' into rdma_mpi
sbryngelson Jun 15, 2025
6aac583
fixed stuff
Jun 15, 2025
efa85fb
Merge branch 'rdma_mpi' of https://github.com/mohdsaid497566/MFC-mo2 …
Jun 15, 2025
902b1ea
re-run with lint/format
Jun 15, 2025
c9b1599
removed duplicate assignment of CMD
Malmahrouqi3 Jun 16, 2025
7839916
redundant code
Malmahrouqi3 Jun 16, 2025
43dcc34
Merge branch 'master' into rdma_mpi
sbryngelson Jun 16, 2025
518bf23
Merge branch 'master' into rdma_mpi
Malmahrouqi3 Jun 17, 2025
57b600b
Merge branch 'master' into rdma_mpi
sbryngelson Jun 18, 2025
50d5aea
Merge branch 'master' into rdma_mpi
sbryngelson Jun 21, 2025
df29256
Merge branch 'master' into rdma_mpi
sbryngelson Jun 21, 2025
edbd532
Merge branch 'master' into rdma_mpi
Malmahrouqi3 Jun 21, 2025
52018eb
Merge branch 'MFlowCode:master' into rdma_mpi
Malmahrouqi3 Jun 27, 2025
d707d5f
Merge branch 'MFlowCode:master' into rdma_mpi
Malmahrouqi3 Jun 28, 2025
76b1060
added rdma flag to frontier dry run
Malmahrouqi3 Jun 28, 2025
ed07060
dissociate forcefully rdma_mpi & test_all
Jun 29, 2025
de73a43
dissociate forcefully rdma_mpi from test_all
Jun 29, 2025
a201637
lint and quick fix
Jun 29, 2025
ccf2e4e
Merge branch 'master' into rdma_mpi
Malmahrouqi3 Jun 29, 2025
2a2602b
Merge branch 'master' into rdma_mpi
sbryngelson Jul 1, 2025
317c97d
Merge branch 'master' into rdma_mpi
sbryngelson Jul 3, 2025
6dd75e3
Merge branch 'master' into rdma_mpi
sbryngelson Jul 3, 2025
41239e8
Merge branch 'master' into rdma_mpi
sbryngelson Jul 3, 2025
fd71293
quick fix for rdma_mpi not recognized
Jul 11, 2025
b09bdcd
Merge branch 'master' into rdma_mpi
Malmahrouqi3 Jul 11, 2025
c0169e7
Merge branch 'master' into rdma_mpi
Malmahrouqi3 Jul 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/frontier/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ fi

. ./mfc.sh load -c f -m g

if [ "$2" == "bench" ]; then
if [ "$2" = "bench" ]; then
for dir in benchmarks/*/; do
dirname=$(basename "$dir")
./mfc.sh run "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts
done
else
./mfc.sh test --dry-run -j 8 $build_opts
fi
./mfc.sh test --dry-run --rdma-mpi -j 8 $build_opts
fi
2 changes: 1 addition & 1 deletion .github/workflows/frontier/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n'
ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c`

if [ "$job_device" = "gpu" ]; then
./mfc.sh test --max-attempts 3 -j $ngpus -- -c frontier
./mfc.sh test --rdma-mpi --max-attempts 3 -j $ngpus -- -c frontier
else
./mfc.sh test --max-attempts 3 -j 32 -- -c frontier
fi
1 change: 1 addition & 0 deletions docs/documentation/testing.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ A test is considered passing when our error tolerances are met in order to maint
- `--percent` (`%`) to specify a percentage of the test suite to select at random and test
- `--max-attempts` (`-m`) the maximum number of attempts to make on a test before considering it failed
- `--no-examples` skips the testing of cases in the examples folder
- `--rdma-mpi` additionally runs the test cases that enable RDMA MPI (these cases are skipped unless the flag is passed)

To specify a computer, pass the `-c` flag to `./mfc.sh run` like so:
```shell
Expand Down
17 changes: 9 additions & 8 deletions toolchain/mfc/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,15 @@ def add_common_arguments(p, mask = None):
test.add_argument("-l", "--list", action="store_true", help="List all available tests.")
test.add_argument("-f", "--from", default=test_cases[0].get_uuid(), type=str, help="First test UUID to run.")
test.add_argument("-t", "--to", default=test_cases[-1].get_uuid(), type=str, help="Last test UUID to run.")
test.add_argument("-o", "--only", nargs="+", type=str, default=[], metavar="L", help="Only run tests with specified properties.")
test.add_argument("-a", "--test-all", action="store_true", default=False, help="Run the Post Process Tests too.")
test.add_argument("-%", "--percent", type=int, default=100, help="Percentage of tests to run.")
test.add_argument("-m", "--max-attempts", type=int, default=1, help="Maximum number of attempts to run a test.")
test.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases." )
test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.")
test.add_argument("-o", "--only", nargs="+", type=str, default=[], metavar="L", help="Only run tests with specified properties.")
test.add_argument("-a", "--test-all", action="store_true", default=False, help="Run the Post Process Tests too.")
test.add_argument("-%", "--percent", type=int, default=100, help="Percentage of tests to run.")
test.add_argument("-m", "--max-attempts", type=int, default=1, help="Maximum number of attempts to run a test.")
test.add_argument( "--rdma-mpi", action="store_true", default=False, help="Run tests with RDMA MPI enabled")
test.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases." )
test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.")

test_meg = test.add_mutually_exclusive_group()
test_meg.add_argument("--generate", action="store_true", default=False, help="(Test Generation) Generate golden files.")
Expand Down
2 changes: 1 addition & 1 deletion toolchain/mfc/test/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces
filepath = f'{self.get_dirpath()}/case.py'
tasks = ["-n", str(self.ppn)]
jobs = ["-j", str(ARG("jobs"))] if ARG("case_optimization") else []
case_optimization = ["--case-optimization"] if ARG("case_optimization") else []
case_optimization = ["--case-optimization"] if ARG("case_optimization") else []

if self.params.get("bubbles_lagrange", 'F') == 'T':
input_bubbles_lagrange(self)
Expand Down
3 changes: 2 additions & 1 deletion toolchain/mfc/test/cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,9 +320,10 @@ def alter_3d():
def alter_ppn(dimInfo):
if len(dimInfo[0]) == 3:
cases.append(define_case_d(stack, '2 MPI Ranks', {'m': 29, 'n': 29, 'p': 49}, ppn=2))
cases.append(define_case_d(stack, '2 MPI Ranks -> RDMA MPI', {'m': 29, 'n': 29, 'p': 49, 'rdma_mpi': 'T'}, ppn=2))
else:
cases.append(define_case_d(stack, '2 MPI Ranks', {}, ppn=2))

cases.append(define_case_d(stack, '2 MPI Ranks -> RDMA MPI', {'rdma_mpi': 'T'}, ppn=2))

def alter_ib(dimInfo, six_eqn_model=False):
for slip in [True, False]:
Expand Down
16 changes: 12 additions & 4 deletions toolchain/mfc/test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from ..printer import cons
from .. import common
from ..state import ARG
from ..state import ARG, ARGS
from .case import TestCase
from .cases import list_cases
from .. import sched
Expand Down Expand Up @@ -58,7 +58,12 @@ def __filter(cases_) -> typing.List[TestCase]:
if case.ppn > 1 and not ARG("mpi"):
cases.remove(case)
skipped_cases.append(case)


for case in cases[:]:
if "RDMA MPI" in case.trace and not ARG("rdma_mpi"):
cases.remove(case)
skipped_cases.append(case)

for case in cases[:]:
if ARG("single"):
skip = ['low_Mach', 'Hypoelasticity', 'teno', 'Chemistry', 'Phase Change model 6'
Expand All @@ -67,7 +72,6 @@ def __filter(cases_) -> typing.List[TestCase]:
cases.remove(case)
skipped_cases.append(case)


if ARG("no_examples"):
example_cases = [case for case in cases if "Example" in case.trace]
skipped_cases += example_cases
Expand All @@ -89,7 +93,7 @@ def test():
global errors

cases = list_cases()

# Delete UUIDs that are not in the list of cases from tests/
if ARG("remove_old_tests"):
dir_uuids = set(os.listdir(common.MFC_TEST_DIR))
Expand Down Expand Up @@ -120,6 +124,9 @@ def test():
# Some cases require a specific build of MFC for features like Chemistry,
# Analytically defined patches, and --case-optimization. Here, we build all
# the unique versions of MFC we need to run cases.
if ARG("rdma_mpi") and ARG("dry_run"):
ARGS()["test_all"] = False

codes = [PRE_PROCESS, SIMULATION] + ([POST_PROCESS] if ARG('test_all') else [])
unique_builds = set()
for case, code in itertools.product(cases, codes):
Expand Down Expand Up @@ -186,6 +193,7 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
return

cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)

out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt")

common.file_write(out_filepath, cmd.stdout)
Expand Down
Loading