From 8d8e25ad8b990e2cf41c8d199c0fba8f8f684e5b Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 20 Apr 2025 08:30:57 +0200 Subject: [PATCH 1/5] {2023.06}[2023a,a64fx] add TensorFlow 2.13 --- .../2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml | 14 +++++++------- eb_hooks.py | 7 ++++++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/easystacks/software.eessi.io/2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml b/easystacks/software.eessi.io/2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml index fe1e7d3631..c5269d459a 100644 --- a/easystacks/software.eessi.io/2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml +++ b/easystacks/software.eessi.io/2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml @@ -45,13 +45,13 @@ easyconfigs: - nsync-1.26.0-GCCcore-12.3.0.eb - RE2-2023-08-01-GCCcore-12.3.0.eb - protobuf-python-4.24.0-GCCcore-12.3.0.eb -## originally built with EB 4.8.2; PR 19268 included since EB 4.9.0 -## - TensorFlow-2.13.0-foss-2023a.eb: -## # patch setup.py for grpcio extension in TensorFlow 2.13.0 easyconfigs to take into account alternate sysroot; -## # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19268 -## options: -## from-pr: 19268 -# - TensorFlow-2.13.0-foss-2023a.eb +# originally built with EB 4.8.2; PR 19268 included since EB 4.9.0 +# - TensorFlow-2.13.0-foss-2023a.eb: +# # patch setup.py for grpcio extension in TensorFlow 2.13.0 easyconfigs to take into account alternate sysroot; +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19268 +# options: +# from-pr: 19268 + - TensorFlow-2.13.0-foss-2023a.eb # - X11-20230603-GCCcore-12.3.0.eb ## originally built with EB 4.8.2; PR 19339 included since EB 4.9.0 ## - HarfBuzz-5.3.1-GCCcore-12.3.0.eb: diff --git a/eb_hooks.py b/eb_hooks.py index 4cf4b7a1fb..f13d2b3f1d 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -132,7 +132,12 @@ def post_ready_hook(self, *args, **kwargs): memory_hungry_build_a64fx = cpu_target == CPU_TARGET_A64FX and self.name in ['Qt5', 'ROOT'] if memory_hungry_build or memory_hungry_build_a64fx: parallel = self.cfg['parallel'] - if parallel > 1: + if cpu_target == CPU_TARGET_A64FX and self.name in ['TensorFlow']: + if parallel > 1: + self.cfg['parallel'] = 2 + msg = "limiting parallelism to %s (was %s) for %s on %s to avoid out-of-memory failures during building/testing" + print_msg(msg % (self.cfg['parallel'], parallel, self.name, cpu_target), log=self.log) + elif parallel > 1: self.cfg['parallel'] = parallel // 2 msg = "limiting parallelism to %s (was %s) for %s to avoid out-of-memory failures during building/testing" print_msg(msg % (self.cfg['parallel'], parallel, self.name), log=self.log) From aaccd38a8fd330c1c0afb6e228b349d5a2ad231e Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 20 Apr 2025 19:12:11 +0200 Subject: [PATCH 2/5] set parallel = 2 --- eb_hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eb_hooks.py b/eb_hooks.py index f13d2b3f1d..4ea205a6cd 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -134,7 +134,7 @@ def post_ready_hook(self, *args, **kwargs): parallel = self.cfg['parallel'] if cpu_target == CPU_TARGET_A64FX and self.name in ['TensorFlow']: if parallel > 1: - self.cfg['parallel'] = 2 + self.cfg['parallel'] = 8 msg = "limiting parallelism to %s (was %s) for %s on %s to avoid out-of-memory failures during building/testing" print_msg(msg % (self.cfg['parallel'], parallel, self.name, cpu_target), log=self.log) elif parallel > 1: From 3038ee530f7dbf4daeeb440e2dd65c221ac806ce Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 20 Apr 2025 19:16:48 +0200 Subject: [PATCH 3/5] set parallel = 12 --- eb_hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eb_hooks.py b/eb_hooks.py index 4ea205a6cd..1dc426df9c 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -134,7 +134,7 @@ def post_ready_hook(self, *args, **kwargs): parallel = self.cfg['parallel'] if cpu_target == CPU_TARGET_A64FX and self.name in ['TensorFlow']: if parallel > 1: - self.cfg['parallel'] = 8 + self.cfg['parallel'] = 12 msg = "limiting parallelism to %s (was %s) for %s on %s to avoid out-of-memory failures during building/testing" print_msg(msg % (self.cfg['parallel'], parallel, self.name, cpu_target), log=self.log) elif parallel > 1: From b2842aefc97101be7ab04289d9ccb933725f619b Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 20 Apr 2025 19:18:07 +0200 Subject: [PATCH 4/5] set parallel = 16 --- eb_hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eb_hooks.py b/eb_hooks.py index 1dc426df9c..4f2cce40d2 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -134,7 +134,7 @@ def post_ready_hook(self, *args, **kwargs): parallel = self.cfg['parallel'] if cpu_target == CPU_TARGET_A64FX and self.name in ['TensorFlow']: if parallel > 1: - self.cfg['parallel'] = 12 + self.cfg['parallel'] = 16 msg = "limiting parallelism to %s (was %s) for %s on %s to avoid out-of-memory failures during building/testing" print_msg(msg % (self.cfg['parallel'], parallel, self.name, cpu_target), log=self.log) elif parallel > 1: From 30b5a73e804d650b79ef9abd3aea2d450ebcbe10 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Mon, 21 Apr 2025 08:19:30 +0200 Subject: [PATCH 5/5] hardcode parallel = 8 --- eb_hooks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 4f2cce40d2..5f5405c173 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -133,8 +133,9 @@ def post_ready_hook(self, *args, **kwargs): if memory_hungry_build or memory_hungry_build_a64fx: parallel = self.cfg['parallel'] if cpu_target == CPU_TARGET_A64FX and self.name in ['TensorFlow']: - if parallel > 1: - self.cfg['parallel'] = 16 + # limit parallelism to 8, builds with 12 and 16 failed on Deucalion + if parallel > 8: + self.cfg['parallel'] = 8 msg = "limiting parallelism to %s (was %s) for %s on %s to avoid out-of-memory failures during building/testing" print_msg(msg % (self.cfg['parallel'], parallel, self.name, cpu_target), log=self.log) elif parallel > 1: