Skip to content

Commit 2fa9a75

Browse files
authored
phlop runtime process monitoring (#881)
1 parent bf121b3 commit 2fa9a75

File tree

10 files changed

+139
-26
lines changed

10 files changed

+139
-26
lines changed

pyphare/pyphare/cpp/validate.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,11 +107,12 @@ def log_runtime_config():
107107
git_hash=get_git_hash(),
108108
)
109109

110+
rank_info_dir = DOT_PHARE_DIR / "rank_info"
110111
if cpp_lib.mpi_rank() == 0:
111-
DOT_PHARE_DIR.mkdir(exist_ok=True, parents=True)
112+
rank_info_dir.mkdir(exist_ok=True, parents=True)
112113
cpp_lib.mpi_barrier()
113114

114-
rank_dir = DOT_PHARE_DIR / f"rank_{cpp_lib.mpi_rank()}"
115+
rank_dir = rank_info_dir / f"{cpp_lib.mpi_rank()}"
115116
rank_dir.mkdir(exist_ok=True)
116117

117118
with open(rank_dir / "runtime_config.json", "w") as f:
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from pathlib import Path
2+
3+
4+
def have_phlop():
5+
from importlib.util import find_spec
6+
7+
try:
8+
return find_spec("phlop.dict") is not None
9+
except (ImportError, ModuleNotFoundError):
10+
return False
11+
12+
13+
def valdict(**kwargs):
14+
if not have_phlop():
15+
return dict
16+
17+
from phlop.dict import ValDict # pylint: disable=import-error
18+
19+
return ValDict(**kwargs)
20+
21+
22+
_globals = valdict(stats_man=None)
23+
24+
25+
def monitoring_yaml_file(cpplib):
26+
path = Path(".phare") / "stats" / f"rank.{cpplib.mpi_rank()}.yaml"
27+
path.parent.mkdir(exist_ok=True, parents=True)
28+
return path
29+
30+
31+
def setup_monitoring(cpplib, interval=10):
32+
if not have_phlop():
33+
return
34+
35+
from phlop.app import stats_man as sm # pylint: disable=import-error
36+
37+
_globals.stats_man = sm.AttachableRuntimeStatsManager(
38+
valdict(yaml=monitoring_yaml_file(cpplib), interval=interval),
39+
dict(rank=cpplib.mpi_rank()),
40+
).start()
41+
42+
43+
def monitoring_shutdown(cpplib):
44+
if not have_phlop():
45+
return
46+
47+
if _globals.stats_man:
48+
_globals.stats_man.kill().join()

pyphare/pyphare/simulator/simulator.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,19 @@
22
#
33
#
44

5+
import os
56
import datetime
67
import atexit
78
import time as timem
89
import numpy as np
910
import pyphare.pharein as ph
11+
from pathlib import Path
12+
from . import monitoring as mon
1013

1114

1215
life_cycles = {}
16+
SIM_MONITOR = os.getenv("PHARE_SIM_MON", "False").lower() in ("true", "1", "t")
17+
SCOPE_TIMING = os.getenv("PHARE_SCOPE_TIMING", "False").lower() in ("true", "1", "t")
1318

1419

1520
@atexit.register
@@ -24,6 +29,9 @@ def simulator_shutdown():
2429
def make_cpp_simulator(dim, interp, nbrRefinedPart, hier):
2530
from pyphare.cpp import cpp_lib
2631

32+
if SCOPE_TIMING:
33+
Path(".phare/timings").mkdir(exist_ok=True)
34+
2735
make_sim = f"make_simulator_{dim}_{interp}_{nbrRefinedPart}"
2836
return getattr(cpp_lib(), make_sim)(hier)
2937

@@ -127,6 +135,7 @@ def initialize(self):
127135

128136
self.cpp_sim.initialize()
129137
self._auto_dump() # first dump might be before first advance
138+
130139
return self
131140
except:
132141
import sys
@@ -140,7 +149,6 @@ def initialize(self):
140149

141150
def _throw(self, e):
142151
import sys
143-
from pyphare.cpp import cpp_lib
144152

145153
print_rank0(e)
146154
sys.exit(1)
@@ -170,12 +178,19 @@ def times(self):
170178
self.timeStep(),
171179
)
172180

173-
def run(self, plot_times=False):
181+
def run(self, plot_times=False, monitoring=None):
182+
"""monitoring requires phlop"""
174183
from pyphare.cpp import cpp_lib
175184

176185
self._check_init()
186+
187+
if monitoring is None: # check env
188+
monitoring = SIM_MONITOR
189+
177190
if self.simulation.dry_run:
178191
return self
192+
if monitoring:
193+
mon.setup_monitoring(cpp_lib())
179194
perf = []
180195
end_time = self.cpp_sim.endTime()
181196
t = self.cpp_sim.currentTime()
@@ -197,6 +212,7 @@ def run(self, plot_times=False):
197212
if plot_times:
198213
plot_timestep_time(perf)
199214

215+
mon.monitoring_shutdown(cpp_lib())
200216
return self.reset()
201217

202218
def _auto_dump(self):
@@ -263,13 +279,10 @@ def _log_to_file(self):
263279
DATETIME_FILES - logfile with starting datetime timestamp per rank
264280
NONE - no logging files, display to cout
265281
"""
266-
import os
267282

268283
if "PHARE_LOG" not in os.environ:
269284
os.environ["PHARE_LOG"] = "RANK_FILES"
270285
from pyphare.cpp import cpp_lib
271286

272287
if os.environ["PHARE_LOG"] != "NONE" and cpp_lib().mpi_rank() == 0:
273-
from pathlib import Path
274-
275288
Path(".log").mkdir(exist_ok=True)

src/amr/solvers/solver_ppc.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,8 @@ void SolverPPC<HybridModel, AMR_Types>::fillMessengerInfo(
216216
template<typename HybridModel, typename AMR_Types>
217217
void SolverPPC<HybridModel, AMR_Types>::saveState_(level_t& level, ModelViews_t& views)
218218
{
219+
PHARE_LOG_SCOPE(1, "SolverPPC::saveState_");
220+
219221
for (auto& state : views)
220222
{
221223
std::stringstream ss;
@@ -232,6 +234,8 @@ void SolverPPC<HybridModel, AMR_Types>::saveState_(level_t& level, ModelViews_t&
232234
template<typename HybridModel, typename AMR_Types>
233235
void SolverPPC<HybridModel, AMR_Types>::restoreState_(level_t& level, ModelViews_t& views)
234236
{
237+
PHARE_LOG_SCOPE(1, "SolverPPC::restoreState_");
238+
235239
for (auto& state : views)
236240
{
237241
std::stringstream ss;

src/hdf5/detail/h5/h5_file.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ class HighFiveFile
5252
fapl.add(HighFive::MPIOFileAccess{MPI_COMM_WORLD, MPI_INFO_NULL});
5353
#else
5454
std::cout << "WARNING: PARALLEL HDF5 not available" << std::endl;
55-
if (core::mpi_size() > 1)
55+
if (core::mpi::size() > 1)
5656
{
5757
throw std::runtime_error("HDF5 NOT PARALLEL!");
5858
}

src/phare/phare.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class SamraiLifeCycle
4343
PHARE_WITH_PHLOP( //
4444
if (auto e = core::get_env("PHARE_SCOPE_TIMING", "false"); e == "1" || e == "true")
4545
phlop::ScopeTimerMan::INSTANCE()
46-
.file_name(".phare_times." + std::to_string(core::mpi::rank()) + ".txt")
46+
.file_name(".phare/timings/rank." + std::to_string(core::mpi::rank()) + ".txt")
4747
.init(); //
4848
)
4949
}

src/simulator/simulator.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,8 @@ std::string Simulator<_dimension, _interp_order, _nbRefinedPart>::to_str()
367367
template<std::size_t _dimension, std::size_t _interp_order, std::size_t _nbRefinedPart>
368368
void Simulator<_dimension, _interp_order, _nbRefinedPart>::initialize()
369369
{
370+
PHARE_LOG_SCOPE(1, "Simulator::initialize");
371+
370372
try
371373
{
372374
if (isInitialized)
@@ -414,8 +416,6 @@ double Simulator<_dimension, _interp_order, _nbRefinedPart>::advance(double dt)
414416

415417
try
416418
{
417-
PHARE_LOG_SCOPE(1, "Simulator::advance");
418-
419419
dt_new = integrator_->advance(dt);
420420
currentTime_ = startTime_ + ((*timeStamper) += dt);
421421
}

tests/functional/harris/harris_2d_lb.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
mpl.use("Agg")
1717

18+
SCOPE_TIMING = os.getenv("PHARE_SCOPE_TIMING", "True").lower() in ("true", "1", "t")
1819
LOAD_BALANCE = os.getenv("LOAD_BALANCE", "True").lower() in ("true", "1", "t")
1920

2021
cpp = cpp_lib()
@@ -171,12 +172,12 @@ def plot(diag_dir):
171172
for c in ["x", "y", "z"]:
172173
run.GetB(time).plot(
173174
filename=plot_file_for_qty(f"b{c}", time),
174-
qty=f"B{c}",
175+
qty=f"{c}",
175176
plot_patches=True,
176177
)
177178
run.GetJ(time).plot(
178179
filename=plot_file_for_qty("jz", time),
179-
qty="Jz",
180+
qty="z",
180181
plot_patches=True,
181182
vmin=-2,
182183
vmax=2,
@@ -200,7 +201,8 @@ def test_run(self):
200201
Simulator(config()).run().reset()
201202
if cpp.mpi_rank() == 0:
202203
plot(diag_dir)
203-
m_plotting.plot_run_timer_data(diag_dir, cpp.mpi_rank())
204+
if SCOPE_TIMING:
205+
m_plotting.plot_run_timer_data(diag_dir, cpp.mpi_rank())
204206
cpp.mpi_barrier()
205207
return self
206208

tools/python3/phloping.py

Lines changed: 56 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,19 @@
22
# parsing PHARE scope funtion timers
33
#
44

5+
import sys
6+
import argparse
57
import numpy as np
68
from dataclasses import dataclass, field
79

810
from pyphare.pharesee.run import Run
9-
from pyphare.pharesee.hierarchy import hierarchy_from
10-
11-
from phlop.timing.scope_timer import ScopeTimerFile as phScopeTimerFile
12-
from phlop.timing.scope_timer import file_parser as phfile_parser
13-
11+
from phlop.timing import scope_timer as st
1412

1513
substeps_per_finer_level = 4
1614

1715

1816
@dataclass
19-
class ScopeTimerFile(phScopeTimerFile):
17+
class ScopeTimerFile(st.ScopeTimerFile):
2018
run: Run
2119
rank: str
2220
advances: list = field(default_factory=lambda: [])
@@ -124,20 +122,67 @@ def normalised_times_for_L(self, ilvl):
124122
"""
125123
Normalise substep time against particle count for that level
126124
at the most recent coarse time, no refined timesteps
125+
Particle counts may include init dump, so be one bigger.
127126
"""
128127
times = self.advance_times_for_L(ilvl)
128+
counts = len(self.particles_per_level_per_time_step[ilvl])
129+
130+
# trim init particle count for lvl
131+
Li_times = (
132+
self.particles_per_level_per_time_step[ilvl]
133+
if counts == len(times)
134+
else self.particles_per_level_per_time_step[ilvl][1:]
135+
)
129136
if ilvl == 0:
130-
return times / self.particles_per_level_per_time_step[0]
137+
return times / Li_times
131138
substeps = self.steps_per_coarse_timestep_for_L(ilvl)
132139
norm_times = times.copy()
133140
return (
134141
norm_times.reshape(int(times.shape[0] / substeps), substeps)
135-
/ self.particles_per_level_per_time_step[ilvl].reshape(
136-
self.particles_per_level_per_time_step[ilvl].shape[0], 1
137-
)
142+
/ Li_times.reshape(Li_times.shape[0], 1)
138143
).reshape(times.shape[0])
139144

140145

141146
def file_parser(run, rank, times_filepath):
142-
supe = phfile_parser(times_filepath)
147+
supe = st.file_parser(times_filepath)
143148
return ScopeTimerFile(supe.id_keys, supe.roots, run, str(rank))
149+
150+
151+
def write_root_as_csv(scope_timer_file, outfile, headers=None, regex=None):
152+
from contextlib import redirect_stdout
153+
154+
with open(outfile, "w") as f:
155+
with redirect_stdout(f):
156+
print_root_as_csv(scope_timer_file, headers, regex)
157+
158+
159+
def print_root_as_csv(scope_timer_file, n_parts, headers=None, regex=None):
160+
stf = scope_timer_file # alias
161+
stf = file_parser(stf) if isinstance(stf, str) else stf
162+
163+
if headers:
164+
print(",".join(headers))
165+
for root in stf.roots:
166+
s = stf(root.k)
167+
if regex and regex not in s:
168+
continue
169+
bits = s.split(",")
170+
print(f"{s}{root.t},{root.t/n_parts}")
171+
172+
173+
def print_variance_across(scope_timer_filepath=None):
174+
if scope_timer_filepath is None: # assume cli
175+
parser = argparse.ArgumentParser()
176+
parser.add_argument("-f", "--file", default=None, help="timer file")
177+
scope_timer_filepath = parser.parse_args().file
178+
if not scope_timer_filepath:
179+
parser.print_help()
180+
sys.exit(1)
181+
st.print_variance_across(scope_timer_filepath)
182+
183+
184+
if __name__ == "__main__":
185+
if len(sys.argv) > 1:
186+
fn = sys.argv[1]
187+
sys.argv = [sys.argv[0]] + sys.argv[2:]
188+
globals()[fn]()

tools/python3/plotting.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def plot_run_timer_data(diag_dir=None, rank=0):
2525
parser.add_argument("-d", "--dir", default=".", help="Diagnostics directory")
2626
diag_dir = parser.parse_args().dir
2727
run = Run(diag_dir)
28-
res = phloping.file_parser(run, rank, Path(f".phare_times.{rank}.txt"))
28+
res = phloping.file_parser(run, rank, Path(f".phare/timings/rank.{rank}.txt"))
2929
fig, ax = plt.subplots()
3030
L0X = res.time_steps_for_L(0)
3131
ax.plot(L0X, res.normalised_times_for_L(0), ":", label="L0 times", color="black")

0 commit comments

Comments
 (0)