Skip to content

Commit 65c58fa

Browse files
norlandrhagentomwhite
authored andcommitted
Adds logging [INFO] to TimelineVisualizationCallback and HistoryCallback (#548)
* added info level logging to HistoryCallback to share directory * add logging to timeline & update mkdir to pathlib style * moved figure creation into timeline on_compute_end(), removed create_timeline(), removed default logging info level from __init__ * removed history & timeline logging * adds more detail to diagnostics.md * Update docs/user-guide/diagnostics.md Co-authored-by: Tom White <tom.e.white@gmail.com> --------- Co-authored-by: Tom White <tom.e.white@gmail.com>
1 parent fd705a5 commit 65c58fa

File tree

2 files changed

+91
-76
lines changed

2 files changed

+91
-76
lines changed

cubed/diagnostics/timeline.py

Lines changed: 59 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
import os
21
import time
32
from dataclasses import asdict
3+
from pathlib import Path
44
from typing import Optional
55

66
import matplotlib.patches as mpatches
@@ -16,7 +16,7 @@
1616

1717

1818
class TimelineVisualizationCallback(Callback):
19-
def __init__(self, format: Optional[str] = None) -> None:
19+
def __init__(self, format: Optional[str] = "svg") -> None:
2020
self.format = format
2121

2222
def on_compute_start(self, event):
@@ -27,75 +27,62 @@ def on_task_end(self, event):
2727
self.stats.append(asdict(event))
2828

2929
def on_compute_end(self, event):
30-
end_tstamp = time.time()
31-
dst = f"history/{event.compute_id}"
32-
format = self.format
33-
create_timeline(self.stats, self.start_tstamp, end_tstamp, dst, format)
30+
self.end_tstamp = time.time()
3431

35-
36-
# copy of lithops function of the same name, and modified for different field names
37-
def create_timeline(stats, start_tstamp, end_tstamp, dst=None, format=None):
38-
stats_df = pd.DataFrame(stats)
39-
40-
stats_df = stats_df.sort_values(by=["task_create_tstamp", "name"], ascending=True)
41-
42-
total_calls = len(stats_df)
43-
44-
palette = sns.color_palette("deep", 6)
45-
46-
fig = pylab.figure(figsize=(10, 6))
47-
ax = fig.add_subplot(1, 1, 1)
48-
49-
y = np.arange(total_calls)
50-
point_size = 10
51-
52-
fields = [
53-
("task create", stats_df.task_create_tstamp - start_tstamp),
54-
("function start", stats_df.function_start_tstamp - start_tstamp),
55-
("function end", stats_df.function_end_tstamp - start_tstamp),
56-
("task result", stats_df.task_result_tstamp - start_tstamp),
57-
]
58-
59-
patches = []
60-
for f_i, (field_name, val) in enumerate(fields):
61-
ax.scatter(val, y, c=[palette[f_i]], edgecolor="none", s=point_size, alpha=0.8)
62-
patches.append(mpatches.Patch(color=palette[f_i], label=field_name))
63-
64-
ax.set_xlabel("Execution Time (sec)")
65-
ax.set_ylabel("Function Call")
66-
67-
legend = pylab.legend(handles=patches, loc="upper right", frameon=True)
68-
legend.get_frame().set_facecolor("#FFFFFF")
69-
70-
yplot_step = int(np.max([1, total_calls / 20]))
71-
y_ticks = np.arange(total_calls // yplot_step + 2) * yplot_step
72-
ax.set_yticks(y_ticks)
73-
ax.set_ylim(-0.02 * total_calls, total_calls * 1.02)
74-
for y in y_ticks:
75-
ax.axhline(y, c="k", alpha=0.1, linewidth=1)
76-
77-
max_seconds = np.max(end_tstamp - start_tstamp) * 1.25
78-
xplot_step = max(int(max_seconds / 8), 1)
79-
x_ticks = np.arange(max_seconds // xplot_step + 2) * xplot_step
80-
ax.set_xlim(0, max_seconds)
81-
82-
ax.set_xticks(x_ticks)
83-
for x in x_ticks:
84-
ax.axvline(x, c="k", alpha=0.2, linewidth=0.8)
85-
86-
ax.grid(False)
87-
fig.tight_layout()
88-
89-
if format is None:
90-
format = "svg"
91-
92-
if dst is None:
93-
os.makedirs("plots", exist_ok=True)
94-
dst = os.path.join(
95-
os.getcwd(), "plots", "{}_{}".format(int(time.time()), f"timeline.{format}")
32+
stats_df = pd.DataFrame(self.stats)
33+
stats_df = stats_df.sort_values(
34+
by=["task_create_tstamp", "name"], ascending=True
9635
)
97-
else:
98-
dst = os.path.expanduser(dst) if "~" in dst else dst
99-
dst = "{}/{}".format(os.path.realpath(dst), f"timeline.{format}")
100-
101-
fig.savefig(dst)
36+
total_calls = len(stats_df)
37+
palette = sns.color_palette("deep", 6)
38+
39+
fig = pylab.figure(figsize=(10, 6))
40+
ax = fig.add_subplot(1, 1, 1)
41+
42+
y = np.arange(total_calls)
43+
point_size = 10
44+
45+
fields = [
46+
("task create", stats_df.task_create_tstamp - self.start_tstamp),
47+
("function start", stats_df.function_start_tstamp - self.start_tstamp),
48+
("function end", stats_df.function_end_tstamp - self.start_tstamp),
49+
("task result", stats_df.task_result_tstamp - self.start_tstamp),
50+
]
51+
52+
patches = []
53+
for f_i, (field_name, val) in enumerate(fields):
54+
ax.scatter(
55+
val, y, c=[palette[f_i]], edgecolor="none", s=point_size, alpha=0.8
56+
)
57+
patches.append(mpatches.Patch(color=palette[f_i], label=field_name))
58+
59+
ax.set_xlabel("Execution Time (sec)")
60+
ax.set_ylabel("Function Call")
61+
62+
legend = pylab.legend(handles=patches, loc="upper right", frameon=True)
63+
legend.get_frame().set_facecolor("#FFFFFF")
64+
65+
yplot_step = int(np.max([1, total_calls / 20]))
66+
y_ticks = np.arange(total_calls // yplot_step + 2) * yplot_step
67+
ax.set_yticks(y_ticks)
68+
ax.set_ylim(-0.02 * total_calls, total_calls * 1.02)
69+
for y in y_ticks:
70+
ax.axhline(y, c="k", alpha=0.1, linewidth=1)
71+
72+
max_seconds = np.max(self.end_tstamp - self.start_tstamp) * 1.25
73+
xplot_step = max(int(max_seconds / 8), 1)
74+
x_ticks = np.arange(max_seconds // xplot_step + 2) * xplot_step
75+
ax.set_xlim(0, max_seconds)
76+
77+
ax.set_xticks(x_ticks)
78+
for x in x_ticks:
79+
ax.axvline(x, c="k", alpha=0.2, linewidth=0.8)
80+
81+
ax.grid(False)
82+
fig.tight_layout()
83+
84+
self.dst = Path(f"history/{event.compute_id}")
85+
self.dst.mkdir(parents=True, exist_ok=True)
86+
self.dst = self.dst / f"timeline.{self.format}"
87+
88+
fig.savefig(self.dst)

docs/user-guide/diagnostics.md

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,12 @@ Array `c` is coloured orange, which means it is materialized as a Zarr array. Ar
4141

4242
Similarly, the operation that produces `c` is shown in a lilac colour to signify that it runs tasks to produce the output. Operations `op-001` and `op-002` don't run any tasks since `a` and `b` are just small constant arrays.
4343

44-
## Progress bar
44+
45+
## Callbacks
46+
47+
You can pass callbacks to functions that call `compute`, such as {py:func}`store <cubed.store>` or {py:func}`to_zarr <cubed.to_zarr>`.
48+
49+
### Progress bar
4550

4651
You can display a progress bar to track your computation by passing callbacks to {py:meth}`compute() <cubed.Array.compute()>`:
4752

@@ -53,14 +58,37 @@ You can display a progress bar to track your computation by passing callbacks to
5358
op-003 add 4/4 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% 0:00:00
5459
```
5560

61+
The two current progress bar choice are:
62+
- `from cubed.diagnostics.rich import RichProgressBar`
63+
- `from cubed.diagnostics.tqdm import TqdmProgressBar`
64+
65+
5666
This will work in Jupyter notebooks, and for all executors.
5767

58-
You can also pass callbacks to functions that call `compute`, such as {py:func}`store <cubed.store>` or {py:func}`to_zarr <cubed.to_zarr>`.
5968

60-
## History and timeline visualization
6169

62-
The history and timeline visualization callbacks can be used to find out how long tasks took to run, and how much memory they used.
70+
### History
71+
The history callback can be used to understand how long tasks took to run, and how much memory they used. The history callback will write [`events.csv`, `plan.csv` and `stats.csv`] to a new directory under the current directory with the schema `history/compute-{id}`.
72+
6373

74+
```ipython
75+
>>> from cubed.diagnostics.history import HistoryCallback
76+
>>> hist = HistoryCallback()
77+
>>> c.compute(callbacks=[hist])
78+
```
79+
80+
81+
### Timeline
6482
The timeline visualization is useful to determine how much time was spent in worker startup, as well as how much stragglers affected the overall time of the computation. (Ideally, we want vertical lines on this plot, which would represent perfect horizontal scaling.)
6583

84+
The timeline callback will write a graphic `timeline.svg` to a directory with the schema `history/compute-{id}`.
85+
86+
87+
```ipython
88+
>>> from cubed.diagnostics.timeline import TimelineVisualizationCallback
89+
>>> timeline_viz = TimelineVisualizationCallback()
90+
>>> c.compute(callbacks=[timeline_viz])
91+
```
92+
93+
### Examples in use
6694
See the [examples](https://github.com/cubed-dev/cubed/blob/main/examples/README.md) for more information about how to use them.

0 commit comments

Comments
 (0)