
Commit 1ef99e4 (parent: ca52ead)

experiment README and scripts

File tree: 8 files changed (+516 −81 lines)

Makefile

Lines changed: 14 additions & 4 deletions

```diff
@@ -1,12 +1,22 @@
 OUTPUT_DIR=./bin
 BINARY_NAME=simon
 LINUX_BINARY_NAME=simon_linux
+DARWIN_BINARY_NAME=simon_darwin
 
-.PHONY: local_build
-local_build:
-	GOARCH=amd64 GOOS=darwin go build -v -o $(OUTPUT_DIR)/$(BINARY_NAME) ./cmd
+.PHONY: darwin_build
+darwin_build:
+	GOARCH=amd64 GOOS=darwin go build -v -o $(OUTPUT_DIR)/$(DARWIN_BINARY_NAME) ./cmd
+	chmod +x $(OUTPUT_DIR)/$(DARWIN_BINARY_NAME)
+	rm -f $(OUTPUT_DIR)/$(BINARY_NAME)
+	ln -s $(OUTPUT_DIR)/$(DARWIN_BINARY_NAME) $(OUTPUT_DIR)/$(BINARY_NAME)
 
 .PHONY: linux_build
 linux_build:
 	GOARCH=amd64 GOOS=linux go build -v -o $(OUTPUT_DIR)/$(LINUX_BINARY_NAME) ./cmd
-	chmod +x $(OUTPUT_DIR)/$(LINUX_BINARY_NAME)
+	chmod +x $(OUTPUT_DIR)/$(LINUX_BINARY_NAME)
+	rm -f $(OUTPUT_DIR)/$(BINARY_NAME)
+	ln -s $(OUTPUT_DIR)/$(LINUX_BINARY_NAME) $(OUTPUT_DIR)/$(BINARY_NAME)
+
+.PHONY: clean_build
+clean_build:
+	rm -f $(OUTPUT_DIR)/*
```
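A minimal usage sketch of the new targets (names taken from the diff above). One hedged caveat: `ln -s` stores its first argument verbatim, and a relative target like `./bin/simon_darwin` is resolved relative to the directory containing the link (`bin/`), so the `simon` link as written can dangle; `ln -sf $(DARWIN_BINARY_NAME) $(OUTPUT_DIR)/$(BINARY_NAME)` would avoid that.

```bash
make darwin_build   # builds bin/simon_darwin and repoints the bin/simon symlink
make linux_build    # builds bin/simon_linux and repoints the bin/simon symlink
make clean_build    # removes everything under bin/
ls -l bin/simon     # inspect where the symlink currently points
```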

experiments/README.md

Lines changed: 47 additions & 0 deletions (new file)

## Steps

### Generate Run Scripts

```bash
cd kubernetes-scheduler-simulator/experiments
python run_scripts/generate_run_scripts.py > run_scripts/run_scripts_0511.sh
```

Script parameters:

- Date:
- FileList:
- PARALLEL: number of vCPUs / 2 (suggested; see the sketch below)
- NUM_REPEAT: number of repeated experiments (10 in the paper)
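A minimal sketch of the suggested PARALLEL arithmetic (assumes Linux coreutils for `nproc`; each run occupies about 2 vCPUs, per the timings below):

```bash
# One experiment per 2 vCPUs.
PARALLEL=$(( $(nproc) / 2 ))   # e.g., 16 on a 32-vCPU machine
echo "PARALLEL=${PARALLEL}"
```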

### Execute

```bash
PARALLEL=16 # for example
cat run_scripts_0511.sh | while read i; do printf "%q\n" "$i"; done | xargs --max-procs=${PARALLEL} -I CMD bash -c CMD
# bash run_scripts_0511.sh will run the experiments sequentially
```
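Unpacking the one-liner: `printf "%q\n"` shell-quotes each line of the run script so that `xargs -I CMD bash -c CMD` executes it as a single command, while `--max-procs` caps concurrency (GNU xargs). If GNU parallel happens to be installed, an equivalent sketch is shorter:

```bash
# Run each line of the script as a command, at most $PARALLEL at a time.
parallel -j "${PARALLEL}" < run_scripts_0511.sh
```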

~~1 month later...~~

- About 1 hour per experiment on 2 vCPUs.
- About 16 hours for all 1020 experiments on a 256-vCPU machine with PARALLEL=128.

### Merge

```bash
# pwd: kubernetes-scheduler-simulator/experiments
ln -s 2023_0511 data # symlink the result directory to data
cd analysis
bash merge_bash.sh
```
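A quick sanity check after merging, assuming the run outputs landed under `2023_0511/`; the merged CSVs are what the plotting step links next:

```bash
readlink data                  # -> 2023_0511
ls analysis/analysis_results/  # merged *.csv files written by merge_bash.sh
```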

### Plot

```bash
# pwd: kubernetes-scheduler-simulator/experiments
ln -s analysis/analysis_results/* plot/ # symlink all CSVs under analysis_results/ to plot/
cd plot
python plot_paib_alloc.py              # Fig. 9(a)
python plot_paib_frag_amount.py        # Fig. 7(a)
python plot_paib_frag_ratio.py         # Fig. 7(b)
python plot_paib_gpushare_alloc_bar.py # Fig. 11
python plot_paib_multigpu_alloc_bar.py # Fig. 12
python plot_paib_gpuspec_alloc_bar.py  # Fig. 13
python plot_paib_nongpu_alloc_bar.py   # Fig. 14
```
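Each plotting script saves its figure as a PDF in `plot/` when its SAVEFIG flag is set (the name comes from the script's `FIGNAME` constant; `plot/.gitignore` below keeps the generated PDFs and CSVs out of version control):

```bash
ls plot/*.pdf
# e.g., paib_multigpu_alloc_bar.pdf (Fig. 12), paib_gpuspec_alloc_bar.pdf (Fig. 13)
```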

experiments/plot/.gitignore

Lines changed: 2 additions & 0 deletions (new file)

```
*.pdf
*.csv
```
experiments/plot/plot_paib_gpuspec_alloc_bar.py (filename not shown in this capture; inferred from FIGNAME)

Lines changed: 148 additions & 0 deletions (new file)

```python
# %%
# 1016
import matplotlib
from matplotlib import style
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from IPython.display import display
from utils import parse_workload_name, POLICY_ABBR_DICT

PAPER_PLOT = False  # False: plot with thinner lines for DingTalk or doc use
SAVEFIG = False     # False: plt.show()
TUNE_RATIO = 1.3
FIGNAME = "paib_gpuspec_alloc_bar.pdf"

matplotlib.rcdefaults()
matplotlib.rcParams['pdf.fonttype'] = 42
if PAPER_PLOT:
    matplotlib.rcParams.update({"font.size": 24})  # 24 for (8,6), 16 for (4,3)
    matplotlib.rcParams['lines.linewidth'] = 4     # 2.5
else:
    matplotlib.rcParams.update({"font.size": 14})  # 24 for (8,6), 16 for (4,3)
    matplotlib.rcParams['lines.linewidth'] = 2     # 2.5

FILEDICT = {'alloc': "analysis_allo_discrete.csv",
            'frag_amount': "analysis_frag_discrete.csv",
            'frag_ratio': "analysis_frag_ratio_discrete.csv"}

# Load each analysis CSV, drop real-time workloads, and melt the per-arrival-rate
# columns into long form for plotting.
dfp_dict = {}
for type, file in FILEDICT.items():
    if not Path(file).exists():
        print("file %s not found" % type)
        continue
    dfn = pd.read_csv(file)
    dfn['real_time'] = dfn['workload'].apply(lambda x: True if 'real_time' in x else False)
    dfn = dfn[~dfn.real_time]
    dfn.drop(columns=['real_time'], inplace=True)

    dfn['workload'] = dfn.workload.apply(parse_workload_name)

    workload_order = ['EightGpu80', 'EightGpu60', 'FourGpu80', 'FourGpu60', 'TwoGpu80', 'TwoGpu60', 'OneGpu80', 'OneGpu60', 'ShareGpu80', 'ShareGpu60', 'hhpai_0820', 'hhpai_0905', 'hhpai_mvap_0820', 'hhpai_mvap_0905', 'mit', 'mvap', 'paib']
    workload_order_dict = dict(zip(workload_order, range(1, len(workload_order) + 1)))
    dfn.workload = dfn.workload.apply(lambda x: x if x not in workload_order_dict else "%02d-%s" % (workload_order_dict[x], x))

    # display(dfn)
    # print("SC_POLICY_LIST=[%s]" % (",".join("'%s'" % x for x in list(dfn.sc_policy.unique()))))

    dfn13 = dfn[dfn.tune == TUNE_RATIO].copy()
    cols = list(dfn13.columns)
    for col in ['workload', 'sc_policy', 'tune', 'seed', 'total_gpus']:
        if col in cols:
            cols.remove(col)

    if type == 'alloc':
        dfnp = pd.melt(dfn13, id_vars=['workload', 'sc_policy', 'seed'], value_vars=cols,
                       var_name='arrive_rate', value_name="alloc_rate")
        dfnp['alloc_rate_reverse'] = 100 - dfnp["alloc_rate"]
    else:  # 'frag_amount', 'frag_ratio'
        dfnp = pd.melt(dfn13, id_vars=['workload', 'sc_policy', 'seed'], value_vars=cols,
                       var_name='arrive_rate', value_name="frag_rate")

    dfnp.arrive_rate = dfnp.arrive_rate.apply(int)
    dfnp.sc_policy = dfnp.sc_policy.apply(lambda x: POLICY_ABBR_DICT.get(x, x))
    dfp_dict[type] = dfnp

# openb, production workloads:
# workload = 'cluster_openb-pod_openb-0820_gpu_nospec'

# openb, multi-GPU workloads
# workload = 'cluster_openb-pod_openb-0820_a20aoc_gpu_nospec'
# workload = 'cluster_openb-pod_openb-0820_a30aoc_gpu_nospec'
# workload = 'cluster_openb-pod_openb-0820_a40aoc_gpu_nospec'
# workload = 'cluster_openb-pod_openb-0820_a50aoc_gpu_nospec'
# workloads = ['cluster_openb-pod_openb-0820_a20aoc_gpu_nospec','cluster_openb-pod_openb-0820_a30aoc_gpu_nospec','cluster_openb-pod_openb-0820_a40aoc_gpu_nospec','cluster_openb-pod_openb-0820_a50aoc_gpu_nospec']

# openb, non-GPU workloads
# workload = 'cluster_openb-pod_openb-0820_nocpu_gpu_nospec'

# openb, heterogeneous GPU types
workload = 'cluster_openb-pod_openb-0820_gpu_gpuspec_05'
workload = 'cluster_openb-pod_openb-0820_gpu_gpuspec_10'
workload = 'cluster_openb-pod_openb-0820_gpu_gpuspec_20'
workload = 'cluster_openb-pod_openb-0820_gpu_gpuspec_25'
workload = 'cluster_openb-pod_openb-0820_gpu_gpuspec'
workloads = ['cluster_openb-pod_openb-0820_gpu_gpuspec_05',
             'cluster_openb-pod_openb-0820_gpu_gpuspec_10',
             'cluster_openb-pod_openb-0820_gpu_gpuspec_20',
             'cluster_openb-pod_openb-0820_gpu_gpuspec_25',
             'cluster_openb-pod_openb-0820_gpu_gpuspec']

# mvap, GPU-sharing workloads
# workload = 'cluster_mvap-pod_mvap-0820_nomem_no_time-cap_1.0'

# policy_keep = ['FGD', 'Clustering', 'Packing', 'DotProd', 'BestFit', 'Random']
policy_keep = ['FGD', 'BestFit', 'Packing', 'Clustering', 'DotProd', 'Random']
# policy_keep = ['BestFit', 'FGD Coarse', 'FGD']

# ['alloc', 'frag_amount', 'frag_ratio']
dfnp = dfp_dict['alloc']

yhead = 30
# Chained filter: pandas reindexes the second boolean mask (a warning is expected).
dfnpp = dfnp[dfnp.workload.isin(workloads)][dfnp.arrive_rate == 100].copy()
dfnpp.workload = dfnpp.workload.apply(lambda x:
    {
        'cluster_openb-pod_openb-0820_gpu_gpuspec_05': '5%',
        'cluster_openb-pod_openb-0820_gpu_gpuspec_10': '10%',
        'cluster_openb-pod_openb-0820_gpu_gpuspec_20': '20%',
        'cluster_openb-pod_openb-0820_gpu_gpuspec_25': '25%',
        'cluster_openb-pod_openb-0820_gpu_gpuspec': '33%',
    }.get(x, x))
dfnpp = dfnpp[dfnpp.sc_policy.isin(policy_keep)]
plt.figure(figsize=(10, 3), dpi=120)
bars = sns.barplot(data=dfnpp, x='workload', y='alloc_rate_reverse', hue='sc_policy', errorbar='sd', hue_order=policy_keep, order=['10%', '20%', '25%', '33%'], edgecolor="0")
# for i, container in enumerate(ax.containers):
#     ax.bar_label(container, label_type='edge', fmt='%0.1f%%', padding=10)

# Assign one hatch pattern per policy; seaborn orders patches hue by hue.
hatches = ["/", "\\", "|", "-", "+", "x", "o", "O", ".", "*"]
num_policy = len(policy_keep)
num_groups = len(bars.patches) // num_policy
for i, bar in enumerate(bars.patches):
    hatch_i = i // num_groups
    hatch_m = hatches[hatch_i % len(hatches)]
    bar.set_hatch(hatch_m)
bars.bar_label(bars.containers[0], label_type='edge', fmt='%0.1f%%', padding=5)

plt.xlabel('Percentage of GPUs occupied by workloads with GPU type constraints')
plt.ylabel('Unallocated GPU (%)')

plt.legend()
# plt.xlabel('Arrived Workload (in Percentage of Cluster GPU Capacity)')
# plt.ylabel('Unallocated GPU (%)')
# plt.xlim(100-yhead, None)
plt.ylim(0, 21.7)
# plt.title("%s" % (workload))
# plt.show()

plt.grid(linestyle='-.', alpha=0.8, axis='y')
# plt.legend(ncol=3, loc='upper right', bbox_to_anchor=(0.665, 1.03))
plt.legend(ncol=3, loc='upper left')
plt.xlabel('Proportion of workloads with GPU type constraints in terms of GPU requests')

SAVEFIG = True  # note: overrides the SAVEFIG flag set at the top
if SAVEFIG:
    plt.savefig(FIGNAME, bbox_inches='tight')
else:
    plt.show()
```
experiments/plot/plot_paib_multigpu_alloc_bar.py (filename not shown in this capture; inferred from FIGNAME)

Lines changed: 147 additions & 0 deletions (new file)

```python
# %%
# 1016
import matplotlib
from matplotlib import style
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from IPython.display import display
from utils import parse_workload_name, POLICY_ABBR_DICT

PAPER_PLOT = False  # False: plot with thinner lines for DingTalk or doc use
SAVEFIG = False     # False: plt.show()
TUNE_RATIO = 1.3
FIGNAME = "paib_multigpu_alloc_bar.pdf"

matplotlib.rcdefaults()
matplotlib.rcParams['pdf.fonttype'] = 42
if PAPER_PLOT:
    matplotlib.rcParams.update({"font.size": 24})  # 24 for (8,6), 16 for (4,3)
    matplotlib.rcParams['lines.linewidth'] = 4     # 2.5
else:
    matplotlib.rcParams.update({"font.size": 14})  # 24 for (8,6), 16 for (4,3)
    matplotlib.rcParams['lines.linewidth'] = 2     # 2.5

FILEDICT = {'alloc': "analysis_allo_discrete.csv",
            'frag_amount': "analysis_frag_discrete.csv",
            'frag_ratio': "analysis_frag_ratio_discrete.csv"}

# Same preprocessing as the other plot scripts: load each analysis CSV, drop
# real-time workloads, and melt the per-arrival-rate columns into long form.
dfp_dict = {}
for type, file in FILEDICT.items():
    if not Path(file).exists():
        print("file %s not found" % type)
        continue
    dfn = pd.read_csv(file)
    dfn['real_time'] = dfn['workload'].apply(lambda x: True if 'real_time' in x else False)
    dfn = dfn[~dfn.real_time]
    dfn.drop(columns=['real_time'], inplace=True)

    dfn['workload'] = dfn.workload.apply(parse_workload_name)

    workload_order = ['EightGpu80', 'EightGpu60', 'FourGpu80', 'FourGpu60', 'TwoGpu80', 'TwoGpu60', 'OneGpu80', 'OneGpu60', 'ShareGpu80', 'ShareGpu60', 'hhpai_0820', 'hhpai_0905', 'hhpai_mvap_0820', 'hhpai_mvap_0905', 'mit', 'mvap', 'paib']
    workload_order_dict = dict(zip(workload_order, range(1, len(workload_order) + 1)))
    dfn.workload = dfn.workload.apply(lambda x: x if x not in workload_order_dict else "%02d-%s" % (workload_order_dict[x], x))

    # display(dfn)
    # print("SC_POLICY_LIST=[%s]" % (",".join("'%s'" % x for x in list(dfn.sc_policy.unique()))))

    dfn13 = dfn[dfn.tune == TUNE_RATIO].copy()
    cols = list(dfn13.columns)
    for col in ['workload', 'sc_policy', 'tune', 'seed', 'total_gpus']:
        if col in cols:
            cols.remove(col)

    if type == 'alloc':
        dfnp = pd.melt(dfn13, id_vars=['workload', 'sc_policy', 'seed'], value_vars=cols,
                       var_name='arrive_rate', value_name="alloc_rate")
        dfnp['alloc_rate_reverse'] = 100 - dfnp["alloc_rate"]
    else:  # 'frag_amount', 'frag_ratio'
        dfnp = pd.melt(dfn13, id_vars=['workload', 'sc_policy', 'seed'], value_vars=cols,
                       var_name='arrive_rate', value_name="frag_rate")

    dfnp.arrive_rate = dfnp.arrive_rate.apply(int)
    dfnp.sc_policy = dfnp.sc_policy.apply(lambda x: POLICY_ABBR_DICT.get(x, x))
    dfp_dict[type] = dfnp

# openb, production workloads:
# workload = 'cluster_openb-pod_openb-0820_gpu_nospec'

# openb, multi-GPU workloads
# workload = 'cluster_openb-pod_openb-0820_a20aoc_gpu_nospec'
# workload = 'cluster_openb-pod_openb-0820_a30aoc_gpu_nospec'
# workload = 'cluster_openb-pod_openb-0820_a40aoc_gpu_nospec'
# workload = 'cluster_openb-pod_openb-0820_a50aoc_gpu_nospec'
workloads = ['cluster_openb-pod_openb-0820_a20aoc_gpu_nospec',
             'cluster_openb-pod_openb-0820_a30aoc_gpu_nospec',
             'cluster_openb-pod_openb-0820_a40aoc_gpu_nospec',
             'cluster_openb-pod_openb-0820_a50aoc_gpu_nospec']

# openb, non-GPU workloads
# workload = 'cluster_openb-pod_openb-0820_nocpu_gpu_nospec'

# openb, heterogeneous GPU types
# workload = 'cluster_openb-pod_openb-0820_gpu_gpuspec_05'
# workload = 'cluster_openb-pod_openb-0820_gpu_gpuspec_10'
# workload = 'cluster_openb-pod_openb-0820_gpu_gpuspec_20'
# workload = 'cluster_openb-pod_openb-0820_gpu_gpuspec_25'
# workload = 'cluster_openb-pod_openb-0820_gpu_gpuspec'

# mvap, GPU-sharing workloads
# workload = 'cluster_mvap-pod_mvap-0820_nomem_no_time-cap_1.0'

# policy_keep = ['FGD', 'Packing', 'Clustering', 'DotProd', 'BestFit', 'Random']
policy_keep = ['FGD', 'BestFit', 'Packing', 'Clustering', 'DotProd', 'Random']
# policy_keep = ['BestFit', 'FGD Coarse', 'FGD']

# ['alloc', 'frag_amount', 'frag_ratio']
dfnp = dfp_dict['alloc']

yhead = 30
# Chained filter: pandas reindexes the second boolean mask (a warning is expected).
dfnpp = dfnp[dfnp.workload.isin(workloads)][dfnp.arrive_rate == 100].copy()
# May need numeric_only=True on pandas >= 2.0, where non-numeric columns raise.
print(dfnpp[dfnpp.workload == workloads[3]].groupby(by='sc_policy').mean())
dfnpp.workload = dfnpp.workload.apply(lambda x:
    {
        'cluster_openb-pod_openb-0820_a20aoc_gpu_nospec': '20%',
        'cluster_openb-pod_openb-0820_a30aoc_gpu_nospec': '30%',
        'cluster_openb-pod_openb-0820_a40aoc_gpu_nospec': '40%',
        'cluster_openb-pod_openb-0820_a50aoc_gpu_nospec': '50%',
    }.get(x, x))
dfnpp = dfnpp[dfnpp.sc_policy.isin(policy_keep)]
plt.figure(figsize=(10, 3), dpi=120)
bars = sns.barplot(data=dfnpp, x='workload', y='alloc_rate_reverse', hue='sc_policy', errorbar='sd', hue_order=policy_keep, edgecolor="0")
# Assign one hatch pattern per policy; seaborn orders patches hue by hue.
hatches = ["/", "\\", "|", "-", "+", "x", "o", "O", ".", "*"]
num_policy = len(policy_keep)
num_groups = len(bars.patches) // num_policy
for i, bar in enumerate(bars.patches):
    hatch_i = i // num_groups
    hatch_m = hatches[hatch_i % len(hatches)]
    bar.set_hatch(hatch_m)

# for i, container in enumerate(ax.containers):
#     ax.bar_label(container, label_type='edge', fmt='%0.1f%%', padding=10)
bars.bar_label(bars.containers[0], label_type='edge', fmt='%0.1f%%', padding=10)

plt.xlabel('Percentage of GPUs occupied by multi-GPU workloads')
plt.ylabel('Unallocated GPU (%)')

plt.legend()
# plt.xlabel('Arrived Workload (in Percentage of Cluster GPU Capacity)')
# plt.ylabel('Unallocated GPU (%)')
# plt.xlim(100-yhead, None)
plt.ylim(0, 21.7)
# plt.title("%s" % (workload))
# plt.show()

plt.grid(linestyle='-.', alpha=0.8, axis='y')
# plt.legend(ncol=3, loc='upper right', bbox_to_anchor=(0.665, 1.03))
plt.legend(ncol=3, loc='upper left')
plt.xlabel('Proportion of multi-GPU workloads in terms of GPU requests')

SAVEFIG = True  # note: overrides the SAVEFIG flag set at the top
if SAVEFIG:
    plt.savefig(FIGNAME, bbox_inches='tight')
else:
    plt.show()
```
