Skip to content

Commit 5422973

Browse files
committed
multi-gpu concurrent autotune exp
1 parent 7e97913 commit 5422973

File tree

2 files changed

+444
-0
lines changed

2 files changed

+444
-0
lines changed

capture_autotune_generations.py

Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Capture autotune min/mid/max from first two generations only.
4+
Kills the process after seeing Generation 2 data to save time.
5+
"""
6+
7+
import subprocess
8+
import re
9+
import time
10+
import os
11+
import sys
12+
import signal
13+
from pathlib import Path
14+
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
15+
import threading
16+
import json
17+
# psutil is optional; it is only used to size per-GPU CPU affinity ranges.
# NOTE(review): get_numa_cpu_affinity() re-imports psutil unconditionally,
# so this None fallback is not actually honored there — confirm intent.
try:
    import psutil
except ImportError:
    print("Warning: psutil not installed. CPU affinity detection will be limited.")
    psutil = None
22+
23+
class AutotuneCapture:
    """Parse autotune log lines and record the initial generation's timings.

    Attributes:
        initial_pop: dict with 'min'/'mid'/'max' floats once an
            initial-generation line has been parsed, else None.
        pattern: compiled regex matching lines such as
            "Initial population: ... min=1.23 mid=4.56 max=7.89".
    """

    def __init__(self):
        # Populated by parse_line() when the first matching line is seen.
        self.initial_pop = None
        self.pattern = re.compile(
            r'(?:Initial population|Initial generation):.*?'
            r'min=([\d.]+)\s+mid=([\d.]+)\s+max=([\d.]+)'
        )

    def parse_line(self, line):
        """Extract initial generation data from a line.

        Returns:
            'initial' if the line carried initial-generation data (which is
            also stored on self.initial_pop), otherwise None.
        """
        match = self.pattern.search(line)
        if match is None:
            return None
        # The alternation in the pattern already guarantees this is an
        # "Initial population"/"Initial generation" line, so the original
        # extra substring check was redundant and has been dropped.
        self.initial_pop = {
            'min': float(match.group(1)),
            'mid': float(match.group(2)),
            'max': float(match.group(3)),
        }
        return 'initial'
45+
46+
def monitor_process_output(proc, gpu_id, capture):
    """Monitor process output and kill after initial generation.

    Reads proc.stdout line by line, feeding each non-empty line to
    capture.parse_line(). As soon as the initial-generation data appears,
    the child is terminated, escalating to kill() if it does not exit
    promptly, and reaped via wait() so no zombie is left behind
    (the original terminate/sleep(1)/poll/kill sequence never waited on
    the child).

    Args:
        proc: a subprocess.Popen with text-mode stdout.
        gpu_id: GPU index, used only for log prefixes.
        capture: an AutotuneCapture accumulating parsed results.

    Returns:
        The same capture object, for caller convenience.
    """
    print(f"[GPU {gpu_id}] Monitoring autotune output...")

    for raw_line in proc.stdout:
        line = raw_line.strip()
        if not line:
            continue
        # Check for generation data
        if capture.parse_line(line) != 'initial':
            continue

        print(f"[GPU {gpu_id}] Initial generation: min={capture.initial_pop['min']:.4f} "
              f"mid={capture.initial_pop['mid']:.4f} max={capture.initial_pop['max']:.4f}")

        # Kill the process after initial generation
        print(f"[GPU {gpu_id}] Got initial generation data, terminating...")
        proc.terminate()
        try:
            # Reap the child instead of sleeping a fixed second; escalate
            # to SIGKILL only if SIGTERM was not honored in time.
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
        break

    return capture
69+
70+
def get_numa_cpu_affinity(gpu_id, total_gpus=8):
    """Get NUMA node and CPU cores for a GPU.

    Args:
        gpu_id: zero-based GPU index.
        total_gpus: number of GPUs the host's logical CPUs are split among.

    Returns:
        (numa_node, cpu_list) where cpu_list is a "start-end" logical-core
        range string suitable for ``taskset -c``.
    """
    # The module-level import of psutil may have failed (it is optional),
    # and the original unconditional re-import crashed in that case; fall
    # back to the stdlib core count instead.
    try:
        import psutil
        cpu_count_logical = psutil.cpu_count(logical=True) or os.cpu_count() or 1
    except ImportError:
        cpu_count_logical = os.cpu_count() or 1

    # Simple heuristic: distribute GPUs across available CPUs, giving each
    # GPU at least 4 logical cores.
    cpus_per_gpu = max(4, cpu_count_logical // total_gpus)
    start_cpu = gpu_id * cpus_per_gpu
    end_cpu = min(start_cpu + cpus_per_gpu - 1, cpu_count_logical - 1)
    # NOTE(review): on hosts with fewer cores than gpu_id * cpus_per_gpu the
    # range can come out inverted (end < start) — confirm expected topology.

    # Assume 2 NUMA nodes for simplicity; the max() guards against a
    # ZeroDivisionError when total_gpus < 2 (the original crashed there).
    numa_node = gpu_id // max(1, total_gpus // 2)

    return numa_node, f"{start_cpu}-{end_cpu}"
86+
87+
def run_autotune_capture(gpu_id, kernel='gemm', log_file=None, use_isolation=True):
    """Run benchmark and capture initial generation of autotune data.

    Launches ``benchmarks/run.py`` in a subprocess confined to one GPU via
    CUDA_VISIBLE_DEVICES, optionally wrapped in taskset + numactl for
    CPU/NUMA isolation, and lets monitor_process_output() terminate it as
    soon as the initial-generation autotune line is observed.

    Args:
        gpu_id: GPU index the child process runs on.
        kernel: benchmark kernel name forwarded to run.py.
        log_file: optional path; when set and data was captured, a small
            text summary is written there.
        use_isolation: prefix the command with taskset/numactl pinning.

    Returns:
        The AutotuneCapture; its initial_pop is None if the run produced
        no matching output.
    """
    env = os.environ.copy()
    env['CUDA_VISIBLE_DEVICES'] = str(gpu_id)

    # Enable Helion autotune logging
    env['HELION_AUTOTUNE_LOG_LEVEL'] = '10' # DEBUG level

    # Set thread limits to prevent oversubscription
    numa_node, cpu_list = get_numa_cpu_affinity(gpu_id)
    # cpu_list is a "start-end" string; count the cores in that range so
    # OMP/MKL get exactly as many threads as pinned cores.
    num_cpus = len(range(int(cpu_list.split('-')[0]), int(cpu_list.split('-')[1]) + 1))
    env['OMP_NUM_THREADS'] = str(num_cpus)
    env['MKL_NUM_THREADS'] = str(num_cpus)

    cmd = []

    # Add isolation commands if requested
    if use_isolation:
        # CPU pinning with taskset
        cmd.extend(['taskset', '-c', cpu_list])

        # NUMA binding with numactl
        # NOTE(review): taskset -c and numactl --cpunodebind can conflict if
        # the core range spans NUMA nodes — confirm the intended precedence.
        cmd.extend(['numactl', f'--cpunodebind={numa_node}', f'--membind={numa_node}'])

    # Python command
    cmd.extend([
        sys.executable,
        'benchmarks/run.py',
        '--kernel', kernel,
        '--num-inputs', '1'
    ])

    if use_isolation:
        print(f"[GPU {gpu_id}] Starting autotune capture with isolation:")
        print(f" NUMA node: {numa_node}, CPU cores: {cpu_list}")
    else:
        print(f"[GPU {gpu_id}] Starting autotune capture...")

    capture = AutotuneCapture()

    # Start process. stderr is merged into stdout so autotune log lines are
    # seen regardless of which stream the benchmark writes to.
    proc = subprocess.Popen(
        cmd,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1 # Line buffered
    )

    # Monitor output; on any monitoring error, make sure the child dies.
    try:
        monitor_process_output(proc, gpu_id, capture)
    except Exception as e:
        print(f"[GPU {gpu_id}] Error: {e}")
        proc.kill()

    # Save to log file if specified (skipped when nothing was captured).
    if log_file and capture.initial_pop:
        with open(log_file, 'w') as f:
            f.write(f"GPU {gpu_id} Autotune Results\n")
            f.write("="*40 + "\n")
            f.write(f"Initial: min={capture.initial_pop['min']:.4f} "
                    f"mid={capture.initial_pop['mid']:.4f} "
                    f"max={capture.initial_pop['max']:.4f}\n")

    return capture
154+
155+
def compare_single_vs_concurrent(kernel='gemm', num_gpus=4, use_isolation=True):
    """Compare autotune results between single GPU and concurrent execution.

    Runs the benchmark once on GPU 0 as a baseline, then concurrently on
    ``num_gpus`` GPUs via a process pool, and reports the relative change
    of the initial-generation min time as a contention indicator. Results
    are written under ``autotune_comparison/`` (per-GPU text files plus a
    summary.json).

    Args:
        kernel: benchmark kernel name forwarded to run_autotune_capture().
        num_gpus: number of GPUs used in the concurrent phase.
        use_isolation: enable taskset/numactl pinning for concurrent runs
            (the baseline always runs without isolation).
    """
    results_dir = Path('autotune_comparison')
    results_dir.mkdir(exist_ok=True)

    print("AUTOTUNE INITIAL GENERATION COMPARISON")
    print("="*60)
    print(f"Kernel: {kernel}")
    print(f"Capturing: Initial population/generation only")

    # Step 1: Single GPU baseline
    print(f"\nStep 1: Single GPU baseline (GPU 0)")
    print("-"*40)

    baseline_log = results_dir / 'baseline_gpu0.txt'
    baseline = run_autotune_capture(0, kernel, baseline_log, use_isolation=False) # Single GPU doesn't need isolation

    # Without a baseline the comparison is meaningless — bail out early.
    if not baseline.initial_pop:
        print("ERROR: Failed to capture initial generation for baseline")
        return

    # Cool down (lets GPU clocks/thermals settle between phases).
    print("\nCooling down for 30s...")
    time.sleep(30)

    # Step 2: Concurrent execution
    isolation_msg = "with process isolation" if use_isolation else "without isolation"
    print(f"\nStep 2: Concurrent execution on {num_gpus} GPUs {isolation_msg}")
    print("-"*40)

    concurrent_results = {}

    # One worker process per GPU; each worker runs the capture end to end.
    with ProcessPoolExecutor(max_workers=num_gpus) as executor:
        futures = {
            executor.submit(
                run_autotune_capture,
                gpu_id,
                kernel,
                results_dir / f'concurrent_gpu{gpu_id}.txt',
                use_isolation
            ): gpu_id
            for gpu_id in range(num_gpus)
        }

        # NOTE(review): iterating the dict waits in submission order rather
        # than completion order (as_completed would); harmless here since
        # all results are needed before analysis.
        for future in futures:
            gpu_id = futures[future]
            try:
                result = future.result()
                concurrent_results[gpu_id] = result
            except Exception as e:
                print(f"[GPU {gpu_id}] Failed: {e}")

    # Cool down
    print("\nCooling down for 30s...")
    time.sleep(30)

    # Analysis
    print("\n" + "="*60)
    print("RESULTS COMPARISON")
    print("="*60)

    print("\nBaseline (Single GPU):")
    print(f" Initial: min={baseline.initial_pop['min']:.4f} "
          f"mid={baseline.initial_pop['mid']:.4f} "
          f"max={baseline.initial_pop['max']:.4f}")

    print("\nConcurrent GPUs:")

    # Collect all concurrent min times (GPUs whose capture failed are skipped).
    concurrent_mins = []
    for gpu_id in sorted(concurrent_results.keys()):
        result = concurrent_results[gpu_id]
        if result.initial_pop:
            concurrent_mins.append(result.initial_pop['min'])
            print(f" GPU {gpu_id} Initial: min={result.initial_pop['min']:.4f} "
                  f"mid={result.initial_pop['mid']:.4f} "
                  f"max={result.initial_pop['max']:.4f}")

    if concurrent_mins:
        # Compare min times: positive degradation means concurrent runs were slower.
        baseline_min = baseline.initial_pop['min']
        avg_concurrent_min = sum(concurrent_mins) / len(concurrent_mins)

        degradation = ((avg_concurrent_min - baseline_min) / baseline_min) * 100

        print(f"\nInitial Generation Min Time Comparison:")
        print(f" Baseline: {baseline_min:.4f}")
        print(f" Concurrent avg: {avg_concurrent_min:.4f}")
        print(f" Degradation: {degradation:+.1f}%")

        # Thresholds (5% / 2%) are heuristic severity buckets.
        if degradation > 5:
            print(f"\n⚠️ SIGNIFICANT CONTENTION DETECTED!")
            print(f" Concurrent autotuning shows {degradation:.1f}% worse min times")
            print(f" This indicates resource contention is affecting autotune quality")
        elif degradation > 2:
            print(f"\n⚡ MODERATE CONTENTION ({degradation:.1f}% degradation)")
        else:
            print(f"\n✅ MINIMAL CONTENTION ({degradation:.1f}% degradation)")

    # Save summary (machine-readable counterpart of the printed report).
    summary = {
        'kernel': kernel,
        'baseline': {
            'initial': baseline.initial_pop
        },
        'concurrent': {
            f'gpu_{gpu_id}': {
                'initial': result.initial_pop
            }
            for gpu_id, result in concurrent_results.items()
            if result.initial_pop
        }
    }

    # NOTE(review): json is already imported at module level; this local
    # import is redundant but harmless.
    import json
    with open(results_dir / 'summary.json', 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"\nDetailed results saved to {results_dir}/")
274+
275+
def main():
    """CLI entry point: parse arguments and run the comparison experiment."""
    import argparse

    arg_parser = argparse.ArgumentParser(description='Capture autotune generations for contention analysis')
    arg_parser.add_argument('--kernel', default='gemm', help='Kernel to test')
    arg_parser.add_argument('--num-gpus', type=int, default=4, help='Number of GPUs for concurrent test')
    arg_parser.add_argument('--no-isolation', action='store_true', help='Disable process isolation for concurrent runs')

    opts = arg_parser.parse_args()

    # Isolation is on by default and disabled only by the --no-isolation flag.
    compare_single_vs_concurrent(opts.kernel, opts.num_gpus, use_isolation=not opts.no_isolation)
285+
286+
# Run the experiment only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)