|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Capture autotune min/mid/max from the initial generation only. |
| 4 | +Kills the process after seeing the initial generation data to save time. |
| 5 | +""" |
| 6 | + |
| 7 | +import subprocess |
| 8 | +import re |
| 9 | +import time |
| 10 | +import os |
| 11 | +import sys |
| 12 | +import signal |
| 13 | +from pathlib import Path |
| 14 | +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor |
| 15 | +import threading |
| 16 | +import json |
| 17 | +try: |
| 18 | + import psutil |
| 19 | +except ImportError: |
| 20 | + print("Warning: psutil not installed. CPU affinity detection will be limited.") |
| 21 | + psutil = None |
| 22 | + |
class AutotuneCapture:
    """Parse and hold the initial-generation timing stats from autotune logs.

    Attributes:
        initial_pop: ``None`` until :meth:`parse_line` sees a matching line,
            then a dict with float values under ``'min'``, ``'mid'`` and
            ``'max'``.
        pattern: Compiled regex that recognises an "Initial population:" or
            "Initial generation:" line carrying ``min=``, ``mid=`` and
            ``max=`` values separated by whitespace.
    """

    # One non-negative decimal number, optionally in scientific notation.
    # Deliberately stricter than ``[\d.]+`` so that every captured string is
    # something ``float()`` accepts: a trailing sentence period ("max=0.3.")
    # is left out of the capture and "0.1.2" is never captured as one token.
    _NUMBER = r'((?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)'

    def __init__(self):
        self.initial_pop = None
        self.pattern = re.compile(
            r'(?:Initial population|Initial generation):.*?'
            rf'min={self._NUMBER}\s+mid={self._NUMBER}\s+max={self._NUMBER}'
        )

    def parse_line(self, line):
        """Extract initial generation data from a line.

        Args:
            line: One line of benchmark output.

        Returns:
            ``'initial'`` when the line carried initial-generation stats (they
            are then stored in ``self.initial_pop``), otherwise ``None``.
        """
        match = self.pattern.search(line)
        if match is None:
            return None

        # The pattern itself requires the "Initial ..." prefix, so a match is
        # already known to be an initial-generation line.
        low, mid, high = (float(value) for value in match.groups())
        self.initial_pop = {'min': low, 'mid': mid, 'max': high}
        return 'initial'
| 45 | + |
def _stop_process(proc, grace_period):
    """Terminate *proc*, escalating to kill, and reap it so no zombie is left."""
    proc.terminate()
    try:
        # Returns as soon as the child exits instead of always sleeping for
        # the full grace period.
        proc.wait(timeout=grace_period)
        return
    except subprocess.TimeoutExpired:
        proc.kill()

    try:
        # Bounded so that a child that cannot be reaped promptly does not
        # block the caller forever.
        proc.wait(timeout=5.0)
    except subprocess.TimeoutExpired:
        pass


def monitor_process_output(proc, gpu_id, capture, grace_period=1.0):
    """Monitor process output and kill after initial generation.

    Reads ``proc.stdout`` line by line, feeding every non-blank line to
    ``capture.parse_line``. As soon as that reports ``'initial'`` the stats in
    ``capture.initial_pop`` are printed and the process is stopped. If the
    output ends without such a line, the process is left untouched.

    Args:
        proc: A ``subprocess.Popen``-like object with an iterable text
            ``stdout`` and ``terminate``/``kill``/``wait`` methods.
        gpu_id: Identifier used only to prefix the printed messages.
        capture: Object exposing ``parse_line(line)`` and ``initial_pop``.
        grace_period: Seconds to wait after ``terminate()`` before escalating
            to ``kill()``.

    Returns:
        The same ``capture`` object that was passed in.
    """
    print(f"[GPU {gpu_id}] Monitoring autotune output...")

    for raw_line in proc.stdout:
        line = raw_line.strip()
        if not line:
            continue
        if capture.parse_line(line) != 'initial':
            continue

        stats = capture.initial_pop
        print(f"[GPU {gpu_id}] Initial generation: min={stats['min']:.4f} "
              f"mid={stats['mid']:.4f} max={stats['max']:.4f}")

        # Nothing after the initial generation is needed; stop the run early.
        print(f"[GPU {gpu_id}] Got initial generation data, terminating...")
        _stop_process(proc, grace_period)
        break

    return capture
| 69 | + |
def get_numa_cpu_affinity(gpu_id, total_gpus=8):
    """Get NUMA node and CPU cores for a GPU.

    This is a heuristic, not a topology query: logical CPUs are split into
    equal contiguous slices (at least 4 CPUs wide) and the machine is assumed
    to have exactly 2 NUMA nodes, with the first half of the GPUs on node 0.

    Args:
        gpu_id: Zero-based GPU index.
        total_gpus: Number of GPUs the CPUs are divided among.

    Returns:
        A ``(numa_node, cpu_range)`` tuple where ``cpu_range`` is a
        ``"start-end"`` string with ``start <= end``.
    """
    # psutil is optional for this script; fall back to the standard library
    # when it is missing or cannot determine the CPU count.
    logical_cpus = None
    try:
        import psutil
    except ImportError:
        pass
    else:
        logical_cpus = psutil.cpu_count(logical=True)
    if not logical_cpus:
        logical_cpus = os.cpu_count() or 1

    # Simple heuristic: distribute GPUs across available CPUs.
    cpus_per_gpu = max(4, logical_cpus // max(1, total_gpus))
    last_cpu = logical_cpus - 1
    start_cpu = gpu_id * cpus_per_gpu
    if start_cpu > last_cpu:
        # The 4-CPU minimum can push the slice past the end of the machine;
        # reuse the last slice rather than emit an inverted range like "28-7".
        start_cpu = max(0, logical_cpus - cpus_per_gpu)
    end_cpu = min(start_cpu + cpus_per_gpu - 1, last_cpu)

    # Assume 2 NUMA nodes for simplicity. Guard the divisor for
    # total_gpus < 2 and clamp so out-of-range GPU ids still map to node 0 or 1.
    gpus_per_node = max(1, total_gpus // 2)
    numa_node = min(gpu_id // gpus_per_node, 1)

    return numa_node, f"{start_cpu}-{end_cpu}"
| 86 | + |
def _reap_process(proc, timeout=5.0):
    """Make sure *proc* has exited and been waited on, then close its pipe."""
    try:
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()
        try:
            proc.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            pass
    if proc.stdout is not None:
        proc.stdout.close()


def run_autotune_capture(gpu_id, kernel='gemm', log_file=None, use_isolation=True):
    """Run benchmark and capture initial generation of autotune data.

    Launches ``benchmarks/run.py --kernel <kernel> --num-inputs 1`` with the
    current interpreter, restricted to one GPU through
    ``CUDA_VISIBLE_DEVICES``, and watches its combined stdout/stderr until the
    initial-generation stats appear.

    Args:
        gpu_id: GPU index exported as ``CUDA_VISIBLE_DEVICES``.
        kernel: Value passed to the benchmark's ``--kernel`` option.
        log_file: Optional path; when given and stats were captured, a short
            text report is written there.
        use_isolation: When true, the command is wrapped in ``taskset`` and
            ``numactl`` using the ranges from ``get_numa_cpu_affinity``.

    Returns:
        The ``AutotuneCapture`` used for the run. Its ``initial_pop`` is
        ``None`` when nothing was captured or the process failed to start.
    """
    env = os.environ.copy()
    env['CUDA_VISIBLE_DEVICES'] = str(gpu_id)

    # Enable Helion autotune logging
    env['HELION_AUTOTUNE_LOG_LEVEL'] = '10'  # DEBUG level

    # Set thread limits to prevent oversubscription
    numa_node, cpu_list = get_numa_cpu_affinity(gpu_id)
    first_cpu, last_cpu = (int(part) for part in cpu_list.split('-'))
    # Never export a zero/negative thread count, even for a degenerate range.
    num_cpus = max(1, last_cpu - first_cpu + 1)
    env['OMP_NUM_THREADS'] = str(num_cpus)
    env['MKL_NUM_THREADS'] = str(num_cpus)

    cmd = []

    # Add isolation commands if requested
    if use_isolation:
        # CPU pinning with taskset
        cmd.extend(['taskset', '-c', cpu_list])

        # NUMA binding with numactl
        cmd.extend(['numactl', f'--cpunodebind={numa_node}', f'--membind={numa_node}'])

    # Python command
    cmd.extend([
        sys.executable,
        'benchmarks/run.py',
        '--kernel', kernel,
        '--num-inputs', '1',
    ])

    if use_isolation:
        print(f"[GPU {gpu_id}] Starting autotune capture with isolation:")
        print(f" NUMA node: {numa_node}, CPU cores: {cpu_list}")
    else:
        print(f"[GPU {gpu_id}] Starting autotune capture...")

    capture = AutotuneCapture()

    # Start process. A missing executable (e.g. taskset/numactl not installed)
    # is reported like any other failure instead of aborting the caller.
    try:
        proc = subprocess.Popen(
            cmd,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,  # Line buffered
        )
    except OSError as e:
        print(f"[GPU {gpu_id}] Error: {e}")
        return capture

    # Monitor output
    try:
        monitor_process_output(proc, gpu_id, capture)
    except Exception as e:
        print(f"[GPU {gpu_id}] Error: {e}")
        proc.kill()
    finally:
        # Always wait on the child and close the pipe so neither a zombie
        # process nor an open file descriptor outlives this call.
        _reap_process(proc)

    # Save to log file if specified
    if log_file and capture.initial_pop:
        stats = capture.initial_pop
        with open(log_file, 'w', encoding='utf-8') as f:
            f.write(f"GPU {gpu_id} Autotune Results\n")
            f.write("="*40 + "\n")
            f.write(f"Initial: min={stats['min']:.4f} "
                    f"mid={stats['mid']:.4f} "
                    f"max={stats['max']:.4f}\n")

    return capture
| 154 | + |
def _format_stats(stats):
    """Render a min/mid/max stats dict the way every report line prints it."""
    return (f"min={stats['min']:.4f} "
            f"mid={stats['mid']:.4f} "
            f"max={stats['max']:.4f}")


def _report_degradation(baseline_min, concurrent_mins):
    """Print how the average concurrent min compares with the baseline min."""
    avg_concurrent_min = sum(concurrent_mins) / len(concurrent_mins)

    print("\nInitial Generation Min Time Comparison:")
    print(f" Baseline: {baseline_min:.4f}")
    print(f" Concurrent avg: {avg_concurrent_min:.4f}")

    if not baseline_min:
        # A zero baseline makes the relative change undefined; say so rather
        # than crash with ZeroDivisionError after all the runs have finished.
        print(" Degradation: n/a (baseline min is zero)")
        return

    degradation = ((avg_concurrent_min - baseline_min) / baseline_min) * 100
    print(f" Degradation: {degradation:+.1f}%")

    if degradation > 5:
        print("\n⚠️ SIGNIFICANT CONTENTION DETECTED!")
        print(f" Concurrent autotuning shows {degradation:.1f}% worse min times")
        print(" This indicates resource contention is affecting autotune quality")
    elif degradation > 2:
        print(f"\n⚡ MODERATE CONTENTION ({degradation:.1f}% degradation)")
    else:
        print(f"\n✅ MINIMAL CONTENTION ({degradation:.1f}% degradation)")


def compare_single_vs_concurrent(kernel='gemm', num_gpus=4, use_isolation=True,
                                 cooldown_seconds=30):
    """Compare autotune results between single GPU and concurrent execution.

    Runs one capture on GPU 0 as the baseline, then ``num_gpus`` captures at
    once (one worker process per GPU), prints both sets of initial-generation
    stats plus the relative change in the min time, and writes
    ``summary.json`` and per-run text logs under ``autotune_comparison/``.

    Args:
        kernel: Kernel name forwarded to ``run_autotune_capture``.
        num_gpus: Number of GPUs (ids ``0..num_gpus-1``) for the concurrent
            phase.
        use_isolation: Forwarded to the concurrent runs only; the baseline
            always runs without isolation.
        cooldown_seconds: Pause after the baseline and after the concurrent
            phase.

    Returns:
        ``None``. Returns early, after printing an error, when the baseline
        run produced no stats.
    """
    results_dir = Path('autotune_comparison')
    results_dir.mkdir(exist_ok=True)

    print("AUTOTUNE INITIAL GENERATION COMPARISON")
    print("="*60)
    print(f"Kernel: {kernel}")
    print("Capturing: Initial population/generation only")

    # Step 1: Single GPU baseline
    print("\nStep 1: Single GPU baseline (GPU 0)")
    print("-"*40)

    baseline_log = results_dir / 'baseline_gpu0.txt'
    # Single GPU doesn't need isolation
    baseline = run_autotune_capture(0, kernel, baseline_log, use_isolation=False)

    if not baseline.initial_pop:
        print("ERROR: Failed to capture initial generation for baseline")
        return

    # Cool down
    print(f"\nCooling down for {cooldown_seconds}s...")
    time.sleep(cooldown_seconds)

    # Step 2: Concurrent execution
    isolation_msg = "with process isolation" if use_isolation else "without isolation"
    print(f"\nStep 2: Concurrent execution on {num_gpus} GPUs {isolation_msg}")
    print("-"*40)

    concurrent_results = {}

    with ProcessPoolExecutor(max_workers=num_gpus) as executor:
        futures = {
            executor.submit(
                run_autotune_capture,
                gpu_id,
                kernel,
                results_dir / f'concurrent_gpu{gpu_id}.txt',
                use_isolation,
            ): gpu_id
            for gpu_id in range(num_gpus)
        }

        for future, gpu_id in futures.items():
            try:
                concurrent_results[gpu_id] = future.result()
            except Exception as e:
                # One failed GPU should not discard the others' results.
                print(f"[GPU {gpu_id}] Failed: {e}")

    # Cool down
    print(f"\nCooling down for {cooldown_seconds}s...")
    time.sleep(cooldown_seconds)

    # Analysis
    print("\n" + "="*60)
    print("RESULTS COMPARISON")
    print("="*60)

    print("\nBaseline (Single GPU):")
    print(f" Initial: {_format_stats(baseline.initial_pop)}")

    print("\nConcurrent GPUs:")

    # Keep only the GPUs that actually produced stats, in GPU order.
    captured = {
        gpu_id: result.initial_pop
        for gpu_id, result in sorted(concurrent_results.items())
        if result.initial_pop
    }
    for gpu_id, stats in captured.items():
        print(f" GPU {gpu_id} Initial: {_format_stats(stats)}")

    if captured:
        _report_degradation(
            baseline.initial_pop['min'],
            [stats['min'] for stats in captured.values()],
        )

    # Save summary
    summary = {
        'kernel': kernel,
        'baseline': {
            'initial': baseline.initial_pop
        },
        'concurrent': {
            f'gpu_{gpu_id}': {'initial': stats}
            for gpu_id, stats in captured.items()
        },
    }

    with open(results_dir / 'summary.json', 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    print(f"\nDetailed results saved to {results_dir}/")
| 274 | + |
def main(argv=None):
    """Parse command-line options and run the single-vs-concurrent comparison.

    Args:
        argv: Optional list of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 2 when the arguments are invalid, including a
            ``--num-gpus`` value below 1.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Capture autotune generations for contention analysis')
    parser.add_argument('--kernel', default='gemm', help='Kernel to test')
    parser.add_argument('--num-gpus', type=int, default=4, help='Number of GPUs for concurrent test')
    parser.add_argument('--no-isolation', action='store_true', help='Disable process isolation for concurrent runs')

    args = parser.parse_args(argv)

    # Reject this up front: otherwise the worker pool only refuses a
    # non-positive size after the whole baseline run and cool-down are done.
    if args.num_gpus < 1:
        parser.error('--num-gpus must be at least 1')

    compare_single_vs_concurrent(args.kernel, args.num_gpus, use_isolation=not args.no_isolation)
| 285 | + |
# Run the comparison only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments