|
| 1 | +#!/usr/bin/env python |
| 2 | +"""This is the minimal example from the README""" |
| 3 | +import json |
| 4 | + |
| 5 | +import numpy |
| 6 | +from kernel_tuner import tune_kernel |
| 7 | +from kernel_tuner.observers.ncu import NCUObserver |
| 8 | + |
def tune():
    """Tune the thread-block size of a CUDA vector-add kernel with NCU metrics.

    Builds three float32 vectors of 80 million elements each, sweeps
    ``block_size_x`` from 128 to 1024 in steps of 64, and attaches an
    ``NCUObserver`` collecting DRAM byte counters and FP32 instruction
    counters. Four derived metrics put the measured throughput next to the
    throughput expected from the problem size.

    Returns:
        The first item of the pair returned by ``tune_kernel``
        (presumably the per-configuration results -- confirm against the
        Kernel Tuner documentation).
    """
    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 80000000

    # Host-side data: two random inputs and a zeroed output, all float32.
    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    # Argument order must match the kernel signature (c, a, b, n).
    args = [c, a, b, n]

    # Candidate block sizes: 128, 192, ..., 1024 (15 values).
    tune_params = {"block_size_x": list(range(128, 1024 + 1, 64))}

    # Counter names requested from the observer.
    dram_total = "dram__bytes.sum"  # bytes accessed in DRAM
    dram_read = "dram__bytes_read.sum"  # bytes read from DRAM
    dram_write = "dram__bytes_write.sum"  # bytes written to DRAM
    # Thread-level FADD / FFMA / FMUL instructions executed where all
    # predicates were true.
    fadd_count = "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum"
    ffma_count = "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum"
    fmul_count = "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum"

    ncu_metrics = [
        dram_total,
        dram_read,
        dram_write,
        fadd_count,
        ffma_count,
        fmul_count,
    ]

    ncuobserver = NCUObserver(metrics=ncu_metrics)

    def total_fp32_flops(p):
        """Sum the FP32 operations in ``p``; an FFMA counts as two."""
        return p[fadd_count] + 2 * p[ffma_count] + p[fmul_count]

    def giga_per_second(amount, p):
        """Scale ``amount`` to giga-units and divide by ``p["time"] / 1e3``.

        NOTE(review): the division by 1e3 suggests ``time`` is reported in
        milliseconds -- confirm.
        """
        return (amount / 1e9) / (p["time"] / 1e3)

    # One addition per element; three float32 arrays (4 bytes per element)
    # are touched per kernel launch.
    expected_flops = size
    expected_bytes = size * 4 * 3

    # Insertion order is kept identical to the original definition order.
    metrics = {
        "GFLOP/s": lambda p: giga_per_second(total_fp32_flops(p), p),
        "Expected GFLOP/s": lambda p: giga_per_second(expected_flops, p),
        "GB/s": lambda p: giga_per_second(p[dram_total], p),
        "Expected GB/s": lambda p: giga_per_second(expected_bytes, p),
    }

    # The second item of the returned pair is not used here.
    results, _env = tune_kernel(
        "vector_add",
        kernel_string,
        size,
        args,
        tune_params,
        observers=[ncuobserver],
        metrics=metrics,
        iterations=7,
    )

    return results
| 54 | + |
| 55 | + |
# Run the tuning session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    tune()
0 commit comments