Skip to content

Commit e776731

Browse files
committed
Merge branch 'master' into directives
2 parents 70ab9c6 + dae9e8e commit e776731

File tree

19 files changed

+244
-211
lines changed

19 files changed

+244
-211
lines changed

.github/workflows/test-python-package.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ on:
1717
jobs:
1818
build:
1919
name: Test on ${{ matrix.os }} with all supported Python versions
20-
runs-on: ${{ format('{0}-latest', matrix.os) }} # "-latest" is added here so we can use OS in the format expected by CodeCov
20+
runs-on: ${{ format('{0}', matrix.os) }} # "-latest" is added here so we can use OS in the format expected by CodeCov
2121

2222
strategy:
2323
matrix:
24-
os: [ubuntu, macos]
24+
os: [ubuntu-latest, macos-13]
2525

2626
steps:
2727
- uses: actions/checkout@v4

CONTRIBUTING.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ Steps without :bash:`sudo` access (e.g. on a cluster):
8585
- /path/to/directory
8686
* [Optional] both Mamba and Miniconda can be automatically activated via :bash:`~/.bashrc`. Do not forget to add these (usually provided at the end of the installation).
8787
* Exit the shell and re-enter to make sure Conda is available. :bash:`cd` to the kernel tuner directory.
88-
* [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`.
88+
* [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. On Linux, to point the entire :bash:`~/.cache` default elsewhere, use the :bash:`XDG_CACHE_HOME` environment variable.
8989
* [Optional] update Conda if available before continuing: :bash:`conda update -n base -c conda-forge conda`.
9090
#. Setup a virtual environment: :bash:`conda create --name kerneltuner python=3.11` (or whatever Python version and environment name you prefer).
9191
#. Activate the virtual environment: :bash:`conda activate kerneltuner`.

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22

33
<div align="center">
4-
<img width="500px" src="doc/images/KernelTuner-logo.png"/>
4+
<img width="500px" src="https://raw.githubusercontent.com/KernelTuner/kernel_tuner/master/doc/images/KernelTuner-logo.png"/>
55
</div>
66

77
---
@@ -98,11 +98,11 @@ More [examples here](https://kerneltuner.github.io/kernel_tuner/stable/examples.
9898

9999
## Kernel Tuner ecosystem
100100

101-
<img width="250px" src="doc/images/kernel_launcher.png"/><br />C++ magic to integrate auto-tuned kernels into C++ applications
101+
<img width="250px" src="https://raw.githubusercontent.com/KernelTuner/kernel_tuner/master/doc/images/kernel_launcher.png"/><br />C++ magic to integrate auto-tuned kernels into C++ applications
102102

103-
<img width="250px" src="doc/images/kernel_float.png"/><br />C++ data types for mixed-precision CUDA kernel programming
103+
<img width="250px" src="https://raw.githubusercontent.com/KernelTuner/kernel_tuner/master/doc/images/kernel_float.png"/><br />C++ data types for mixed-precision CUDA kernel programming
104104

105-
<img width="275px" src="doc/images/kernel_dashboard.png"/><br />Monitor, analyze, and visualize auto-tuning runs
105+
<img width="275px" src="https://raw.githubusercontent.com/KernelTuner/kernel_tuner/master/doc/images/kernel_dashboard.png"/><br />Monitor, analyze, and visualize auto-tuning runs
106106

107107

108108
## Communication & Contribution

doc/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ domdf-python-tools==3.8.0.post2 ; python_version >= "3.8" and python_version < "
1818
exceptiongroup==1.2.0 ; python_version >= "3.8" and python_version < "3.11"
1919
executing==2.0.1 ; python_version >= "3.8" and python_version < "3.12"
2020
fastjsonschema==2.19.1 ; python_version >= "3.8" and python_version < "3.12"
21-
idna==3.6 ; python_version >= "3.8" and python_version < "3.12"
21+
idna==3.7 ; python_version >= "3.8" and python_version < "3.12"
2222
imagesize==1.4.1 ; python_version >= "3.8" and python_version < "3.12"
2323
importlib-metadata==7.0.1 ; python_version >= "3.8" and python_version < "3.10"
2424
importlib-resources==6.1.1 ; python_version >= "3.8" and python_version < "3.9"

kernel_tuner/backends/cupy.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""This module contains all Cupy specific kernel_tuner functions."""
22
from __future__ import print_function
3+
from warnings import warn
34

45
import numpy as np
56

@@ -124,6 +125,7 @@ def compile(self, kernel_instance):
124125
compiler_options = self.compiler_options
125126
if not any(["-std=" in opt for opt in self.compiler_options]):
126127
compiler_options = ["--std=c++11"] + self.compiler_options
128+
# CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/main/cupy/cuda/compiler.py#L145
127129

128130
options = tuple(compiler_options)
129131

@@ -132,6 +134,7 @@ def compile(self, kernel_instance):
132134
)
133135

134136
self.func = self.current_module.get_function(kernel_name)
137+
self.num_regs = self.func.num_regs
135138
return self.func
136139

137140
def start_event(self):
@@ -197,6 +200,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
197200
of the grid
198201
:type grid: tuple(int, int)
199202
"""
203+
if stream is None:
204+
stream = self.stream
200205
func(grid, threads, gpu_args, stream=stream, shared_mem=self.smem_size)
201206

202207
def memset(self, allocation, value, size):

kernel_tuner/backends/hip.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
try:
1313
from pyhip import hip, hiprtc
14-
except ImportError:
14+
except (ImportError, RuntimeError):
1515
hip = None
1616
hiprtc = None
1717

kernel_tuner/backends/nvcuda.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
"""This module contains all NVIDIA cuda-python specific kernel_tuner functions."""
2+
from warnings import warn
3+
24
import numpy as np
35

46
from kernel_tuner.backends.backend import GPUBackend
57
from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
6-
from kernel_tuner.util import SkippableFailure, cuda_error_check
8+
from kernel_tuner.util import SkippableFailure, cuda_error_check, to_valid_nvrtc_gpu_arch_cc
79

810
# embedded in try block to be able to generate documentation
911
# and run tests without cuda-python installed
@@ -161,12 +163,12 @@ def compile(self, kernel_instance):
161163
compiler_options.append(b"--std=c++11")
162164
if not any(["--std=" in opt for opt in self.compiler_options]):
163165
self.compiler_options.append("--std=c++11")
164-
if not any([b"--gpu-architecture=" in opt for opt in compiler_options]):
166+
if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
165167
compiler_options.append(
166-
f"--gpu-architecture=compute_{self.cc}".encode("UTF-8")
168+
f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")
167169
)
168-
if not any(["--gpu-architecture=" in opt for opt in self.compiler_options]):
169-
self.compiler_options.append(f"--gpu-architecture=compute_{self.cc}")
170+
if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
171+
self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")
170172

171173
err, program = nvrtc.nvrtcCreateProgram(
172174
str.encode(kernel_string), b"CUDAProgram", 0, [], []
@@ -192,6 +194,11 @@ def compile(self, kernel_instance):
192194
)
193195
cuda_error_check(err)
194196

197+
# get the number of registers per thread used in this kernel
198+
num_regs = cuda.cuFuncGetAttribute(cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS, self.func)
199+
assert num_regs[0] == 0, f"Retrieving number of registers per thread unsuccessful: code {num_regs[0]}"
200+
self.num_regs = num_regs[1]
201+
195202
except RuntimeError as re:
196203
_, n = nvrtc.nvrtcGetProgramLogSize(program)
197204
log = b" " * n
@@ -273,6 +280,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
273280
of the grid
274281
:type grid: tuple(int, int)
275282
"""
283+
if stream is None:
284+
stream = self.stream
276285
arg_types = list()
277286
for arg in gpu_args:
278287
if isinstance(arg, cuda.CUdeviceptr):

kernel_tuner/backends/pycuda.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,8 @@ def compile(self, kernel_instance):
218218
)
219219

220220
self.func = self.current_module.get_function(kernel_name)
221+
if not isinstance(self.func, str):
222+
self.num_regs = self.func.num_regs
221223
return self.func
222224
except drv.CompileError as e:
223225
if "uses too much shared data" in e.stderr:

kernel_tuner/core.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,7 @@ def __init__(
341341
print("Using: " + self.dev.name)
342342

343343
def benchmark_default(self, func, gpu_args, threads, grid, result):
344-
"""Benchmark one kernel execution at a time"""
344+
"""Benchmark one kernel execution at a time."""
345345
observers = [
346346
obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver)
347347
]
@@ -391,12 +391,8 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
391391
for obs in self.continuous_observers:
392392
result.update(obs.get_results())
393393

394-
def benchmark(self, func, gpu_args, instance, verbose, objective):
395-
"""benchmark the kernel instance"""
396-
logging.debug("benchmark " + instance.name)
397-
logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads)
398-
logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid)
399-
394+
def set_nvml_parameters(self, instance):
395+
"""Set the NVML parameters. Avoids setting time leaking into benchmark time."""
400396
if self.use_nvml:
401397
if "nvml_pwr_limit" in instance.params:
402398
new_limit = int(
@@ -409,6 +405,15 @@ def benchmark(self, func, gpu_args, instance, verbose, objective):
409405
if "nvml_mem_clock" in instance.params:
410406
self.nvml.mem_clock = instance.params["nvml_mem_clock"]
411407

408+
def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False):
409+
"""Benchmark the kernel instance."""
410+
logging.debug("benchmark " + instance.name)
411+
logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads)
412+
logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid)
413+
414+
if self.use_nvml and not skip_nvml_setting:
415+
self.set_nvml_parameters(instance)
416+
412417
# Call the observers to register the configuration to be benchmarked
413418
for obs in self.dev.observers:
414419
obs.register_configuration(instance.params)
@@ -577,9 +582,12 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
577582

578583
# benchmark
579584
if func:
585+
# setting the NVML parameters here avoids this time from leaking into the benchmark time, ends up in framework time instead
586+
if self.use_nvml:
587+
self.set_nvml_parameters(instance)
580588
start_benchmark = time.perf_counter()
581589
result.update(
582-
self.benchmark(func, gpu_args, instance, verbose, to.objective)
590+
self.benchmark(func, gpu_args, instance, verbose, to.objective, skip_nvml_setting=False)
583591
)
584592
last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark)
585593

kernel_tuner/observers/hip.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
try:
66
from pyhip import hip, hiprtc
7-
except ImportError:
7+
except (ImportError, RuntimeError):
88
hip = None
99
hiprtc = None
1010

0 commit comments

Comments (0)