Skip to content

Commit e776731

Browse files
committed
Merge branch 'master' into directives
2 parents 70ab9c6 + dae9e8e commit e776731

File tree

19 files changed

+244
-211
lines changed

19 files changed

+244
-211
lines changed

.github/workflows/test-python-package.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ on:
1717
jobs:
1818
build:
1919
name: Test on ${{ matrix.os }} with all supported Python versions
20-
runs-on: ${{ format('{0}-latest', matrix.os) }} # "-latest" is added here so we can use OS in the format expected by CodeCov
20+
runs-on: ${{ format('{0}', matrix.os) }} # "-latest" is added here so we can use OS in the format expected by CodeCov
2121

2222
strategy:
2323
matrix:
24-
os: [ubuntu, macos]
24+
os: [ubuntu-latest, macos-13]
2525

2626
steps:
2727
- uses: actions/checkout@v4

CONTRIBUTING.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ Steps without :bash:`sudo` access (e.g. on a cluster):
8585
- /path/to/directory
8686
* [Optional] both Mamba and Miniconda can be automatically activated via :bash:`~/.bashrc`. Do not forget to add these (usually provided at the end of the installation).
8787
* Exit the shell and re-enter to make sure Conda is available. :bash:`cd` to the kernel tuner directory.
88-
* [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`.
88+
* [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. On Linux, to point the entire :bash:`~/.cache` default elsewhere, use the :bash:`XDG_CACHE_HOME` environment variable.
8989
* [Optional] update Conda if available before continuing: :bash:`conda update -n base -c conda-forge conda`.
9090
#. Setup a virtual environment: :bash:`conda create --name kerneltuner python=3.11` (or whatever Python version and environment name you prefer).
9191
#. Activate the virtual environment: :bash:`conda activate kerneltuner`.

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22

33
<div align="center">
4-
<img width="500px" src="doc/images/KernelTuner-logo.png"/>
4+
<img width="500px" src="https://raw.githubusercontent.com/KernelTuner/kernel_tuner/master/doc/images/KernelTuner-logo.png"/>
55
</div>
66

77
---
@@ -98,11 +98,11 @@ More [examples here](https://kerneltuner.github.io/kernel_tuner/stable/examples.
9898

9999
## Kernel Tuner ecosystem
100100

101-
<img width="250px" src="doc/images/kernel_launcher.png"/><br />C++ magic to integrate auto-tuned kernels into C++ applications
101+
<img width="250px" src="https://raw.githubusercontent.com/KernelTuner/kernel_tuner/master/doc/images/kernel_launcher.png"/><br />C++ magic to integrate auto-tuned kernels into C++ applications
102102

103-
<img width="250px" src="doc/images/kernel_float.png"/><br />C++ data types for mixed-precision CUDA kernel programming
103+
<img width="250px" src="https://raw.githubusercontent.com/KernelTuner/kernel_tuner/master/doc/images/kernel_float.png"/><br />C++ data types for mixed-precision CUDA kernel programming
104104

105-
<img width="275px" src="doc/images/kernel_dashboard.png"/><br />Monitor, analyze, and visualize auto-tuning runs
105+
<img width="275px" src="https://raw.githubusercontent.com/KernelTuner/kernel_tuner/master/doc/images/kernel_dashboard.png"/><br />Monitor, analyze, and visualize auto-tuning runs
106106

107107

108108
## Communication & Contribution

doc/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ domdf-python-tools==3.8.0.post2 ; python_version >= "3.8" and python_version < "
1818
exceptiongroup==1.2.0 ; python_version >= "3.8" and python_version < "3.11"
1919
executing==2.0.1 ; python_version >= "3.8" and python_version < "3.12"
2020
fastjsonschema==2.19.1 ; python_version >= "3.8" and python_version < "3.12"
21-
idna==3.6 ; python_version >= "3.8" and python_version < "3.12"
21+
idna==3.7 ; python_version >= "3.8" and python_version < "3.12"
2222
imagesize==1.4.1 ; python_version >= "3.8" and python_version < "3.12"
2323
importlib-metadata==7.0.1 ; python_version >= "3.8" and python_version < "3.10"
2424
importlib-resources==6.1.1 ; python_version >= "3.8" and python_version < "3.9"

kernel_tuner/backends/cupy.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""This module contains all Cupy specific kernel_tuner functions."""
22
from __future__ import print_function
3+
from warnings import warn
34

45
import numpy as np
56

@@ -124,6 +125,7 @@ def compile(self, kernel_instance):
124125
compiler_options = self.compiler_options
125126
if not any(["-std=" in opt for opt in self.compiler_options]):
126127
compiler_options = ["--std=c++11"] + self.compiler_options
128+
# CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/main/cupy/cuda/compiler.py#L145
127129

128130
options = tuple(compiler_options)
129131

@@ -132,6 +134,7 @@ def compile(self, kernel_instance):
132134
)
133135

134136
self.func = self.current_module.get_function(kernel_name)
137+
self.num_regs = self.func.num_regs
135138
return self.func
136139

137140
def start_event(self):
@@ -197,6 +200,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
197200
of the grid
198201
:type grid: tuple(int, int)
199202
"""
203+
if stream is None:
204+
stream = self.stream
200205
func(grid, threads, gpu_args, stream=stream, shared_mem=self.smem_size)
201206

202207
def memset(self, allocation, value, size):

kernel_tuner/backends/hip.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
try:
1313
from pyhip import hip, hiprtc
14-
except ImportError:
14+
except (ImportError, RuntimeError):
1515
hip = None
1616
hiprtc = None
1717

kernel_tuner/backends/nvcuda.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
"""This module contains all NVIDIA cuda-python specific kernel_tuner functions."""
2+
from warnings import warn
3+
24
import numpy as np
35

46
from kernel_tuner.backends.backend import GPUBackend
57
from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
6-
from kernel_tuner.util import SkippableFailure, cuda_error_check
8+
from kernel_tuner.util import SkippableFailure, cuda_error_check, to_valid_nvrtc_gpu_arch_cc
79

810
# embedded in try block to be able to generate documentation
911
# and run tests without cuda-python installed
@@ -161,12 +163,12 @@ def compile(self, kernel_instance):
161163
compiler_options.append(b"--std=c++11")
162164
if not any(["--std=" in opt for opt in self.compiler_options]):
163165
self.compiler_options.append("--std=c++11")
164-
if not any([b"--gpu-architecture=" in opt for opt in compiler_options]):
166+
if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
165167
compiler_options.append(
166-
f"--gpu-architecture=compute_{self.cc}".encode("UTF-8")
168+
f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")
167169
)
168-
if not any(["--gpu-architecture=" in opt for opt in self.compiler_options]):
169-
self.compiler_options.append(f"--gpu-architecture=compute_{self.cc}")
170+
if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
171+
self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")
170172

171173
err, program = nvrtc.nvrtcCreateProgram(
172174
str.encode(kernel_string), b"CUDAProgram", 0, [], []
@@ -192,6 +194,11 @@ def compile(self, kernel_instance):
192194
)
193195
cuda_error_check(err)
194196

197+
# get the number of registers per thread used in this kernel
198+
num_regs = cuda.cuFuncGetAttribute(cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS, self.func)
199+
assert num_regs[0] == 0, f"Retrieving number of registers per thread unsuccessful: code {num_regs[0]}"
200+
self.num_regs = num_regs[1]
201+
195202
except RuntimeError as re:
196203
_, n = nvrtc.nvrtcGetProgramLogSize(program)
197204
log = b" " * n
@@ -273,6 +280,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
273280
of the grid
274281
:type grid: tuple(int, int)
275282
"""
283+
if stream is None:
284+
stream = self.stream
276285
arg_types = list()
277286
for arg in gpu_args:
278287
if isinstance(arg, cuda.CUdeviceptr):

kernel_tuner/backends/pycuda.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,8 @@ def compile(self, kernel_instance):
218218
)
219219

220220
self.func = self.current_module.get_function(kernel_name)
221+
if not isinstance(self.func, str):
222+
self.num_regs = self.func.num_regs
221223
return self.func
222224
except drv.CompileError as e:
223225
if "uses too much shared data" in e.stderr:

kernel_tuner/core.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,7 @@ def __init__(
341341
print("Using: " + self.dev.name)
342342

343343
def benchmark_default(self, func, gpu_args, threads, grid, result):
344-
"""Benchmark one kernel execution at a time"""
344+
"""Benchmark one kernel execution at a time."""
345345
observers = [
346346
obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver)
347347
]
@@ -391,12 +391,8 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
391391
for obs in self.continuous_observers:
392392
result.update(obs.get_results())
393393

394-
def benchmark(self, func, gpu_args, instance, verbose, objective):
395-
"""benchmark the kernel instance"""
396-
logging.debug("benchmark " + instance.name)
397-
logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads)
398-
logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid)
399-
394+
def set_nvml_parameters(self, instance):
395+
"""Set the NVML parameters. Avoids setting time leaking into benchmark time."""
400396
if self.use_nvml:
401397
if "nvml_pwr_limit" in instance.params:
402398
new_limit = int(
@@ -409,6 +405,15 @@ def benchmark(self, func, gpu_args, instance, verbose, objective):
409405
if "nvml_mem_clock" in instance.params:
410406
self.nvml.mem_clock = instance.params["nvml_mem_clock"]
411407

408+
def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False):
409+
"""Benchmark the kernel instance."""
410+
logging.debug("benchmark " + instance.name)
411+
logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads)
412+
logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid)
413+
414+
if self.use_nvml and not skip_nvml_setting:
415+
self.set_nvml_parameters(instance)
416+
412417
# Call the observers to register the configuration to be benchmarked
413418
for obs in self.dev.observers:
414419
obs.register_configuration(instance.params)
@@ -577,9 +582,12 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
577582

578583
# benchmark
579584
if func:
585+
# setting the NVML parameters here avoids this time from leaking into the benchmark time, ends up in framework time instead
586+
if self.use_nvml:
587+
self.set_nvml_parameters(instance)
580588
start_benchmark = time.perf_counter()
581589
result.update(
582-
self.benchmark(func, gpu_args, instance, verbose, to.objective)
590+
self.benchmark(func, gpu_args, instance, verbose, to.objective, skip_nvml_setting=False)
583591
)
584592
last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark)
585593

kernel_tuner/observers/hip.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
try:
66
from pyhip import hip, hiprtc
7-
except ImportError:
7+
except (ImportError, RuntimeError):
88
hip = None
99
hiprtc = None
1010

0 commit comments

Comments (0)