@@ -6,11 +6,13 @@
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/jump_label.h>
+#include <linux/kthread.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/types.h>
 #include <asm/cpufeature.h>
 #include <asm/hwprobe.h>
+#include <asm/vector.h>
 
 #include "copy-unaligned.h"
 
@@ -268,12 +270,147 @@ static int check_unaligned_access_speed_all_cpus(void)
 }
 #endif
 
+#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
+static void check_vector_unaligned_access(struct work_struct *work __always_unused)
+{
+	int cpu = smp_processor_id();
+	u64 start_cycles, end_cycles;
+	u64 word_cycles;
+	u64 byte_cycles;
+	int ratio;
+	unsigned long start_jiffies, now;
+	struct page *page;
+	void *dst;
+	void *src;
+	long speed = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
+
+	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
+		return;
+
+	page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
+	if (!page) {
+		pr_warn("Allocation failure, not measuring vector misaligned performance\n");
+		return;
+	}
+
+	/* Make an unaligned destination buffer. */
+	dst = (void *)((unsigned long)page_address(page) | 0x1);
+	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
+	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
+	src += 2;
+	word_cycles = -1ULL;
+
+	/* Do a warmup. */
+	kernel_vector_begin();
+	__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+
+	start_jiffies = jiffies;
+	while ((now = jiffies) == start_jiffies)
+		cpu_relax();
+
+	/*
+	 * For a fixed amount of time, repeatedly try the function, and take
+	 * the best time in cycles as the measurement.
+	 */
+	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+		start_cycles = get_cycles64();
+		/* Ensure the CSR read can't reorder WRT to the copy. */
+		mb();
+		__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+		/* Ensure the copy ends before the end time is snapped. */
+		mb();
+		end_cycles = get_cycles64();
+		if ((end_cycles - start_cycles) < word_cycles)
+			word_cycles = end_cycles - start_cycles;
+	}
+
+	byte_cycles = -1ULL;
+	__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+	start_jiffies = jiffies;
+	while ((now = jiffies) == start_jiffies)
+		cpu_relax();
+
+	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+		start_cycles = get_cycles64();
+		/* Ensure the CSR read can't reorder WRT to the copy. */
+		mb();
+		__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+		/* Ensure the copy ends before the end time is snapped. */
+		mb();
+		end_cycles = get_cycles64();
+		if ((end_cycles - start_cycles) < byte_cycles)
+			byte_cycles = end_cycles - start_cycles;
+	}
+
+	kernel_vector_end();
+
+	/* Don't divide by zero. */
+	if (!word_cycles || !byte_cycles) {
+		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
+			cpu);
+
+		return;
+	}
+
+	if (word_cycles < byte_cycles)
+		speed = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
+
+	ratio = div_u64((byte_cycles * 100), word_cycles);
+	pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
+		cpu,
+		ratio / 100,
+		ratio % 100,
+		(speed == RISCV_HWPROBE_MISALIGNED_VECTOR_FAST) ? "fast" : "slow");
+
+	per_cpu(vector_misaligned_access, cpu) = speed;
+}
+
+static int riscv_online_cpu_vec(unsigned int cpu)
+{
+	if (!has_vector())
+		return 0;
+
+	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED)
+		return 0;
+
+	check_vector_unaligned_access_emulated(NULL);
+	check_vector_unaligned_access(NULL);
+	return 0;
+}
+
+/* Measure unaligned access speed on all CPUs present at boot in parallel. */
+static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
+{
+	schedule_on_each_cpu(check_vector_unaligned_access);
+
+	/*
+	 * Setup hotplug callbacks for any new CPUs that come online or go
+	 * offline.
+	 */
+	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
+				  riscv_online_cpu_vec, NULL);
+
+	return 0;
+}
+#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
+static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
+{
+	return 0;
+}
+#endif
+
 static int check_unaligned_access_all_cpus(void)
 {
-	bool all_cpus_emulated;
+	bool all_cpus_emulated, all_cpus_vec_unsupported;
 
 	all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
-	check_vector_unaligned_access_emulated_all_cpus();
+	all_cpus_vec_unsupported = check_vector_unaligned_access_emulated_all_cpus();
+
+	if (!all_cpus_vec_unsupported &&
+	    IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
+		kthread_run(vec_check_unaligned_access_speed_all_cpus,
+			    NULL, "vec_check_unaligned_access_speed_all_cpus");
+	}
 
 	if (!all_cpus_emulated)
 		return check_unaligned_access_speed_all_cpus();
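The ratio reported by the pr_info() is plain integer fixed-point arithmetic:
byte_cycles is scaled by 100 before the div_u64() so that two decimal places
survive the integer division. A stand-alone illustration (the cycle counts
are made-up example values, and plain / replaces div_u64() in userspace):

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t byte_cycles = 460, word_cycles = 400;	/* example counts */
	unsigned int ratio = (unsigned int)(byte_cycles * 100 / word_cycles);

	/* Prints "ratio: 1.15": byte copies took 1.15x the word-copy time. */
	printf("ratio: %u.%02u\n", ratio / 100, ratio % 100);
	return 0;
}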
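Once this probe has run, the per-CPU result is surfaced to userspace through
the riscv_hwprobe() syscall. A hypothetical consumer could look like the
sketch below; it assumes a riscv64 toolchain with kernel headers new enough
to define RISCV_HWPROBE_KEY_MISALIGNED_VECTOR_PERF.

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <asm/hwprobe.h>

int main(void)
{
	struct riscv_hwprobe pair = {
		.key = RISCV_HWPROBE_KEY_MISALIGNED_VECTOR_PERF,
	};

	/* cpusetsize = 0 and cpus = NULL ask about all online CPUs. */
	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0)) {
		perror("riscv_hwprobe");
		return 1;
	}

	switch (pair.value) {
	case RISCV_HWPROBE_MISALIGNED_VECTOR_FAST:
		puts("misaligned vector accesses: fast");
		break;
	case RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW:
		puts("misaligned vector accesses: slow");
		break;
	case RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED:
		puts("misaligned vector accesses: unsupported");
		break;
	default:
		puts("misaligned vector accesses: unknown");
	}
	return 0;
}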