Skip to content

Commit c34f480

Browse files
sayantnAmanieu
authored andcommitted
Fix the stream intrinsics
They should use a platform-specific address management.
1 parent f81a1f8 commit c34f480

File tree

9 files changed

+82
-59
lines changed

9 files changed

+82
-59
lines changed

crates/core_arch/src/x86/avx.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1738,8 +1738,8 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
17381738
#[stable(feature = "simd_x86", since = "1.27.0")]
17391739
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
17401740
crate::arch::asm!(
1741-
"vmovntdq [{mem_addr}], {a}",
1742-
mem_addr = in(reg) mem_addr,
1741+
vps!("vmovntdq", ",{a}"),
1742+
p = in(reg) mem_addr,
17431743
a = in(ymm_reg) a,
17441744
options(nostack, preserves_flags),
17451745
);
@@ -1766,8 +1766,8 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
17661766
#[allow(clippy::cast_ptr_alignment)]
17671767
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
17681768
crate::arch::asm!(
1769-
"vmovntpd [{mem_addr}], {a}",
1770-
mem_addr = in(reg) mem_addr,
1769+
vps!("vmovntpd", ",{a}"),
1770+
p = in(reg) mem_addr,
17711771
a = in(ymm_reg) a,
17721772
options(nostack, preserves_flags),
17731773
);
@@ -1795,8 +1795,8 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
17951795
#[allow(clippy::cast_ptr_alignment)]
17961796
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
17971797
crate::arch::asm!(
1798-
"vmovntps [{mem_addr}], {a}",
1799-
mem_addr = in(reg) mem_addr,
1798+
vps!("vmovntps", ",{a}"),
1799+
p = in(reg) mem_addr,
18001800
a = in(ymm_reg) a,
18011801
options(nostack, preserves_flags),
18021802
);

crates/core_arch/src/x86/avx2.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3149,9 +3149,9 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
31493149
pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
31503150
let dst: __m256i;
31513151
crate::arch::asm!(
3152-
"vmovntdqa {a}, [{mem_addr}]",
3152+
vpl!("vmovntdqa {a}"),
31533153
a = out(ymm_reg) dst,
3154-
mem_addr = in(reg) mem_addr,
3154+
p = in(reg) mem_addr,
31553155
options(pure, readonly, nostack, preserves_flags),
31563156
);
31573157
dst

crates/core_arch/src/x86/avx512bw.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ use crate::{
88
#[cfg(test)]
99
use stdarch_test::assert_instr;
1010

11-
use super::avx512f::{vpl, vps};
12-
1311
/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst.
1412
///
1513
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi16&expand=30)

crates/core_arch/src/x86/avx512f.rs

Lines changed: 32 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -6,37 +6,6 @@ use crate::{
66
mem, ptr,
77
};
88

9-
// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full
10-
// register name (e.g. rax). We have to explicitly override the placeholder to
11-
// use the 32-bit register name in that case.
12-
13-
#[cfg(target_pointer_width = "32")]
14-
macro_rules! vpl {
15-
($inst:expr) => {
16-
concat!($inst, ", [{p:e}]")
17-
};
18-
}
19-
#[cfg(target_pointer_width = "64")]
20-
macro_rules! vpl {
21-
($inst:expr) => {
22-
concat!($inst, ", [{p}]")
23-
};
24-
}
25-
#[cfg(target_pointer_width = "32")]
26-
macro_rules! vps {
27-
($inst1:expr, $inst2:expr) => {
28-
concat!($inst1, " [{p:e}]", $inst2)
29-
};
30-
}
31-
#[cfg(target_pointer_width = "64")]
32-
macro_rules! vps {
33-
($inst1:expr, $inst2:expr) => {
34-
concat!($inst1, " [{p}]", $inst2)
35-
};
36-
}
37-
38-
pub(crate) use {vpl, vps};
39-
409
#[cfg(test)]
4110
use stdarch_test::assert_instr;
4211

@@ -27899,8 +27868,8 @@ pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) ->
2789927868
#[allow(clippy::cast_ptr_alignment)]
2790027869
pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
2790127870
crate::arch::asm!(
27902-
"vmovntps [{mem_addr}], {a}",
27903-
mem_addr = in(reg) mem_addr,
27871+
vps!("vmovntps", ",{a}"),
27872+
p = in(reg) mem_addr,
2790427873
a = in(zmm_reg) a,
2790527874
options(nostack, preserves_flags),
2790627875
);
@@ -27925,8 +27894,8 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
2792527894
#[allow(clippy::cast_ptr_alignment)]
2792627895
pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
2792727896
crate::arch::asm!(
27928-
"vmovntpd [{mem_addr}], {a}",
27929-
mem_addr = in(reg) mem_addr,
27897+
vps!("vmovntpd", ",{a}"),
27898+
p = in(reg) mem_addr,
2793027899
a = in(zmm_reg) a,
2793127900
options(nostack, preserves_flags),
2793227901
);
@@ -27951,13 +27920,32 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
2795127920
#[allow(clippy::cast_ptr_alignment)]
2795227921
pub unsafe fn _mm512_stream_si512(mem_addr: *mut i32, a: __m512i) {
2795327922
crate::arch::asm!(
27954-
"vmovntdq [{mem_addr}], {a}",
27955-
mem_addr = in(reg) mem_addr,
27923+
vps!("vmovntdq", ",{a}"),
27924+
p = in(reg) mem_addr,
2795627925
a = in(zmm_reg) a,
2795727926
options(nostack, preserves_flags),
2795827927
);
2795927928
}
2796027929

27930+
/// Load 512-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
27931+
/// must be aligned on a 64-byte boundary or a general-protection exception may be generated. To
27932+
/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon)
27933+
///
27934+
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_load_si256)
27935+
#[inline]
27936+
#[target_feature(enable = "avx512f")]
27937+
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
27938+
pub unsafe fn _mm512_stream_load_si512(mem_addr: *const __m512i) -> __m512i {
27939+
let dst: __m512i;
27940+
crate::arch::asm!(
27941+
vpl!("vmovntdqa {a}"),
27942+
a = out(zmm_reg) dst,
27943+
p = in(reg) mem_addr,
27944+
options(pure, readonly, nostack, preserves_flags),
27945+
);
27946+
dst
27947+
}
27948+
2796127949
/// Sets packed 32-bit integers in `dst` with the supplied values.
2796227950
///
2796327951
/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_ps&expand=4931)
@@ -54566,6 +54554,13 @@ mod tests {
5456654554
}
5456754555
}
5456854556

54557+
#[simd_test(enable = "avx512f")]
54558+
unsafe fn test_mm512_stream_load_si512() {
54559+
let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
54560+
let r = _mm512_stream_load_si512(core::ptr::addr_of!(a) as *const _);
54561+
assert_eq_m512i(a, r);
54562+
}
54563+
5456954564
#[simd_test(enable = "avx512f")]
5457054565
unsafe fn test_mm512_reduce_add_epi32() {
5457154566
let a = _mm512_set1_epi32(1);

crates/core_arch/src/x86/macros.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,33 @@ macro_rules! assert_approx_eq {
5757
);
5858
}};
5959
}
60+
61+
// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full
62+
// register name (e.g. rax). We have to explicitly override the placeholder to
63+
// use the 32-bit register name in that case.
64+
65+
#[cfg(target_pointer_width = "32")]
66+
macro_rules! vpl {
67+
($inst:expr) => {
68+
concat!($inst, ", [{p:e}]")
69+
};
70+
}
71+
#[cfg(target_pointer_width = "64")]
72+
macro_rules! vpl {
73+
($inst:expr) => {
74+
concat!($inst, ", [{p}]")
75+
};
76+
}
77+
78+
#[cfg(target_pointer_width = "32")]
79+
macro_rules! vps {
80+
($inst1:expr, $inst2:expr) => {
81+
concat!($inst1, " [{p:e}]", $inst2)
82+
};
83+
}
84+
#[cfg(target_pointer_width = "64")]
85+
macro_rules! vps {
86+
($inst1:expr, $inst2:expr) => {
87+
concat!($inst1, " [{p}]", $inst2)
88+
};
89+
}

crates/core_arch/src/x86/sse.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1992,8 +1992,8 @@ extern "C" {
19921992
#[allow(clippy::cast_ptr_alignment)]
19931993
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
19941994
crate::arch::asm!(
1995-
"movntps [{mem_addr}], {a}",
1996-
mem_addr = in(reg) mem_addr,
1995+
vps!("movntps", ",{a}"),
1996+
p = in(reg) mem_addr,
19971997
a = in(xmm_reg) a,
19981998
options(nostack, preserves_flags),
19991999
);

crates/core_arch/src/x86/sse2.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1312,8 +1312,8 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
13121312
#[stable(feature = "simd_x86", since = "1.27.0")]
13131313
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
13141314
crate::arch::asm!(
1315-
"movntdq [{mem_addr}], {a}",
1316-
mem_addr = in(reg) mem_addr,
1315+
vps!("movntdq", ",{a}"),
1316+
p = in(reg) mem_addr,
13171317
a = in(xmm_reg) a,
13181318
options(nostack, preserves_flags),
13191319
);
@@ -1339,8 +1339,8 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
13391339
#[stable(feature = "simd_x86", since = "1.27.0")]
13401340
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
13411341
crate::arch::asm!(
1342-
"movnti [{mem_addr}], {a:e}", // `:e` for 32bit value
1343-
mem_addr = in(reg) mem_addr,
1342+
vps!("movnti", ",{a:e}"), // `:e` for 32bit value
1343+
p = in(reg) mem_addr,
13441344
a = in(reg) a,
13451345
options(nostack, preserves_flags),
13461346
);
@@ -2542,8 +2542,8 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
25422542
#[allow(clippy::cast_ptr_alignment)]
25432543
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
25442544
crate::arch::asm!(
2545-
"movntpd [{mem_addr}], {a}",
2546-
mem_addr = in(reg) mem_addr,
2545+
vps!("movntpd", ",{a}"),
2546+
p = in(reg) mem_addr,
25472547
a = in(xmm_reg) a,
25482548
options(nostack, preserves_flags),
25492549
);

crates/core_arch/src/x86/sse41.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1154,9 +1154,9 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
11541154
pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
11551155
let dst: __m128i;
11561156
crate::arch::asm!(
1157-
"movntdqa {a}, [{mem_addr}]",
1157+
vpl!("movntdqa {a}"),
11581158
a = out(xmm_reg) dst,
1159-
mem_addr = in(reg) mem_addr,
1159+
p = in(reg) mem_addr,
11601160
options(pure, readonly, nostack, preserves_flags),
11611161
);
11621162
dst

crates/core_arch/src/x86_64/sse2.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
7979
#[stable(feature = "simd_x86", since = "1.27.0")]
8080
pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
8181
crate::arch::asm!(
82-
"movnti [{mem_addr}], {a}",
83-
mem_addr = in(reg) mem_addr,
82+
"movnti [{p}], {a}",
83+
p = in(reg) mem_addr,
8484
a = in(reg) a,
8585
options(nostack, preserves_flags),
8686
);

0 commit comments

Comments
 (0)