Skip to content

use splat for the aarch64/arm dup intrinsics #1857

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 20 additions & 40 deletions crates/core_arch/src/arm_shared/neon/generated.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14322,8 +14322,7 @@ pub unsafe fn vld1q_dup_f16(ptr: *const f16) -> float16x8_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1_dup_f32(ptr: *const f32) -> float32x2_t {
let x = vld1_lane_f32::<0>(ptr, transmute(f32x2::splat(0.0)));
simd_shuffle!(x, x, [0, 0])
transmute(f32x2::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_p16)"]
Expand All @@ -14346,8 +14345,7 @@ pub unsafe fn vld1_dup_f32(ptr: *const f32) -> float32x2_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1_dup_p16(ptr: *const p16) -> poly16x4_t {
let x = vld1_lane_p16::<0>(ptr, transmute(u16x4::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0])
transmute(u16x4::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_p8)"]
Expand All @@ -14370,8 +14368,7 @@ pub unsafe fn vld1_dup_p16(ptr: *const p16) -> poly16x4_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1_dup_p8(ptr: *const p8) -> poly8x8_t {
let x = vld1_lane_p8::<0>(ptr, transmute(u8x8::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
transmute(u8x8::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_s16)"]
Expand All @@ -14394,8 +14391,7 @@ pub unsafe fn vld1_dup_p8(ptr: *const p8) -> poly8x8_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1_dup_s16(ptr: *const i16) -> int16x4_t {
let x = vld1_lane_s16::<0>(ptr, transmute(i16x4::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0])
transmute(i16x4::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_s32)"]
Expand All @@ -14418,8 +14414,7 @@ pub unsafe fn vld1_dup_s16(ptr: *const i16) -> int16x4_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1_dup_s32(ptr: *const i32) -> int32x2_t {
let x = vld1_lane_s32::<0>(ptr, transmute(i32x2::splat(0)));
simd_shuffle!(x, x, [0, 0])
transmute(i32x2::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_s8)"]
Expand All @@ -14442,8 +14437,7 @@ pub unsafe fn vld1_dup_s32(ptr: *const i32) -> int32x2_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1_dup_s8(ptr: *const i8) -> int8x8_t {
let x = vld1_lane_s8::<0>(ptr, transmute(i8x8::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
transmute(i8x8::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_u16)"]
Expand All @@ -14466,8 +14460,7 @@ pub unsafe fn vld1_dup_s8(ptr: *const i8) -> int8x8_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1_dup_u16(ptr: *const u16) -> uint16x4_t {
let x = vld1_lane_u16::<0>(ptr, transmute(u16x4::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0])
transmute(u16x4::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_u32)"]
Expand All @@ -14490,8 +14483,7 @@ pub unsafe fn vld1_dup_u16(ptr: *const u16) -> uint16x4_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1_dup_u32(ptr: *const u32) -> uint32x2_t {
let x = vld1_lane_u32::<0>(ptr, transmute(u32x2::splat(0)));
simd_shuffle!(x, x, [0, 0])
transmute(u32x2::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_u8)"]
Expand All @@ -14514,8 +14506,7 @@ pub unsafe fn vld1_dup_u32(ptr: *const u32) -> uint32x2_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1_dup_u8(ptr: *const u8) -> uint8x8_t {
let x = vld1_lane_u8::<0>(ptr, transmute(u8x8::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
transmute(u8x8::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_f32)"]
Expand All @@ -14538,8 +14529,7 @@ pub unsafe fn vld1_dup_u8(ptr: *const u8) -> uint8x8_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1q_dup_f32(ptr: *const f32) -> float32x4_t {
let x = vld1q_lane_f32::<0>(ptr, transmute(f32x4::splat(0.0)));
simd_shuffle!(x, x, [0, 0, 0, 0])
transmute(f32x4::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_p16)"]
Expand All @@ -14562,8 +14552,7 @@ pub unsafe fn vld1q_dup_f32(ptr: *const f32) -> float32x4_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1q_dup_p16(ptr: *const p16) -> poly16x8_t {
let x = vld1q_lane_p16::<0>(ptr, transmute(u16x8::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
transmute(u16x8::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_p8)"]
Expand All @@ -14586,8 +14575,7 @@ pub unsafe fn vld1q_dup_p16(ptr: *const p16) -> poly16x8_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1q_dup_p8(ptr: *const p8) -> poly8x16_t {
let x = vld1q_lane_p8::<0>(ptr, transmute(u8x16::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
transmute(u8x16::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_s16)"]
Expand All @@ -14610,8 +14598,7 @@ pub unsafe fn vld1q_dup_p8(ptr: *const p8) -> poly8x16_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1q_dup_s16(ptr: *const i16) -> int16x8_t {
let x = vld1q_lane_s16::<0>(ptr, transmute(i16x8::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
transmute(i16x8::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_s32)"]
Expand All @@ -14634,8 +14621,7 @@ pub unsafe fn vld1q_dup_s16(ptr: *const i16) -> int16x8_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1q_dup_s32(ptr: *const i32) -> int32x4_t {
let x = vld1q_lane_s32::<0>(ptr, transmute(i32x4::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0])
transmute(i32x4::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_s64)"]
Expand All @@ -14658,8 +14644,7 @@ pub unsafe fn vld1q_dup_s32(ptr: *const i32) -> int32x4_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1q_dup_s64(ptr: *const i64) -> int64x2_t {
let x = vld1q_lane_s64::<0>(ptr, transmute(i64x2::splat(0)));
simd_shuffle!(x, x, [0, 0])
transmute(i64x2::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_s8)"]
Expand All @@ -14682,8 +14667,7 @@ pub unsafe fn vld1q_dup_s64(ptr: *const i64) -> int64x2_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1q_dup_s8(ptr: *const i8) -> int8x16_t {
let x = vld1q_lane_s8::<0>(ptr, transmute(i8x16::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
transmute(i8x16::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_u16)"]
Expand All @@ -14706,8 +14690,7 @@ pub unsafe fn vld1q_dup_s8(ptr: *const i8) -> int8x16_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1q_dup_u16(ptr: *const u16) -> uint16x8_t {
let x = vld1q_lane_u16::<0>(ptr, transmute(u16x8::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
transmute(u16x8::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_u32)"]
Expand All @@ -14730,8 +14713,7 @@ pub unsafe fn vld1q_dup_u16(ptr: *const u16) -> uint16x8_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1q_dup_u32(ptr: *const u32) -> uint32x4_t {
let x = vld1q_lane_u32::<0>(ptr, transmute(u32x4::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0])
transmute(u32x4::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_u64)"]
Expand All @@ -14754,8 +14736,7 @@ pub unsafe fn vld1q_dup_u32(ptr: *const u32) -> uint32x4_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1q_dup_u64(ptr: *const u64) -> uint64x2_t {
let x = vld1q_lane_u64::<0>(ptr, transmute(u64x2::splat(0)));
simd_shuffle!(x, x, [0, 0])
transmute(u64x2::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_dup_u8)"]
Expand All @@ -14778,8 +14759,7 @@ pub unsafe fn vld1q_dup_u64(ptr: *const u64) -> uint64x2_t {
unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
)]
pub unsafe fn vld1q_dup_u8(ptr: *const u8) -> uint8x16_t {
let x = vld1q_lane_u8::<0>(ptr, transmute(u8x16::splat(0)));
simd_shuffle!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
transmute(u8x16::splat(*ptr))
}
#[doc = "Load one single-element structure and Replicate to all lanes (of one register)."]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_dup_p64)"]
Expand Down
51 changes: 24 additions & 27 deletions crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14138,6 +14138,7 @@ intrinsics:
doc: "Load one single-element structure and Replicate to all lanes (of one register)."
arguments: ["ptr: {type[1]}"]
return_type: "{neon_type[2]}"
big_endian_inverse: false
attr:
- *neon-v7
- FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['"{type[3]}"']] } ]]
Expand All @@ -14147,40 +14148,36 @@ intrinsics:
safety:
unsafe: [neon]
types:
- ['vld1_dup_s8', '*const i8', 'int8x8_t', 'vld1.8', 'ld1r', 'vld1_lane_s8::<0>', 'i8x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]']
- ['vld1_dup_u8', '*const u8', 'uint8x8_t', 'vld1.8', 'ld1r', 'vld1_lane_u8::<0>', 'u8x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]']
- ['vld1_dup_p8', '*const p8', 'poly8x8_t', 'vld1.8', 'ld1r', 'vld1_lane_p8::<0>', 'u8x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]']
- ['vld1_dup_s8', '*const i8', 'int8x8_t', 'vld1.8', 'ld1r', 'i8x8::splat']
- ['vld1_dup_u8', '*const u8', 'uint8x8_t', 'vld1.8', 'ld1r', 'u8x8::splat']
- ['vld1_dup_p8', '*const p8', 'poly8x8_t', 'vld1.8', 'ld1r', 'u8x8::splat']

- ['vld1q_dup_s8', '*const i8', 'int8x16_t', 'vld1.8', 'ld1r', 'vld1q_lane_s8::<0>', 'i8x16::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]']
- ['vld1q_dup_u8', '*const u8', 'uint8x16_t', 'vld1.8', 'ld1r', 'vld1q_lane_u8::<0>', 'u8x16::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]']
- ['vld1q_dup_p8', '*const p8', 'poly8x16_t', 'vld1.8', 'ld1r', 'vld1q_lane_p8::<0>', 'u8x16::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]']
- ['vld1q_dup_s8', '*const i8', 'int8x16_t', 'vld1.8', 'ld1r', 'i8x16::splat']
- ['vld1q_dup_u8', '*const u8', 'uint8x16_t', 'vld1.8', 'ld1r', 'u8x16::splat']
- ['vld1q_dup_p8', '*const p8', 'poly8x16_t', 'vld1.8', 'ld1r', 'u8x16::splat']

- ['vld1_dup_s16', '*const i16', 'int16x4_t', 'vld1.16', 'ld1r', 'vld1_lane_s16::<0>', 'i16x4::splat(0)', '[0, 0, 0, 0]']
- ['vld1_dup_u16', '*const u16', 'uint16x4_t', 'vld1.16', 'ld1r', 'vld1_lane_u16::<0>', 'u16x4::splat(0)', '[0, 0, 0, 0]']
- ['vld1_dup_p16', '*const p16', 'poly16x4_t', 'vld1.16', 'ld1r', 'vld1_lane_p16::<0>', 'u16x4::splat(0)', '[0, 0, 0, 0]']
- ['vld1_dup_s16', '*const i16', 'int16x4_t', 'vld1.16', 'ld1r', 'i16x4::splat']
- ['vld1_dup_u16', '*const u16', 'uint16x4_t', 'vld1.16', 'ld1r', 'u16x4::splat']
- ['vld1_dup_p16', '*const p16', 'poly16x4_t', 'vld1.16', 'ld1r', 'u16x4::splat']

- ['vld1q_dup_s16', '*const i16', 'int16x8_t', 'vld1.16', 'ld1r', 'vld1q_lane_s16::<0>', 'i16x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]']
- ['vld1q_dup_u16', '*const u16', 'uint16x8_t', 'vld1.16', 'ld1r', 'vld1q_lane_u16::<0>', 'u16x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]']
- ['vld1q_dup_p16', '*const p16', 'poly16x8_t', 'vld1.16', 'ld1r', 'vld1q_lane_p16::<0>', 'u16x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]']
- ['vld1q_dup_s16', '*const i16', 'int16x8_t', 'vld1.16', 'ld1r', 'i16x8::splat']
- ['vld1q_dup_u16', '*const u16', 'uint16x8_t', 'vld1.16', 'ld1r', 'u16x8::splat']
- ['vld1q_dup_p16', '*const p16', 'poly16x8_t', 'vld1.16', 'ld1r', 'u16x8::splat']

- ['vld1_dup_s32', '*const i32', 'int32x2_t', 'vld1.32', 'ld1r', 'vld1_lane_s32::<0>', 'i32x2::splat(0)', '[0, 0]']
- ['vld1_dup_u32', '*const u32', 'uint32x2_t', 'vld1.32', 'ld1r', 'vld1_lane_u32::<0>', 'u32x2::splat(0)', '[0, 0]']
- ['vld1_dup_f32', '*const f32', 'float32x2_t', 'vld1.32', 'ld1r', 'vld1_lane_f32::<0>', 'f32x2::splat(0.0)', '[0, 0]']
- ['vld1_dup_s32', '*const i32', 'int32x2_t', 'vld1.32', 'ld1r', 'i32x2::splat']
- ['vld1_dup_u32', '*const u32', 'uint32x2_t', 'vld1.32', 'ld1r', 'u32x2::splat']
- ['vld1_dup_f32', '*const f32', 'float32x2_t', 'vld1.32', 'ld1r', 'f32x2::splat']

- ['vld1q_dup_s32', '*const i32', 'int32x4_t', 'vld1.32', 'ld1r', 'vld1q_lane_s32::<0>', 'i32x4::splat(0)', '[0, 0, 0, 0]']
- ['vld1q_dup_u32', '*const u32', 'uint32x4_t', 'vld1.32', 'ld1r', 'vld1q_lane_u32::<0>', 'u32x4::splat(0)', '[0, 0, 0, 0]']
- ['vld1q_dup_f32', '*const f32', 'float32x4_t', 'vld1.32', 'ld1r', 'vld1q_lane_f32::<0>', 'f32x4::splat(0.0)', '[0, 0, 0, 0]']
- ['vld1q_dup_s32', '*const i32', 'int32x4_t', 'vld1.32', 'ld1r', 'i32x4::splat']
- ['vld1q_dup_u32', '*const u32', 'uint32x4_t', 'vld1.32', 'ld1r', 'u32x4::splat']
- ['vld1q_dup_f32', '*const f32', 'float32x4_t', 'vld1.32', 'ld1r', 'f32x4::splat']

- ['vld1q_dup_s64', '*const i64', 'int64x2_t', 'vldr', 'ld1', 'vld1q_lane_s64::<0>', 'i64x2::splat(0)', '[0, 0]']
- ['vld1q_dup_u64', '*const u64', 'uint64x2_t', 'vldr', 'ld1', 'vld1q_lane_u64::<0>', 'u64x2::splat(0)', '[0, 0]']
- ['vld1q_dup_s64', '*const i64', 'int64x2_t', 'vldr', 'ld1', 'i64x2::splat']
- ['vld1q_dup_u64', '*const u64', 'uint64x2_t', 'vldr', 'ld1', 'u64x2::splat']
compose:
- Let:
- x
- FnCall:
- '{type[5]}'
- - ptr
- FnCall: [transmute, ['{type[6]}']]
- FnCall: ['simd_shuffle!', [x, x, '{type[7]}']]
- FnCall:
- transmute
- - FnCall: ['{type[5]}', ["*ptr"]]

- name: "{type[0]}"
doc: "Absolute difference and accumulate (64-bit)"
Expand Down