Skip to content

Commit c5e38b6

Browse files
committed
use intrinsics::simd for interleaving store of f16
1 parent 045da5d commit c5e38b6

3 files changed

Lines changed: 23 additions & 82 deletions

File tree

crates/core_arch/src/aarch64/neon/mod.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,6 +1050,14 @@ mod tests {
10501050
test_vld1q_f16_x2(f16, 16, float16x8x2_t, vst1q_f16_x2, vld1q_f16_x2);
10511051
test_vld1q_f16_x3(f16, 24, float16x8x3_t, vst1q_f16_x3, vld1q_f16_x3);
10521052
test_vld1q_f16_x4(f16, 32, float16x8x4_t, vst1q_f16_x4, vld1q_f16_x4);
1053+
1054+
test_vld2_f16_x2(f16, 8, float16x4x2_t, vst2_f16, vld2_f16);
1055+
test_vld2_f16_x3(f16, 12, float16x4x3_t, vst3_f16, vld3_f16);
1056+
test_vld2_f16_x4(f16, 16, float16x4x4_t, vst4_f16, vld4_f16);
1057+
1058+
test_vld2q_f16_x2(f16, 16, float16x8x2_t, vst2q_f16, vld2q_f16);
1059+
test_vld3q_f16_x3(f16, 24, float16x8x3_t, vst3q_f16, vld3q_f16);
1060+
test_vld4q_f16_x4(f16, 32, float16x8x4_t, vst4q_f16, vld4q_f16);
10531061
}
10541062

10551063
macro_rules! wide_store_load_roundtrip_aes {

crates/core_arch/src/arm_shared/neon/generated.rs

Lines changed: 6 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -65923,14 +65923,7 @@ pub unsafe fn vst1q_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x2_t) {
6592365923
#[cfg(not(target_arch = "arm64ec"))]
6592465924
#[cfg_attr(test, assert_instr(st2))]
6592565925
pub unsafe fn vst2_f16(a: *mut f16, b: float16x4x2_t) {
65926-
unsafe extern "unadjusted" {
65927-
#[cfg_attr(
65928-
any(target_arch = "aarch64", target_arch = "arm64ec"),
65929-
link_name = "llvm.aarch64.neon.st2.v4f16.p0"
65930-
)]
65931-
fn _vst2_f16(a: float16x4_t, b: float16x4_t, ptr: *mut i8);
65932-
}
65933-
_vst2_f16(b.0, b.1, a as _)
65926+
crate::core_arch::macros::interleaving_store!(f16, 4, 2, a, b)
6593465927
}
6593565928
#[doc = "Store multiple 2-element structures from two registers"]
6593665929
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_f16)"]
@@ -65944,14 +65937,7 @@ pub unsafe fn vst2_f16(a: *mut f16, b: float16x4x2_t) {
6594465937
#[cfg(not(target_arch = "arm64ec"))]
6594565938
#[cfg_attr(test, assert_instr(st2))]
6594665939
pub unsafe fn vst2q_f16(a: *mut f16, b: float16x8x2_t) {
65947-
unsafe extern "unadjusted" {
65948-
#[cfg_attr(
65949-
any(target_arch = "aarch64", target_arch = "arm64ec"),
65950-
link_name = "llvm.aarch64.neon.st2.v8f16.p0"
65951-
)]
65952-
fn _vst2q_f16(a: float16x8_t, b: float16x8_t, ptr: *mut i8);
65953-
}
65954-
_vst2q_f16(b.0, b.1, a as _)
65940+
crate::core_arch::macros::interleaving_store!(f16, 8, 2, a, b)
6595565941
}
6595665942
#[doc = "Store multiple 2-element structures from two registers"]
6595765943
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_f16)"]
@@ -67089,11 +67075,7 @@ pub unsafe fn vst2q_p16(a: *mut p16, b: poly16x8x2_t) {
6708967075
#[cfg(not(target_arch = "arm64ec"))]
6709067076
#[cfg_attr(test, assert_instr(vst3))]
6709167077
pub unsafe fn vst3_f16(a: *mut f16, b: float16x4x3_t) {
67092-
unsafe extern "unadjusted" {
67093-
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v4f16")]
67094-
fn _vst3_f16(ptr: *mut i8, a: float16x4_t, b: float16x4_t, c: float16x4_t, size: i32);
67095-
}
67096-
_vst3_f16(a as _, b.0, b.1, b.2, 2)
67078+
crate::core_arch::macros::interleaving_store!(f16, 4, 3, a, b)
6709767079
}
6709867080
#[doc = "Store multiple 3-element structures from three registers"]
6709967081
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_f16)"]
@@ -67108,11 +67090,7 @@ pub unsafe fn vst3_f16(a: *mut f16, b: float16x4x3_t) {
6710867090
#[cfg(not(target_arch = "arm64ec"))]
6710967091
#[cfg_attr(test, assert_instr(vst3))]
6711067092
pub unsafe fn vst3q_f16(a: *mut f16, b: float16x8x3_t) {
67111-
unsafe extern "unadjusted" {
67112-
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v8f16")]
67113-
fn _vst3q_f16(ptr: *mut i8, a: float16x8_t, b: float16x8_t, c: float16x8_t, size: i32);
67114-
}
67115-
_vst3q_f16(a as _, b.0, b.1, b.2, 2)
67093+
crate::core_arch::macros::interleaving_store!(f16, 8, 3, a, b)
6711667094
}
6711767095
#[doc = "Store multiple 3-element structures from three registers"]
6711867096
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_f16)"]
@@ -68385,14 +68363,7 @@ pub unsafe fn vst4q_f16(a: *mut f16, b: float16x8x4_t) {
6838568363
#[cfg(not(target_arch = "arm64ec"))]
6838668364
#[cfg_attr(test, assert_instr(st4))]
6838768365
pub unsafe fn vst4_f16(a: *mut f16, b: float16x4x4_t) {
68388-
unsafe extern "unadjusted" {
68389-
#[cfg_attr(
68390-
any(target_arch = "aarch64", target_arch = "arm64ec"),
68391-
link_name = "llvm.aarch64.neon.st4.v4f16.p0"
68392-
)]
68393-
fn _vst4_f16(a: float16x4_t, b: float16x4_t, c: float16x4_t, d: float16x4_t, ptr: *mut i8);
68394-
}
68395-
_vst4_f16(b.0, b.1, b.2, b.3, a as _)
68366+
crate::core_arch::macros::interleaving_store!(f16, 4, 4, a, b)
6839668367
}
6839768368
#[doc = "Store multiple 4-element structures from four registers"]
6839868369
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_f16)"]
@@ -68406,14 +68377,7 @@ pub unsafe fn vst4_f16(a: *mut f16, b: float16x4x4_t) {
6840668377
#[cfg(not(target_arch = "arm64ec"))]
6840768378
#[cfg_attr(test, assert_instr(st4))]
6840868379
pub unsafe fn vst4q_f16(a: *mut f16, b: float16x8x4_t) {
68409-
unsafe extern "unadjusted" {
68410-
#[cfg_attr(
68411-
any(target_arch = "aarch64", target_arch = "arm64ec"),
68412-
link_name = "llvm.aarch64.neon.st4.v8f16.p0"
68413-
)]
68414-
fn _vst4q_f16(a: float16x8_t, b: float16x8_t, c: float16x8_t, d: float16x8_t, ptr: *mut i8);
68415-
}
68416-
_vst4q_f16(b.0, b.1, b.2, b.3, a as _)
68380+
crate::core_arch::macros::interleaving_store!(f16, 8, 4, a, b)
6841768381
}
6841868382
#[doc = "Store multiple 4-element structures from four registers"]
6841968383
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_f32)"]

crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml

Lines changed: 9 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -5117,19 +5117,10 @@ intrinsics:
51175117
safety:
51185118
unsafe: [neon]
51195119
types:
5120-
- [f16, float16x4x2_t, float16x4_t]
5121-
- [f16, float16x8x2_t, float16x8_t]
5120+
- [f16, float16x4x2_t, "4"]
5121+
- [f16, float16x8x2_t, "8"]
51225122
compose:
5123-
- LLVMLink:
5124-
name: 'st2.{neon_type[1]}'
5125-
arguments:
5126-
- 'a: {type[2]}'
5127-
- 'b: {type[2]}'
5128-
- 'ptr: *mut i8'
5129-
links:
5130-
- link: 'llvm.aarch64.neon.st2.v{neon_type[1].lane}{type[0]}.p0'
5131-
arch: aarch64,arm64ec
5132-
- FnCall: ['_vst2{neon_type[1].nox}', ['b.0', 'b.1', 'a as _']]
5123+
- FnCall: ["crate::core_arch::macros::interleaving_store!", [{ Type: "{type[0]}" }, "{type[2]}", "2", a, b], [], true]
51335124

51345125

51355126
- name: "vst2{neon_type[1].nox}"
@@ -5546,21 +5537,10 @@ intrinsics:
55465537
safety:
55475538
unsafe: [neon]
55485539
types:
5549-
- [f16, float16x4x3_t, float16x4_t, '2']
5550-
- [f16, float16x8x3_t, float16x8_t, '2']
5540+
- [f16, float16x4x3_t, "4"]
5541+
- [f16, float16x8x3_t, "8"]
55515542
compose:
5552-
- LLVMLink:
5553-
name: 'vst3.{neon_type[1]}'
5554-
arguments:
5555-
- 'ptr: *mut i8'
5556-
- 'a: {type[2]}'
5557-
- 'b: {type[2]}'
5558-
- 'c: {type[2]}'
5559-
- 'size: i32'
5560-
links:
5561-
- link: 'llvm.arm.neon.vst3.p0.v{neon_type[1].lane}{type[0]}'
5562-
arch: arm
5563-
- FnCall: ['_vst3{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', 'b.2', "{type[3]}"]]
5543+
- FnCall: ["crate::core_arch::macros::interleaving_store!", [{ Type: "{type[0]}" }, "{type[2]}", "3", a, b], [], true]
55645544

55655545

55665546
- name: "vst3{neon_type[1].lane_nox}"
@@ -6054,21 +6034,10 @@ intrinsics:
60546034
safety:
60556035
unsafe: [neon]
60566036
types:
6057-
- [f16, float16x4x4_t, float16x4_t]
6058-
- [f16, float16x8x4_t, float16x8_t]
6037+
- [f16, float16x4x4_t, "4"]
6038+
- [f16, float16x8x4_t, "8"]
60596039
compose:
6060-
- LLVMLink:
6061-
name: 'vst4.{neon_type[1]}'
6062-
arguments:
6063-
- 'a: {type[2]}'
6064-
- 'b: {type[2]}'
6065-
- 'c: {type[2]}'
6066-
- 'd: {type[2]}'
6067-
- 'ptr: *mut i8'
6068-
links:
6069-
- link: 'llvm.aarch64.neon.st4.v{neon_type[1].lane}{type[0]}.p0'
6070-
arch: aarch64,arm64ec
6071-
- FnCall: ['_vst4{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'a as _']]
6040+
- FnCall: ["crate::core_arch::macros::interleaving_store!", [{ Type: "{type[0]}" }, "{type[2]}", "4", a, b], [], true]
60726041

60736042

60746043
- name: "vst4{neon_type[1].lane_nox}"

0 commit comments

Comments
 (0)