arm: mc: Optimize prep_neon for the w4/w8 cases

Use alternating registers for immediately sequential loads/stores, and
pack two 4-pixel rows into one register.

Before:                           Cortex A7      A8     A53     A55     A72     A73     A76
mct_8tap_regular_w4_0_8bpc_neon:      112.0    68.6    79.7    82.9    45.3    39.4    24.4
mct_8tap_regular_w8_0_8bpc_neon:      158.2    89.5   108.4   113.4    55.4    53.0    30.0
After:
mct_8tap_regular_w4_0_8bpc_neon:       89.7    69.9    76.3    85.1    36.2    35.2    25.0
mct_8tap_regular_w8_0_8bpc_neon:      149.0    92.7   102.6   115.8    56.6    52.8    31.4

The numbers aren't entirely consistent across cores, but the change is mostly favourable.
This commit is contained in:
Martin Storsjö
2026-04-29 15:56:01 +03:00
parent 727d0f984b
commit 5cfc383268
+12 -7
View File
@@ -948,21 +948,26 @@ L(prep_tbl):
.word 640f - L(prep_tbl) + CONFIG_THUMB
.word 320f - L(prep_tbl) + CONFIG_THUMB
.word 160f - L(prep_tbl) + CONFIG_THUMB
.word 8f - L(prep_tbl) + CONFIG_THUMB
.word 4f - L(prep_tbl) + CONFIG_THUMB
.word 80f - L(prep_tbl) + CONFIG_THUMB
.word 40f - L(prep_tbl) + CONFIG_THUMB
40:
add r9, r1, r2
lsl r2, r2, #1
4:
vld1.32 {d0[]}, [r1], r2
vld1.32 {d2[]}, [r1], r2
vld1.32 {d0[]}, [r1], r2
vld1.32 {d0[1]}, [r9], r2
subs r4, r4, #2
vshll.u8 q0, d0, #4
vshll.u8 q1, d2, #4
vst1.16 {d1, d2}, [r0, :64]!
vst1.16 {d0, d1}, [r0, :64]!
bgt 4b
pop {r4-r11,pc}
80:
add r9, r1, r2
lsl r2, r2, #1
8:
vld1.8 {d0}, [r1], r2
vld1.8 {d2}, [r1], r2
vld1.8 {d2}, [r9], r2
subs r4, r4, #2
vshll.u8 q0, d0, #4
vshll.u8 q1, d2, #4