mirror of
https://code.videolan.org/videolan/dav1d
synced 2026-06-11 04:03:05 +00:00
arm: mc: Optimize prep_neon for the w4/w8 cases
Use alternating registers for immediately sequential loads/stores,
and pack two 4-pixel rows into one register.

Before:                           Cortex A7     A8    A53    A55    A72    A73    A76
mct_8tap_regular_w4_0_8bpc_neon:      112.0   68.6   79.7   82.9   45.3   39.4   24.4
mct_8tap_regular_w8_0_8bpc_neon:      158.2   89.5  108.4  113.4   55.4   53.0   30.0
After:
mct_8tap_regular_w4_0_8bpc_neon:       89.7   69.9   76.3   85.1   36.2   35.2   25.0
mct_8tap_regular_w8_0_8bpc_neon:      149.0   92.7  102.6  115.8   56.6   52.8   31.4

The numbers aren't entirely consistent, but this is mostly favourable.
This commit is contained in:
+12
-7
@@ -948,21 +948,26 @@ L(prep_tbl):
         .word 640f - L(prep_tbl) + CONFIG_THUMB
         .word 320f - L(prep_tbl) + CONFIG_THUMB
         .word 160f - L(prep_tbl) + CONFIG_THUMB
-        .word 8f - L(prep_tbl) + CONFIG_THUMB
-        .word 4f - L(prep_tbl) + CONFIG_THUMB
+        .word 80f - L(prep_tbl) + CONFIG_THUMB
+        .word 40f - L(prep_tbl) + CONFIG_THUMB
 
+40:
+        add r9, r1, r2
+        lsl r2, r2, #1
 4:
-        vld1.32 {d0[]}, [r1], r2
-        vld1.32 {d2[]}, [r1], r2
+        vld1.32 {d0[]},  [r1], r2
+        vld1.32 {d0[1]}, [r9], r2
         subs r4, r4, #2
         vshll.u8 q0, d0, #4
-        vshll.u8 q1, d2, #4
-        vst1.16 {d1, d2}, [r0, :64]!
+        vst1.16 {d0, d1}, [r0, :64]!
         bgt 4b
         pop {r4-r11,pc}
+80:
+        add r9, r1, r2
+        lsl r2, r2, #1
 8:
         vld1.8 {d0}, [r1], r2
-        vld1.8 {d2}, [r1], r2
+        vld1.8 {d2}, [r9], r2
         subs r4, r4, #2
         vshll.u8 q0, d0, #4
         vshll.u8 q1, d2, #4
Reference in New Issue
Block a user