AArch64: Optimize ipred_v_8bpc_neon

Optimize the width = 4 case of ipred_v_8bpc_neon by using simple stores
instead of lane stores, which can improve performance on some CPUs.

Relative runtime after this patch on some Cortex CPUs (values below
1.000x mean the patched code is faster):

 ipred_v:       w4
Cortex-A55:   1.041x
Cortex-A510:  0.297x
Cortex-A520:  0.748x
Cortex-A76:   0.866x
Cortex-A78:   0.856x
Cortex-A715:  0.874x
Cortex-A720:  0.875x
Cortex-A725:  0.868x
Cortex-X1:    1.013x
Cortex-X3:    1.000x
Cortex-X925:  1.000x
This commit is contained in:
Arpad Panyik
2026-04-15 17:37:46 +02:00
parent aa4504729c
commit 47e2607e6c
+8 -6
View File
@@ -114,7 +114,7 @@ function ipred_v_8bpc_neon, export=1
clz w3, w3
movrel x5, ipred_v_tbl
sub w3, w3, #25
ldrsw x3, [x5, w3, uxtw #2]
ldrsw x3, [x5, x3, lsl #2]
add x2, x2, #1
add x5, x5, x3
add x6, x0, x1
@@ -122,13 +122,15 @@ function ipred_v_8bpc_neon, export=1
br x5
40:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.s}[0], [x2]
ldr s0, [x2]
4:
st1 {v0.s}[0], [x0], x1
st1 {v0.s}[0], [x6], x1
str s0, [x0]
subs w4, w4, #4
st1 {v0.s}[0], [x0], x1
st1 {v0.s}[0], [x6], x1
str s0, [x6]
str s0, [x0, x1]
add x0, x0, x1, lsl #1
str s0, [x6, x1]
add x6, x6, x1, lsl #1
b.gt 4b
ret
80: