AArch64: Optimize ipred_v_8bpc_neon

Optimize the width = 4 case of ipred_v_8bpc_neon by using simple scalar stores (str) instead of single-lane stores (st1). The simple stores can improve performance on some CPUs.

Relative runtime after this patch on some Cortex CPUs (values below 1.000x are faster):

ipred_v: w4
Cortex-A55:  1.041x
Cortex-A510: 0.297x
Cortex-A520: 0.748x
Cortex-A76:  0.866x
Cortex-A78:  0.856x
Cortex-A715: 0.874x
Cortex-A720: 0.875x
Cortex-A725: 0.868x
Cortex-X1:   1.013x
Cortex-X3:   1.000x
Cortex-X925: 1.000x
2026-06-11 04:03:05 +00:00 · 2026-04-15 17:37:46 +02:00
parent aa4504729c
commit 47e2607e6c
1 changed files with 8 additions and 6 deletions
@@ -114,7 +114,7 @@ function ipred_v_8bpc_neon, export=1
        clz             w3,  w3
        movrel          x5,  ipred_v_tbl
        sub             w3,  w3,  #25
-        ldrsw           x3,  [x5, w3, uxtw #2]
+        ldrsw           x3,  [x5, x3, lsl #2]
        add             x2,  x2,  #1
        add             x5,  x5,  x3
        add             x6,  x0,  x1
@@ -122,13 +122,15 @@ function ipred_v_8bpc_neon, export=1
        br              x5
 40:
        AARCH64_VALID_JUMP_TARGET
-        ld1             {v0.s}[0],  [x2]
+        ldr             s0,  [x2]
 4:
-        st1             {v0.s}[0],  [x0], x1
-        st1             {v0.s}[0],  [x6], x1
+        str             s0,  [x0]
        subs            w4,  w4,  #4
-        st1             {v0.s}[0],  [x0], x1
-        st1             {v0.s}[0],  [x6], x1
+        str             s0,  [x6]
+        str             s0,  [x0, x1]
+        add             x0,  x0,  x1,  lsl #1
+        str             s0,  [x6, x1]
+        add             x6,  x6,  x1,  lsl #1
        b.gt            4b
        ret
 80: