AArch64: Optimize ipred_h_8bpc_neon

Optimize ipred_h_8bpc_neon using simpler stores and simpler indexing.

Relative runtime after this patch on some Cortex CPUs:

ipred_h:          w4      w8     w16     w32     w64
Cortex-A55:     1.054x  1.054x  0.978x  1.149x  1.097x
Cortex-A510:    0.455x  0.970x  0.973x  1.010x  1.002x
Cortex-A520:    0.973x  0.975x  0.979x  1.002x  1.000x
Cortex-A76:     0.791x  0.934x  0.912x  1.010x  0.999x
Cortex-A78:     0.771x  0.933x  0.957x  0.519x  0.510x
Cortex-A715:    0.838x  0.860x  0.893x  0.585x  0.661x
Cortex-A720:    0.839x  0.860x  0.892x  0.580x  0.659x
Cortex-A725:    0.809x  0.837x  0.871x  0.580x  0.660x
Cortex-X1:      0.973x  0.982x  0.989x  0.498x  0.660x
Cortex-X3:      0.971x  0.992x  0.987x  0.495x  0.661x
Cortex-X925:    0.950x  1.000x  1.000x  0.474x  0.655x
2026-06-11 04:03:05 +00:00 · 2026-04-16 16:02:28 +02:00
parent 47e2607e6c
commit c5726277ff
1 changed files with 37 additions and 32 deletions
@@ -195,78 +195,83 @@ function ipred_h_8bpc_neon, export=1
        clz             w3,  w3
        movrel          x5,  ipred_h_tbl
        sub             w3,  w3,  #25
-        ldrsw           x3,  [x5, w3, uxtw #2]
        sub             x2,  x2,  #4
-        add             x5,  x5,  x3
+        ldrsw           x3,  [x5, x3, lsl #2]   // x3 = sign-extended 32-bit table entry; the w3 write above already zeroed x3[63:32]
        mov             x7,  #-4
+        add             x6,  x1,  x1            // 2 * stride, 4..16 blocks only
+        add             x5,  x5,  x3            // x5 = ipred_h_tbl + entry = target of the br below
+        add             x8,  x1,  x1,  lsl #1   // 3 * stride, 4..16 blocks only
        br              x5
 40:
        AARCH64_VALID_JUMP_TARGET
 4:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
-        st1             {v3.s}[0],  [x0], x1
-        st1             {v2.s}[0],  [x6], x1
        subs            w4,  w4,  #4
-        st1             {v1.s}[0],  [x0], x1
-        st1             {v0.s}[0],  [x6], x1
+        str             s3,  [x0]               // row 0: 4 bytes of v3 (ld4r: v3 = byte at [x2+3] ... v0 = byte at [x2])
+        str             s2,  [x0, x1]           // row 1: 4 bytes of v2
+        str             s1,  [x0, x6]           // row 2: 4 bytes of v1
+        str             s0,  [x0, x8]           // row 3: 4 bytes of v0
+        add             x0,  x0,  x1,  lsl #2   // x0 += 4 * stride for the next group of 4 rows
        b.gt            4b
        ret
 80:
        AARCH64_VALID_JUMP_TARGET
 8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
-        st1             {v3.8b},  [x0], x1
-        st1             {v2.8b},  [x6], x1
        subs            w4,  w4,  #4
-        st1             {v1.8b},  [x0], x1
-        st1             {v0.8b},  [x6], x1
+        str             d3,  [x0]               // row 0: 8 bytes of v3
+        str             d2,  [x0, x1]           // row 1: 8 bytes of v2
+        str             d1,  [x0, x6]           // row 2: 8 bytes of v1
+        str             d0,  [x0, x8]           // row 3: 8 bytes of v0
+        add             x0,  x0,  x1,  lsl #2   // x0 += 4 * stride for the next group of 4 rows
        b.gt            8b
        ret
 160:
        AARCH64_VALID_JUMP_TARGET
 16:
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
-        st1             {v3.16b}, [x0], x1
-        st1             {v2.16b}, [x6], x1
        subs            w4,  w4,  #4
-        st1             {v1.16b}, [x0], x1
-        st1             {v0.16b}, [x6], x1
+        str             q3,  [x0]               // row 0: 16 bytes of v3
+        str             q2,  [x0, x1]           // row 1: 16 bytes of v2
+        str             q1,  [x0, x6]           // row 2: 16 bytes of v1
+        str             q0,  [x0, x8]           // row 3: 16 bytes of v0
+        add             x0,  x0,  x1,  lsl #2   // x0 += 4 * stride for the next group of 4 rows
        b.gt            16b
        ret
 320:
        AARCH64_VALID_JUMP_TARGET
+        add             x6,  x0,  x1            // x6 = x0 + stride: row-1 pointer (replaces the 2 * stride value set before the br)
 32:
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
-        str             q3,  [x0, #16]
-        str             q2,  [x6, #16]
-        st1             {v3.16b}, [x0], x1
-        st1             {v2.16b}, [x6], x1
+        stp             q3,  q3,  [x0]          // row 0: v3 stored twice = 32 bytes
+        add             x0,  x0,  x1,  lsl #1   // x0 += 2 * stride -> row 2
+        stp             q2,  q2,  [x6]          // row 1: v2 stored twice = 32 bytes
+        add             x6,  x6,  x1,  lsl #1   // x6 += 2 * stride -> row 3
        subs            w4,  w4,  #4
-        str             q1,  [x0, #16]
-        str             q0,  [x6, #16]
-        st1             {v1.16b}, [x0], x1
-        st1             {v0.16b}, [x6], x1
+        stp             q1,  q1,  [x0]          // row 2: v1 stored twice = 32 bytes
+        add             x0,  x0,  x1,  lsl #1   // x0 += 2 * stride -> row 0 of the next group
+        stp             q0,  q0,  [x6]          // row 3: v0 stored twice = 32 bytes
+        add             x6,  x6,  x1,  lsl #1   // x6 += 2 * stride -> row 1 of the next group
        b.gt            32b
        ret
 640:
        AARCH64_VALID_JUMP_TARGET
+        add             x6,  x0,  x1            // x6 = x0 + stride: row-1 pointer (replaces the 2 * stride value set before the br)
 64:
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
-        str             q3,  [x0, #16]
-        str             q2,  [x6, #16]
+        stp             q3,  q3,  [x0]          // row 0, bytes 0..31 (bytes 32..63 come from the following stp)
        stp             q3,  q3,  [x0, #32]
+        add             x0,  x0,  x1,  lsl #1   // x0 += 2 * stride -> row 2
+        stp             q2,  q2,  [x6]          // row 1, bytes 0..31
        stp             q2,  q2,  [x6, #32]
-        st1             {v3.16b}, [x0], x1
-        st1             {v2.16b}, [x6], x1
+        add             x6,  x6,  x1,  lsl #1   // x6 += 2 * stride -> row 3
        subs            w4,  w4,  #4
-        str             q1,  [x0, #16]
-        str             q0,  [x6, #16]
+        stp             q1,  q1,  [x0]          // row 2, bytes 0..31
        stp             q1,  q1,  [x0, #32]
+        add             x0,  x0,  x1,  lsl #1   // x0 += 2 * stride -> row 0 of the next group
+        stp             q0,  q0,  [x6]          // row 3, bytes 0..31
        stp             q0,  q0,  [x6, #32]
-        st1             {v1.16b}, [x0], x1
-        st1             {v0.16b}, [x6], x1
+        add             x6,  x6,  x1,  lsl #1   // x6 += 2 * stride -> row 1 of the next group
        b.gt            64b
        ret
 endfunc