AArch64: Optimize ipred_smooth_h_8bpc_neon further

Optimize ipred_smooth_h_8bpc_neon even further by using a vertical inner loop for the w >= 16 cases. Reorder instructions in the w = 4 handler for small CPUs.

Relative runtime after this patch on some Cortex CPUs:

ipred_smooth_h:  w4      w8      w16     w32     w64
Cortex-A55:      0.964x  1.003x  0.891x  0.979x  1.030x
Cortex-A510:     0.952x  0.936x  0.928x  1.004x  1.050x
Cortex-A520:     0.921x  0.925x  0.921x  0.995x  1.032x
Cortex-A76:      0.993x  1.005x  0.977x  0.995x  0.996x
Cortex-A78:      0.991x  0.998x  1.042x  0.978x  1.015x
Cortex-A710:     1.020x  0.966x  1.015x  1.015x  1.008x
Cortex-A715:     1.026x  1.051x  1.039x  1.007x  1.024x
Cortex-A720:     0.954x  0.999x  1.018x  0.999x  1.020x
Cortex-A725:     0.962x  1.000x  1.018x  1.000x  1.021x
Cortex-X1:       1.019x  0.993x  0.924x  0.983x  0.989x
Cortex-X2:       1.013x  0.991x  0.872x  0.964x  1.023x
Cortex-X3:       1.030x  0.996x  0.840x  0.953x  1.024x
Cortex-X4:       1.026x  1.005x  0.952x  0.970x  0.986x
Cortex-X925:     1.000x  0.980x  0.865x  0.899x  0.892x
2026-06-11 04:03:05 +00:00 · 2026-05-20 13:44:22 +02:00
parent 1718ff9ade
commit a38236491a
1 changed file with 46 additions and 52 deletions
@@ -1231,27 +1231,27 @@ function ipred_smooth_h_8bpc_neon, export=1
        ld1r            {v5.16b},  [x12]          // right
        movi            v31.8b,  #128
        add             x8,  x8,  w3,  uxtw
-        add             x6,  x0,  x1
-        lsl             x1,  x1,  #1
        zip1            v31.16b, v31.16b, v5.16b  // right*256 + rnd
        cmp             w3,  #8
        b.gt            160f
        b.eq            80f
 40:
        ld1r            {v7.2s},  [x8]            // weights_hor
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
        uxtl            v7.8h,   v7.8b            // weights_hor
 4:
        ldr             s0,  [x2, #-4]!           // left
        zip1            v0.8b,   v0.8b,   v0.8b
        zip1            v0.16b,  v0.16b,  v0.16b  // replicate left[1..4]
-        usubl           v21.8h,  v0.8b,   v5.8b
-        usubl2          v20.8h,  v0.16b,  v5.16b  // left-right
-        mul             v21.8h,  v21.8h,  v7.8h
+        usubl2          v20.8h,  v0.16b,  v5.16b  // left-right, upper part, for flipped stores
+        usubl           v21.8h,  v0.8b,   v5.8b   // left-right, lower part, stored later
        mul             v20.8h,  v20.8h,  v7.8h   // right*256  + (left-right)*weights_hor
-        addhn           v21.8b,  v21.8h,  v31.8h
+        mul             v21.8h,  v21.8h,  v7.8h
        addhn           v20.8b,  v20.8h,  v31.8h
+        addhn           v21.8b,  v21.8h,  v31.8h
        subs            w4,  w4,  #4
-        st1             {v20.s}[1], [x0], x1
+        st1             {v20.s}[1], [x0], x1      // complete flip at store
        st1             {v20.s}[0], [x6], x1
        st1             {v21.s}[1], [x0], x1
        st1             {v21.s}[0], [x6], x1
@@ -1259,14 +1259,16 @@ function ipred_smooth_h_8bpc_neon, export=1
        ret
 80:
        ld1             {v7.8b},  [x8]            // weights_hor
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
        uxtl            v7.8h,   v7.8b            // weights_hor
 8:
        ldr             s0,  [x2, #-4]!           // left
        usubl           v0.8h,   v0.8b,   v5.8b
-        mul             v20.8h,   v7.8h,  v0.h[3] // right*256  + (left-right)*weights_hor
-        mul             v21.8h,   v7.8h,  v0.h[2] // (left flipped)
-        mul             v22.8h,   v7.8h,  v0.h[1]
-        mul             v23.8h,   v7.8h,  v0.h[0]
+        mul             v20.8h,  v7.8h,   v0.h[3] // right*256  + (left-right)*weights_hor
+        mul             v21.8h,  v7.8h,   v0.h[2] // (left flipped)
+        mul             v22.8h,  v7.8h,   v0.h[1]
+        mul             v23.8h,  v7.8h,   v0.h[0]
        addhn           v20.8b,  v20.8h,  v31.8h
        addhn           v21.8b,  v21.8h,  v31.8h
        addhn           v22.8b,  v22.8h,  v31.8h
@@ -1279,51 +1281,43 @@ function ipred_smooth_h_8bpc_neon, export=1
        b.gt            8b
        ret
 160:
-        // Set up pointers for four rows in parallel; x0, x6, x5, x10
-        add             x5,  x0,  x1
-        add             x10, x6,  x1
-        lsl             x1,  x1,  #1
-        mov             w9,  w3                   // x9 = uxtw(w3)
-        sub             x1,  x1,  w3,  uxtw
-1:
-        ldr             s0,  [x2, #-4]!           // left
-        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
-2:
-        ld1             {v7.16b}, [x8],   #16     // weights_hor
-        uxtl            v6.8h,   v7.8b            // weights_hor
-        uxtl2           v7.8h,   v7.16b
-        mul             v20.8h,  v6.8h,   v0.h[3] // right*256  + (left-right)*weights_hor
-        mul             v22.8h,  v6.8h,   v0.h[2]
-        mul             v21.8h,  v7.8h,   v0.h[3] // (left flipped)
-        mul             v23.8h,  v7.8h,   v0.h[2]
-        subs            w3,  w3,  #16
+        ldr             q7,  [x8],  #16           // weights_hor
+        mov             x12, x2
+        mov             w9,  w4
+        uxtl            v6.8h,   v7.8b            // weights_hor lower
+        uxtl2           v7.8h,   v7.16b           // weights_hor upper
+        mov             x5,  x0
+        add             x7,  x0,  x1,  lsl #1
+16:
+        ldr             s0,  [x12, #-4]!          // left
+        usubl           v0.8h,   v0.8b,   v5.8b
+        mul             v16.8h,  v6.8h,   v0.h[3] // right*256  + (left-right)*weights_hor
+        mul             v18.8h,  v6.8h,   v0.h[2] // (left flipped)
+        mul             v17.8h,  v7.8h,   v0.h[3] // right*256  + (left-right)*weights_hor
+        mul             v19.8h,  v7.8h,   v0.h[2] // (left flipped)
+        addhn           v16.8b,  v16.8h,  v31.8h
+        addhn           v18.8b,  v18.8h,  v31.8h
+        addhn2          v16.16b, v17.8h,  v31.8h
+        addhn2          v18.16b, v19.8h,  v31.8h
+        subs            w9,  w9,  #4
+        mul             v20.8h,  v6.8h,   v0.h[1]
+        mul             v22.8h,  v6.8h,   v0.h[0]
+        mul             v21.8h,  v7.8h,   v0.h[1]
+        mul             v23.8h,  v7.8h,   v0.h[0]
        addhn           v20.8b,  v20.8h,  v31.8h
        addhn           v22.8b,  v22.8h,  v31.8h
        addhn2          v20.16b, v21.8h,  v31.8h
        addhn2          v22.16b, v23.8h,  v31.8h
-        mul             v24.8h,  v6.8h,   v0.h[1]
-        mul             v26.8h,  v6.8h,   v0.h[0]
-        mul             v25.8h,  v7.8h,   v0.h[1]
-        mul             v27.8h,  v7.8h,   v0.h[0]
-        addhn           v24.8b,  v24.8h,  v31.8h
-        addhn           v26.8b,  v26.8h,  v31.8h
-        st1             {v20.16b}, [x0],  #16
-        addhn2          v24.16b, v25.8h,  v31.8h
-        st1             {v22.16b}, [x6],  #16
-        addhn2          v26.16b, v27.8h,  v31.8h
-        st1             {v24.16b}, [x5],  #16
-        st1             {v26.16b}, [x10], #16
-        b.gt            2b
-        subs            w4,  w4,  #4
-        b.le            9f
-        sub             x8,  x8,  x9
-        add             x0,  x0,  x1
-        add             x6,  x6,  x1
-        add             x5,  x5,  x1
-        add             x10, x10, x1
-        mov             w3,  w9
-        b               1b
-9:
+        str             q16, [x5]
+        str             q18, [x5,  x1]
+        add             x5,  x5,  x1,  lsl #2
+        str             q20, [x7]
+        str             q22, [x7,  x1]
+        add             x7,  x7,  x1,  lsl #2
+        b.gt            16b
+        sub             w3,  w3,  #16
+        add             x0,  x0,  #16
+        cbnz            w3,  160b
        ret
 endfunc