AArch64: Optimize ipred_smooth_h_8bpc_neon

Optimize ipred_smooth_h_8bpc_neon using simpler arithmetic operations.

Relative runtime after this patch on some Cortex CPUs:

ipred_smooth_h:       w4      w8     w16     w32     w64
Cortex-A55:       1.015x  0.857x  0.819x  0.835x  0.862x
Cortex-A510:      0.988x  0.860x  0.915x  0.879x  0.837x
Cortex-A520:      0.999x  0.883x  0.967x  0.929x  0.873x
Cortex-A76:       0.804x  0.637x  0.517x  0.573x  0.613x
Cortex-A78:       0.800x  0.586x  0.548x  0.639x  0.640x
Cortex-A715:      0.722x  0.642x  0.563x  0.627x  0.646x
Cortex-A725:      0.710x  0.639x  0.567x  0.622x  0.645x
Cortex-X1:        0.758x  0.570x  0.565x  0.548x  0.557x
Cortex-X3:        0.789x  0.589x  0.528x  0.563x  0.571x
Cortex-X925:      0.855x  0.739x  0.541x  0.551x  0.567x
2026-06-11 04:03:05 +00:00 · 2026-05-06 20:18:03 +00:00
parent 037430193a
commit 4db1a05aad
1 changed files with 53 additions and 94 deletions
@@ -1261,130 +1261,97 @@ endjumptable
 //                               const int width, const int height, const int a,
 //                               const int max_width, const int max_height);
 function ipred_smooth_h_8bpc_neon, export=1
+        add             x12, x2,  w3,  uxtw
        movrel          x8,  X(sm_weights)
-        add             x8,  x8,  w3, uxtw
-        clz             w9,  w3
-        movrel          x5,  ipred_smooth_h_tbl
-        add             x12, x2,  w3, uxtw
-        sub             w9,  w9,  #25
-        ldrsw           x9,  [x5, w9, uxtw #2]
-        ld1r            {v5.16b},  [x12] // right
-        add             x5,  x5,  x9
+        ld1r            {v5.16b},  [x12]          // right
+        movi            v31.8b,  #128
+        add             x8,  x8,  w3,  uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
-        br              x5
+        zip1            v31.16b, v31.16b, v5.16b  // right*256 + rnd
+        cmp             w3,  #8
+        b.gt            160f
+        b.eq            80f
 40:
-        AARCH64_VALID_JUMP_TARGET
-        ld1r            {v7.2s}, [x8]             // weights_hor
-        sub             x2,  x2,  #4
-        mov             x7,  #-4
+        ld1r            {v7.2s},  [x8]            // weights_hor
        uxtl            v7.8h,   v7.8b            // weights_hor
 4:
-        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
-        shll            v20.8h,  v5.8b,   #8      // right*256
-        shll            v21.8h,  v5.8b,   #8
-        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
-        zip1            v0.2s,   v3.2s,   v2.2s
-        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
-        usubl           v1.8h,   v1.8b,   v5.8b
-        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
-        mla             v21.8h,  v1.8h,   v7.8h
-        rshrn           v20.8b,  v20.8h,  #8
-        rshrn           v21.8b,  v21.8h,  #8
-        st1             {v20.s}[0], [x0], x1
-        st1             {v20.s}[1], [x6], x1
+        ldr             s0,  [x2, #-4]!           // left
+        zip1            v0.8b,   v0.8b,   v0.8b
+        zip1            v0.16b,  v0.16b,  v0.16b  // replicate left[1..4]
+        usubl           v21.8h,  v0.8b,   v5.8b
+        usubl2          v20.8h,  v0.16b,  v5.16b  // left-right
+        mul             v21.8h,  v21.8h,  v7.8h
+        mul             v20.8h,  v20.8h,  v7.8h   // right*256  + (left-right)*weights_hor
+        addhn           v21.8b,  v21.8h,  v31.8h
+        addhn           v20.8b,  v20.8h,  v31.8h
        subs            w4,  w4,  #4
-        st1             {v21.s}[0], [x0], x1
-        st1             {v21.s}[1], [x6], x1
+        st1             {v20.s}[1], [x0], x1
+        st1             {v20.s}[0], [x6], x1
+        st1             {v21.s}[1], [x0], x1
+        st1             {v21.s}[0], [x6], x1
        b.gt            4b
        ret
 80:
-        AARCH64_VALID_JUMP_TARGET
-        ld1             {v7.8b}, [x8]             // weights_hor
-        sub             x2,  x2,  #4
-        mov             x7,  #-4
+        ld1             {v7.8b},  [x8]            // weights_hor
        uxtl            v7.8h,   v7.8b            // weights_hor
 8:
-        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
-        shll            v20.8h,  v5.8b,   #8      // right*256
-        shll            v21.8h,  v5.8b,   #8
-        shll            v22.8h,  v5.8b,   #8
-        shll            v23.8h,  v5.8b,   #8
-        usubl           v3.8h,   v3.8b,   v5.8b   // left-right
-        usubl           v2.8h,   v2.8b,   v5.8b
-        usubl           v1.8h,   v1.8b,   v5.8b
+        ldr             s0,  [x2, #-4]!           // left
        usubl           v0.8h,   v0.8b,   v5.8b
-        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
-        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
-        mla             v22.8h,  v1.8h,   v7.8h
-        mla             v23.8h,  v0.8h,   v7.8h
-        rshrn           v20.8b,  v20.8h,  #8
-        rshrn           v21.8b,  v21.8h,  #8
-        rshrn           v22.8b,  v22.8h,  #8
-        rshrn           v23.8b,  v23.8h,  #8
+        mul             v20.8h,   v7.8h,  v0.h[3] // right*256  + (left-right)*weights_hor
+        mul             v21.8h,   v7.8h,  v0.h[2] // (left flipped)
+        mul             v22.8h,   v7.8h,  v0.h[1]
+        mul             v23.8h,   v7.8h,  v0.h[0]
+        addhn           v20.8b,  v20.8h,  v31.8h
+        addhn           v21.8b,  v21.8h,  v31.8h
+        addhn           v22.8b,  v22.8h,  v31.8h
+        addhn           v23.8b,  v23.8h,  v31.8h
+        subs            w4,  w4,  #4
        st1             {v20.8b}, [x0], x1
        st1             {v21.8b}, [x6], x1
-        subs            w4,  w4,  #4
        st1             {v22.8b}, [x0], x1
        st1             {v23.8b}, [x6], x1
        b.gt            8b
        ret
 160:
-320:
-640:
-        AARCH64_VALID_JUMP_TARGET
-        sub             x2,  x2,  #4
-        mov             x7,  #-4
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1
-        sub             x1,  x1,  w3, uxtw
-        mov             w9,  w3
-
+        mov             w9,  w3                   // x9 = uxtw(w3)
+        sub             x1,  x1,  w3,  uxtw
 1:
-        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},   [x2],  x7 // left
+        ldr             s0,  [x2, #-4]!           // left
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
-        usubl           v1.8h,   v1.8b,   v5.8b
-        usubl           v2.8h,   v2.8b,   v5.8b
-        usubl           v3.8h,   v3.8b,   v5.8b
 2:
        ld1             {v7.16b}, [x8],   #16     // weights_hor
-        shll            v20.8h,  v5.8b,   #8      // right*256
-        shll            v21.8h,  v5.8b,   #8
-        shll            v22.8h,  v5.8b,   #8
-        shll            v23.8h,  v5.8b,   #8
-        shll            v24.8h,  v5.8b,   #8
-        shll            v25.8h,  v5.8b,   #8
-        shll            v26.8h,  v5.8b,   #8
-        shll            v27.8h,  v5.8b,   #8
        uxtl            v6.8h,   v7.8b            // weights_hor
        uxtl2           v7.8h,   v7.16b
-        mla             v20.8h,  v3.8h,   v6.8h   // right*256  + (left-right)*weights_hor
-        mla             v21.8h,  v3.8h,   v7.8h   // (left flipped)
-        mla             v22.8h,  v2.8h,   v6.8h
-        mla             v23.8h,  v2.8h,   v7.8h
-        mla             v24.8h,  v1.8h,   v6.8h
-        mla             v25.8h,  v1.8h,   v7.8h
-        mla             v26.8h,  v0.8h,   v6.8h
-        mla             v27.8h,  v0.8h,   v7.8h
-        rshrn           v20.8b,  v20.8h,  #8
-        rshrn2          v20.16b, v21.8h,  #8
-        rshrn           v22.8b,  v22.8h,  #8
-        rshrn2          v22.16b, v23.8h,  #8
-        rshrn           v24.8b,  v24.8h,  #8
-        rshrn2          v24.16b, v25.8h,  #8
-        rshrn           v26.8b,  v26.8h,  #8
-        rshrn2          v26.16b, v27.8h,  #8
+        mul             v20.8h,  v6.8h,   v0.h[3] // right*256  + (left-right)*weights_hor
+        mul             v22.8h,  v6.8h,   v0.h[2]
+        mul             v21.8h,  v7.8h,   v0.h[3] // (left flipped)
+        mul             v23.8h,  v7.8h,   v0.h[2]
        subs            w3,  w3,  #16
+        addhn           v20.8b,  v20.8h,  v31.8h
+        addhn           v22.8b,  v22.8h,  v31.8h
+        addhn2          v20.16b, v21.8h,  v31.8h
+        addhn2          v22.16b, v23.8h,  v31.8h
+        mul             v24.8h,  v6.8h,   v0.h[1]
+        mul             v26.8h,  v6.8h,   v0.h[0]
+        mul             v25.8h,  v7.8h,   v0.h[1]
+        mul             v27.8h,  v7.8h,   v0.h[0]
+        addhn           v24.8b,  v24.8h,  v31.8h
+        addhn           v26.8b,  v26.8h,  v31.8h
        st1             {v20.16b}, [x0],  #16
+        addhn2          v24.16b, v25.8h,  v31.8h
        st1             {v22.16b}, [x6],  #16
+        addhn2          v26.16b, v27.8h,  v31.8h
        st1             {v24.16b}, [x5],  #16
        st1             {v26.16b}, [x10], #16
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
-        sub             x8,  x8,  w9, uxtw
+        sub             x8,  x8,  x9
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
@@ -1395,14 +1362,6 @@ function ipred_smooth_h_8bpc_neon, export=1
        ret
 endfunc

-jumptable ipred_smooth_h_tbl
-        .word 640b - ipred_smooth_h_tbl
-        .word 320b - ipred_smooth_h_tbl
-        .word 160b - ipred_smooth_h_tbl
-        .word 80b  - ipred_smooth_h_tbl
-        .word 40b  - ipred_smooth_h_tbl
-endjumptable
-
 const padding_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00