AArch64: Optimize ipred_smooth_v_8bpc_neon

Optimize ipred_smooth_v_8bpc_neon by using simpler arithmetic operations and removing the jump table.

Relative runtime after this patch on some Cortex CPUs:

ipred_smooth_v:      w4      w8     w16     w32     w64
Cortex-A55:      1.025x  0.847x  0.821x  0.830x  0.852x
Cortex-A510:     1.017x  0.923x  0.915x  0.883x  0.840x
Cortex-A520:     1.080x  0.972x  0.999x  0.934x  0.876x
Cortex-A76:      0.818x  0.575x  0.599x  0.723x  0.744x
Cortex-A78:      0.782x  0.571x  0.595x  0.641x  0.685x
Cortex-A715:     0.801x  0.586x  0.593x  0.651x  0.694x
Cortex-A725:     0.801x  0.579x  0.596x  0.649x  0.692x
Cortex-X1:       0.782x  0.560x  0.553x  0.623x  0.682x
Cortex-X3:       0.792x  0.594x  0.526x  0.526x  0.604x
Cortex-X925:     0.757x  0.678x  0.525x  0.554x  0.577x
2026-06-11 04:03:05 +00:00 · 2026-05-06 20:18:03 +00:00
parent 4db1a05aad
commit 51b67010e2
1 changed files with 50 additions and 85 deletions
@@ -1119,125 +1119,98 @@ endjumptable
 //                               const int width, const int height, const int a,
 //                               const int max_width, const int max_height);
 function ipred_smooth_v_8bpc_neon, export=1
+        sub             x8,  x2,  w4,  uxtw
        movrel          x7,  X(sm_weights)
-        add             x7,  x7,  w4, uxtw
-        clz             w9,  w3
-        movrel          x5,  ipred_smooth_v_tbl
-        sub             x8,  x2,  w4, uxtw
-        sub             w9,  w9,  #25
-        ldrsw           x9,  [x5, w9, uxtw #2]
-        ld1r            {v4.16b},  [x8] // bottom
+        ld1r            {v4.16b},  [x8]           // bottom
+        movi            v31.8b,  #128
        add             x2,  x2,  #1
-        add             x5,  x5,  x9
+        add             x7,  x7,  w4,  uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
-        br              x5
+        zip1            v31.16b, v31.16b, v4.16b  // bottom*256 + rnd
+        cmp             w3,  #8
+        b.gt            160f
+        b.eq            80f
 40:
-        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2s}, [x2]             // top
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
 4:
-        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
-        shll            v22.8h,  v4.8b,   #8      // bottom*256
-        shll            v23.8h,  v4.8b,   #8
-        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
-        zip1            v18.2s,  v18.2s,  v19.2s
-        uxtl            v16.8h,  v16.8b           // weights_ver
-        uxtl            v18.8h,  v18.8b
-        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
-        mla             v23.8h,  v6.8h,   v18.8h
-        rshrn           v22.8b,  v22.8h,  #8
-        rshrn           v23.8b,  v23.8h,  #8
+        ldr             s7,  [x7],  #4            // weights_ver
+        uxtl            v7.8h,   v7.8b            // weights_ver
+        zip1            v7.8h,   v7.8h,   v7.8h
+        zip1            v16.8h,  v7.8h,   v7.8h
+        zip2            v18.8h,  v7.8h,   v7.8h   // splat weights_ver
+        mul             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
+        mul             v23.8h,  v6.8h,   v18.8h
+        addhn           v22.8b,  v22.8h,  v31.8h
+        addhn           v23.8b,  v23.8h,  v31.8h
+        subs            w4,  w4,  #4
        st1             {v22.s}[0], [x0], x1
        st1             {v22.s}[1], [x6], x1
-        subs            w4,  w4,  #4
        st1             {v23.s}[0], [x0], x1
        st1             {v23.s}[1], [x6], x1
        b.gt            4b
        ret
 80:
-        AARCH64_VALID_JUMP_TARGET
-        ld1             {v6.8b}, [x2]             // top
+        ld1             {v6.8b},  [x2]            // top
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
 8:
-        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
-        shll            v24.8h,  v4.8b,   #8      // bottom*256
-        shll            v25.8h,  v4.8b,   #8
-        shll            v26.8h,  v4.8b,   #8
-        shll            v27.8h,  v4.8b,   #8
-        uxtl            v16.8h,  v16.8b           // weights_ver
-        uxtl            v17.8h,  v17.8b
-        uxtl            v18.8h,  v18.8b
-        uxtl            v19.8h,  v19.8b
-        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
-        mla             v25.8h,  v6.8h,   v17.8h
-        mla             v26.8h,  v6.8h,   v18.8h
-        mla             v27.8h,  v6.8h,   v19.8h
-        rshrn           v24.8b,  v24.8h,  #8
-        rshrn           v25.8b,  v25.8h,  #8
-        rshrn           v26.8b,  v26.8h,  #8
-        rshrn           v27.8b,  v27.8h,  #8
+        ldr             s7,  [x7],  #4            // weights_ver
+        uxtl            v7.8h,   v7.8b            // weights_ver
+        mul             v24.8h,  v6.8h,   v7.h[0] // bottom*256 + (top-bottom)*weights_ver
+        mul             v25.8h,  v6.8h,   v7.h[1]
+        mul             v26.8h,  v6.8h,   v7.h[2]
+        mul             v27.8h,  v6.8h,   v7.h[3]
+        addhn           v24.8b,  v24.8h,  v31.8h
+        addhn           v25.8b,  v25.8h,  v31.8h
+        addhn           v26.8b,  v26.8h,  v31.8h
+        addhn           v27.8b,  v27.8h,  v31.8h
+        subs            w4,  w4,  #4
        st1             {v24.8b}, [x0], x1
        st1             {v25.8b}, [x6], x1
-        subs            w4,  w4,  #4
        st1             {v26.8b}, [x0], x1
        st1             {v27.8b}, [x6], x1
        b.gt            8b
        ret
 160:
-320:
-640:
-        AARCH64_VALID_JUMP_TARGET
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        add             x5,  x0,  x1
        add             x8,  x6,  x1
        lsl             x1,  x1,  #1
+        mov             w9,  w3                   // x9 = uxtw(w3)
        sub             x1,  x1,  w3, uxtw
-        mov             w9,  w3
-
 1:
-        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
-        uxtl            v16.8h,  v16.8b           // weights_ver
-        uxtl            v17.8h,  v17.8b
-        uxtl            v18.8h,  v18.8b
-        uxtl            v19.8h,  v19.8b
+        ldr             s7,  [x7],  #4            // weights_ver
+        uxtl            v7.8h,   v7.8b            // weights_ver
 2:
        ld1             {v3.16b}, [x2],   #16     // top
-        shll            v20.8h,  v4.8b,   #8      // bottom*256
-        shll            v21.8h,  v4.8b,   #8
-        shll            v22.8h,  v4.8b,   #8
-        shll            v23.8h,  v4.8b,   #8
-        shll            v24.8h,  v4.8b,   #8
-        shll            v25.8h,  v4.8b,   #8
-        shll            v26.8h,  v4.8b,   #8
-        shll            v27.8h,  v4.8b,   #8
        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
        usubl2          v3.8h,   v3.16b,  v4.16b
-        mla             v20.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
-        mla             v21.8h,  v3.8h,   v16.8h
-        mla             v22.8h,  v2.8h,   v17.8h
-        mla             v23.8h,  v3.8h,   v17.8h
-        mla             v24.8h,  v2.8h,   v18.8h
-        mla             v25.8h,  v3.8h,   v18.8h
-        mla             v26.8h,  v2.8h,   v19.8h
-        mla             v27.8h,  v3.8h,   v19.8h
-        rshrn           v20.8b,  v20.8h,  #8
-        rshrn2          v20.16b, v21.8h,  #8
-        rshrn           v22.8b,  v22.8h,  #8
-        rshrn2          v22.16b, v23.8h,  #8
-        rshrn           v24.8b,  v24.8h,  #8
-        rshrn2          v24.16b, v25.8h,  #8
-        rshrn           v26.8b,  v26.8h,  #8
-        rshrn2          v26.16b, v27.8h,  #8
+        mul             v20.8h,  v2.8h,   v7.h[0] // bottom*256 + (top-bottom)*weights_ver
+        mul             v22.8h,  v2.8h,   v7.h[1]
+        mul             v21.8h,  v3.8h,   v7.h[0]
+        mul             v23.8h,  v3.8h,   v7.h[1]
        subs            w3,  w3,  #16
+        addhn           v20.8b,  v20.8h,  v31.8h
+        addhn           v22.8b,  v22.8h,  v31.8h
+        addhn2          v20.16b, v21.8h,  v31.8h
+        addhn2          v22.16b, v23.8h,  v31.8h
+        mul             v24.8h,  v2.8h,   v7.h[2]
+        mul             v26.8h,  v2.8h,   v7.h[3]
+        mul             v25.8h,  v3.8h,   v7.h[2]
+        mul             v27.8h,  v3.8h,   v7.h[3]
+        addhn           v24.8b,  v24.8h,  v31.8h
+        addhn           v26.8b,  v26.8h,  v31.8h
        st1             {v20.16b}, [x0],  #16
+        addhn2          v24.16b, v25.8h,  v31.8h
        st1             {v22.16b}, [x6],  #16
+        addhn2          v26.16b, v27.8h,  v31.8h
        st1             {v24.16b}, [x5],  #16
        st1             {v26.16b}, [x8],  #16
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
-        sub             x2,  x2,  w9, uxtw
+        sub             x2,  x2,  x9
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
@@ -1248,14 +1221,6 @@ function ipred_smooth_v_8bpc_neon, export=1
        ret
 endfunc

-jumptable ipred_smooth_v_tbl
-        .word 640b - ipred_smooth_v_tbl
-        .word 320b - ipred_smooth_v_tbl
-        .word 160b - ipred_smooth_v_tbl
-        .word 80b  - ipred_smooth_v_tbl
-        .word 40b  - ipred_smooth_v_tbl
-endjumptable
-
 // void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                               const pixel *const topleft,
 //                               const int width, const int height, const int a,