AArch64: Optimize ipred_smooth_v_8bpc_neon

Optimize ipred_smooth_v_8bpc_neon by using simpler arithmetic operations
and removing the jump table.

Relative runtime after this patch on some Cortex CPUs (lower is faster):

ipred_smooth_v:    w4      w8     w16     w32     w64
Cortex-A55:     1.025x  0.847x  0.821x  0.830x  0.852x
Cortex-A510:    1.017x  0.923x  0.915x  0.883x  0.840x
Cortex-A520:    1.080x  0.972x  0.999x  0.934x  0.876x
Cortex-A76:     0.818x  0.575x  0.599x  0.723x  0.744x
Cortex-A78:     0.782x  0.571x  0.595x  0.641x  0.685x
Cortex-A715:    0.801x  0.586x  0.593x  0.651x  0.694x
Cortex-A725:    0.801x  0.579x  0.596x  0.649x  0.692x
Cortex-X1:      0.782x  0.560x  0.553x  0.623x  0.682x
Cortex-X3:      0.792x  0.594x  0.526x  0.526x  0.604x
Cortex-X925:    0.757x  0.678x  0.525x  0.554x  0.577x
This commit is contained in:
Arpad Panyik
2026-05-06 20:18:03 +00:00
committed by Martin Storsjö
co-authored by Martin Storsjö
parent 4db1a05aad
commit 51b67010e2
+50 -85
View File
@@ -1119,125 +1119,98 @@ endjumptable
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_smooth_v_8bpc_neon, export=1
sub x8, x2, w4, uxtw
movrel x7, X(sm_weights)
add x7, x7, w4, uxtw
clz w9, w3
movrel x5, ipred_smooth_v_tbl
sub x8, x2, w4, uxtw
sub w9, w9, #25
ldrsw x9, [x5, w9, uxtw #2]
ld1r {v4.16b}, [x8] // bottom
ld1r {v4.16b}, [x8] // bottom
movi v31.8b, #128
add x2, x2, #1
add x5, x5, x9
add x7, x7, w4, uxtw
add x6, x0, x1
lsl x1, x1, #1
br x5
zip1 v31.16b, v31.16b, v4.16b // bottom*256 + rnd
cmp w3, #8
b.gt 160f
b.eq 80f
40:
AARCH64_VALID_JUMP_TARGET
ld1r {v6.2s}, [x2] // top
usubl v6.8h, v6.8b, v4.8b // top-bottom
4:
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
shll v22.8h, v4.8b, #8 // bottom*256
shll v23.8h, v4.8b, #8
zip1 v16.2s, v16.2s, v17.2s // weights_ver
zip1 v18.2s, v18.2s, v19.2s
uxtl v16.8h, v16.8b // weights_ver
uxtl v18.8h, v18.8b
mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
mla v23.8h, v6.8h, v18.8h
rshrn v22.8b, v22.8h, #8
rshrn v23.8b, v23.8h, #8
ldr s7, [x7], #4 // weights_ver
uxtl v7.8h, v7.8b // weights_ver
zip1 v7.8h, v7.8h, v7.8h
zip1 v16.8h, v7.8h, v7.8h
zip2 v18.8h, v7.8h, v7.8h // splat weights_ver
mul v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
mul v23.8h, v6.8h, v18.8h
addhn v22.8b, v22.8h, v31.8h
addhn v23.8b, v23.8h, v31.8h
subs w4, w4, #4
st1 {v22.s}[0], [x0], x1
st1 {v22.s}[1], [x6], x1
subs w4, w4, #4
st1 {v23.s}[0], [x0], x1
st1 {v23.s}[1], [x6], x1
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
ld1 {v6.8b}, [x2] // top
ld1 {v6.8b}, [x2] // top
usubl v6.8h, v6.8b, v4.8b // top-bottom
8:
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
shll v24.8h, v4.8b, #8 // bottom*256
shll v25.8h, v4.8b, #8
shll v26.8h, v4.8b, #8
shll v27.8h, v4.8b, #8
uxtl v16.8h, v16.8b // weights_ver
uxtl v17.8h, v17.8b
uxtl v18.8h, v18.8b
uxtl v19.8h, v19.8b
mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
mla v25.8h, v6.8h, v17.8h
mla v26.8h, v6.8h, v18.8h
mla v27.8h, v6.8h, v19.8h
rshrn v24.8b, v24.8h, #8
rshrn v25.8b, v25.8h, #8
rshrn v26.8b, v26.8h, #8
rshrn v27.8b, v27.8h, #8
ldr s7, [x7], #4 // weights_ver
uxtl v7.8h, v7.8b // weights_ver
mul v24.8h, v6.8h, v7.h[0] // bottom*256 + (top-bottom)*weights_ver
mul v25.8h, v6.8h, v7.h[1]
mul v26.8h, v6.8h, v7.h[2]
mul v27.8h, v6.8h, v7.h[3]
addhn v24.8b, v24.8h, v31.8h
addhn v25.8b, v25.8h, v31.8h
addhn v26.8b, v26.8h, v31.8h
addhn v27.8b, v27.8h, v31.8h
subs w4, w4, #4
st1 {v24.8b}, [x0], x1
st1 {v25.8b}, [x6], x1
subs w4, w4, #4
st1 {v26.8b}, [x0], x1
st1 {v27.8b}, [x6], x1
b.gt 8b
ret
160:
320:
640:
AARCH64_VALID_JUMP_TARGET
// Set up pointers for four rows in parallel; x0, x6, x5, x8
add x5, x0, x1
add x8, x6, x1
lsl x1, x1, #1
mov w9, w3 // x9 = uxtw(w3)
sub x1, x1, w3, uxtw
mov w9, w3
1:
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
uxtl v16.8h, v16.8b // weights_ver
uxtl v17.8h, v17.8b
uxtl v18.8h, v18.8b
uxtl v19.8h, v19.8b
ldr s7, [x7], #4 // weights_ver
uxtl v7.8h, v7.8b // weights_ver
2:
ld1 {v3.16b}, [x2], #16 // top
shll v20.8h, v4.8b, #8 // bottom*256
shll v21.8h, v4.8b, #8
shll v22.8h, v4.8b, #8
shll v23.8h, v4.8b, #8
shll v24.8h, v4.8b, #8
shll v25.8h, v4.8b, #8
shll v26.8h, v4.8b, #8
shll v27.8h, v4.8b, #8
usubl v2.8h, v3.8b, v4.8b // top-bottom
usubl2 v3.8h, v3.16b, v4.16b
mla v20.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
mla v21.8h, v3.8h, v16.8h
mla v22.8h, v2.8h, v17.8h
mla v23.8h, v3.8h, v17.8h
mla v24.8h, v2.8h, v18.8h
mla v25.8h, v3.8h, v18.8h
mla v26.8h, v2.8h, v19.8h
mla v27.8h, v3.8h, v19.8h
rshrn v20.8b, v20.8h, #8
rshrn2 v20.16b, v21.8h, #8
rshrn v22.8b, v22.8h, #8
rshrn2 v22.16b, v23.8h, #8
rshrn v24.8b, v24.8h, #8
rshrn2 v24.16b, v25.8h, #8
rshrn v26.8b, v26.8h, #8
rshrn2 v26.16b, v27.8h, #8
mul v20.8h, v2.8h, v7.h[0] // bottom*256 + (top-bottom)*weights_ver
mul v22.8h, v2.8h, v7.h[1]
mul v21.8h, v3.8h, v7.h[0]
mul v23.8h, v3.8h, v7.h[1]
subs w3, w3, #16
addhn v20.8b, v20.8h, v31.8h
addhn v22.8b, v22.8h, v31.8h
addhn2 v20.16b, v21.8h, v31.8h
addhn2 v22.16b, v23.8h, v31.8h
mul v24.8h, v2.8h, v7.h[2]
mul v26.8h, v2.8h, v7.h[3]
mul v25.8h, v3.8h, v7.h[2]
mul v27.8h, v3.8h, v7.h[3]
addhn v24.8b, v24.8h, v31.8h
addhn v26.8b, v26.8h, v31.8h
st1 {v20.16b}, [x0], #16
addhn2 v24.16b, v25.8h, v31.8h
st1 {v22.16b}, [x6], #16
addhn2 v26.16b, v27.8h, v31.8h
st1 {v24.16b}, [x5], #16
st1 {v26.16b}, [x8], #16
b.gt 2b
subs w4, w4, #4
b.le 9f
sub x2, x2, w9, uxtw
sub x2, x2, x9
add x0, x0, x1
add x6, x6, x1
add x5, x5, x1
@@ -1248,14 +1221,6 @@ function ipred_smooth_v_8bpc_neon, export=1
ret
endfunc
// Dispatch table used by ipred_smooth_v_8bpc_neon. Each .word holds the
// offset of one entry label (640, 320, 160, 80, 40) from the start of the
// table; the function loads the entry with ldrsw, adds it to the table
// address and branches there with br.
// The index is clz(w3) - 25, so w3 = 64 selects the first entry and
// w3 = 4 selects the last one.
jumptable ipred_smooth_v_tbl
.word 640b - ipred_smooth_v_tbl
.word 320b - ipred_smooth_v_tbl
.word 160b - ipred_smooth_v_tbl
.word 80b - ipred_smooth_v_tbl
.word 40b - ipred_smooth_v_tbl
endjumptable
// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,