AArch64: Optimize ipred_smooth_h_8bpc_neon

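Optimize ipred_smooth_h_8bpc_neon using simpler arithmetic operations: the per-row shll + mla + rshrn sequence is replaced by mul + addhn against a precomputed "right*256 + 128" vector, the left pixels are loaded with a single ldr instead of ld4r, and the jump-table dispatch is replaced by a compare-and-branch on the width.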

Relative runtime after this patch on some Cortex CPUs (compared to the previous implementation; values below 1.0x are faster):

ipred_smooth_h:    w4      w8     w16     w32     w64
Cortex-A55:     1.015x  0.857x  0.819x  0.835x  0.862x
Cortex-A510:    0.988x  0.860x  0.915x  0.879x  0.837x
Cortex-A520:    0.999x  0.883x  0.967x  0.929x  0.873x
Cortex-A76:     0.804x  0.637x  0.517x  0.573x  0.613x
Cortex-A78:     0.800x  0.586x  0.548x  0.639x  0.640x
Cortex-A715:    0.722x  0.642x  0.563x  0.627x  0.646x
Cortex-A725:    0.710x  0.639x  0.567x  0.622x  0.645x
Cortex-X1:      0.758x  0.570x  0.565x  0.548x  0.557x
Cortex-X3:      0.789x  0.589x  0.528x  0.563x  0.571x
Cortex-X925:    0.855x  0.739x  0.541x  0.551x  0.567x
This commit is contained in:
Arpad Panyik
2026-05-06 20:18:03 +00:00
committed by Martin Storsjö
co-authored by Martin Storsjö
parent 037430193a
commit 4db1a05aad
+53 -94
View File
@@ -1261,130 +1261,97 @@ endjumptable
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// NOTE(review): this listing is a commit diff rendered without +/- markers.
// Instructions from the removed and the added implementation are interleaved
// (several appear twice), and lines are elided at the `@@` hunk header near
// the end, so it does not assemble as shown.
// Presumably the ld4r/shll/mla/rshrn sequences and the `br x5` jump-table
// dispatch are the removed variant, while the `ldr s0`/mul/addhn sequences
// using v31 and the cmp/b.gt/b.eq dispatch are the added one - confirm
// against the real diff.
//
// Register usage visible below:
//   x0, x6   output row pointers (x6 = x0 + x1); x1 is doubled so that each
//            pointer advances two rows per store (presumably dst/stride)
//   x2       edge pointer: `right` is read from x2 + w3, `left` is read
//            four bytes at a time going downwards from x2
//   w3, w4   horizontal / vertical counts (presumably width and height)
//   x8       pointer into sm_weights, offset by w3
//   v5       `right` broadcast to every byte lane
//   v31      right*256 + 128 in every 16-bit lane (mul/addhn variant only)
function ipred_smooth_h_8bpc_neon, export=1
// x12 = x2 + w3: address of the `right` pixel loaded by ld1r below.
add x12, x2, w3, uxtw
movrel x8, X(sm_weights)
add x8, x8, w3, uxtw
// Jump-table dispatch: table index = clz(w3) - 25.
clz w9, w3
movrel x5, ipred_smooth_h_tbl
add x12, x2, w3, uxtw
sub w9, w9, #25
// Load the signed 32-bit offset for that index; it is added to the table base.
ldrsw x9, [x5, w9, uxtw #2]
ld1r {v5.16b}, [x12] // right
add x5, x5, x9
ld1r {v5.16b}, [x12] // right
// Byte value 128: the rounding term that ends up in the low byte of each v31 lane.
movi v31.8b, #128
add x8, x8, w3, uxtw
// x6 = second output row; x1 becomes the step of two rows.
add x6, x0, x1
lsl x1, x1, #1
br x5
// Interleave the 128 bytes with `right`: every 16-bit lane has low byte 128
// and high byte right.
zip1 v31.16b, v31.16b, v5.16b // right*256 + rnd
// Compare-and-branch dispatch: w3 > 8 -> 160, w3 == 8 -> 80, otherwise fall
// through to 40.
cmp w3, #8
b.gt 160f
b.eq 80f
// Path for w3 == 4: four rows of four pixels per loop iteration.
40:
AARCH64_VALID_JUMP_TARGET
ld1r {v7.2s}, [x8] // weights_hor
// Pre-bias x2 and set a -4 post-increment for the ld4r loads of `left`.
sub x2, x2, #4
mov x7, #-4
ld1r {v7.2s}, [x8] // weights_hor
uxtl v7.8h, v7.8b // weights_hor
4:
// ld4r variant: broadcast four `left` bytes into v0..v3, then step x2 back by 4.
ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
shll v20.8h, v5.8b, #8 // right*256
shll v21.8h, v5.8b, #8
zip1 v1.2s, v1.2s, v0.2s // left, flipped
zip1 v0.2s, v3.2s, v2.2s
usubl v0.8h, v0.8b, v5.8b // left-right
usubl v1.8h, v1.8b, v5.8b
mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor
mla v21.8h, v1.8h, v7.8h
// Rounding shift right by 8 narrows back to bytes.
rshrn v20.8b, v20.8h, #8
rshrn v21.8b, v21.8h, #8
st1 {v20.s}[0], [x0], x1
st1 {v20.s}[1], [x6], x1
// ldr variant: step x2 back by 4 and load four `left` bytes in one go.
ldr s0, [x2, #-4]! // left
// Two self-interleaves repeat each loaded byte four times across v0.16b.
zip1 v0.8b, v0.8b, v0.8b
zip1 v0.16b, v0.16b, v0.16b // replicate left[1..4]
usubl v21.8h, v0.8b, v5.8b
usubl2 v20.8h, v0.16b, v5.16b // left-right
mul v21.8h, v21.8h, v7.8h
mul v20.8h, v20.8h, v7.8h // (left-right)*weights_hor
// Add right*256 + 128 and keep the high byte of every 16-bit lane.
addhn v21.8b, v21.8h, v31.8h
addhn v20.8b, v20.8h, v31.8h
subs w4, w4, #4
st1 {v21.s}[0], [x0], x1
st1 {v21.s}[1], [x6], x1
st1 {v20.s}[1], [x0], x1
st1 {v20.s}[0], [x6], x1
st1 {v21.s}[1], [x0], x1
st1 {v21.s}[0], [x6], x1
b.gt 4b
ret
// Path for w3 == 8: four rows of eight pixels per loop iteration.
80:
AARCH64_VALID_JUMP_TARGET
ld1 {v7.8b}, [x8] // weights_hor
sub x2, x2, #4
mov x7, #-4
ld1 {v7.8b}, [x8] // weights_hor
uxtl v7.8h, v7.8b // weights_hor
8:
ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
shll v20.8h, v5.8b, #8 // right*256
shll v21.8h, v5.8b, #8
shll v22.8h, v5.8b, #8
shll v23.8h, v5.8b, #8
usubl v3.8h, v3.8b, v5.8b // left-right
usubl v2.8h, v2.8b, v5.8b
usubl v1.8h, v1.8b, v5.8b
ldr s0, [x2, #-4]! // left
usubl v0.8h, v0.8b, v5.8b
mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor
mla v21.8h, v2.8h, v7.8h // (left flipped)
mla v22.8h, v1.8h, v7.8h
mla v23.8h, v0.8h, v7.8h
rshrn v20.8b, v20.8h, #8
rshrn v21.8b, v21.8h, #8
rshrn v22.8b, v22.8h, #8
rshrn v23.8b, v23.8h, #8
// By-element multiplies: lanes h[3]..h[0] of v0 hold left-right for the four
// rows, so each product is one row of (left-right)*weights_hor.
mul v20.8h, v7.8h, v0.h[3] // (left-right)*weights_hor
mul v21.8h, v7.8h, v0.h[2] // (left flipped)
mul v22.8h, v7.8h, v0.h[1]
mul v23.8h, v7.8h, v0.h[0]
// Add right*256 + 128 and keep the high byte of every 16-bit lane.
addhn v20.8b, v20.8h, v31.8h
addhn v21.8b, v21.8h, v31.8h
addhn v22.8b, v22.8h, v31.8h
addhn v23.8b, v23.8h, v31.8h
subs w4, w4, #4
st1 {v20.8b}, [x0], x1
st1 {v21.8b}, [x6], x1
subs w4, w4, #4
st1 {v22.8b}, [x0], x1
st1 {v23.8b}, [x6], x1
b.gt 8b
ret
// Shared path for w3 >= 16: four rows per outer iteration (label 1), sixteen
// pixels per row per inner iteration (label 2).
160:
320:
640:
AARCH64_VALID_JUMP_TARGET
sub x2, x2, #4
mov x7, #-4
// Set up pointers for four rows in parallel; x0, x6, x5, x10
add x5, x0, x1
add x10, x6, x1
// x1 is doubled again and reduced by w3, so adding it after a full row of
// post-incremented stores moves each pointer on to its next group of rows.
lsl x1, x1, #1
sub x1, x1, w3, uxtw
mov w9, w3
mov w9, w3 // x9 = uxtw(w3)
sub x1, x1, w3, uxtw
1:
ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
ldr s0, [x2, #-4]! // left
usubl v0.8h, v0.8b, v5.8b // left-right
usubl v1.8h, v1.8b, v5.8b
usubl v2.8h, v2.8b, v5.8b
usubl v3.8h, v3.8b, v5.8b
2:
// Fetch the next 16 horizontal weights and advance x8.
ld1 {v7.16b}, [x8], #16 // weights_hor
shll v20.8h, v5.8b, #8 // right*256
shll v21.8h, v5.8b, #8
shll v22.8h, v5.8b, #8
shll v23.8h, v5.8b, #8
shll v24.8h, v5.8b, #8
shll v25.8h, v5.8b, #8
shll v26.8h, v5.8b, #8
shll v27.8h, v5.8b, #8
// Widen the weights to 16 bits: v6 = low eight, v7 = high eight.
uxtl v6.8h, v7.8b // weights_hor
uxtl2 v7.8h, v7.16b
mla v20.8h, v3.8h, v6.8h // right*256 + (left-right)*weights_hor
mla v21.8h, v3.8h, v7.8h // (left flipped)
mla v22.8h, v2.8h, v6.8h
mla v23.8h, v2.8h, v7.8h
mla v24.8h, v1.8h, v6.8h
mla v25.8h, v1.8h, v7.8h
mla v26.8h, v0.8h, v6.8h
mla v27.8h, v0.8h, v7.8h
rshrn v20.8b, v20.8h, #8
rshrn2 v20.16b, v21.8h, #8
rshrn v22.8b, v22.8h, #8
rshrn2 v22.16b, v23.8h, #8
rshrn v24.8b, v24.8h, #8
rshrn2 v24.16b, v25.8h, #8
rshrn v26.8b, v26.8h, #8
rshrn2 v26.16b, v27.8h, #8
// By-element variant: v0.h[3]..h[0] select left-right for the rows written
// through x0, x6, x5 and x10 respectively.
mul v20.8h, v6.8h, v0.h[3] // (left-right)*weights_hor
mul v22.8h, v6.8h, v0.h[2]
mul v21.8h, v7.8h, v0.h[3] // (left flipped)
mul v23.8h, v7.8h, v0.h[2]
subs w3, w3, #16
// Add right*256 + 128 and keep the high byte; the addhn2 forms fill the
// upper eight bytes of the same destination register.
addhn v20.8b, v20.8h, v31.8h
addhn v22.8b, v22.8h, v31.8h
addhn2 v20.16b, v21.8h, v31.8h
addhn2 v22.16b, v23.8h, v31.8h
mul v24.8h, v6.8h, v0.h[1]
mul v26.8h, v6.8h, v0.h[0]
mul v25.8h, v7.8h, v0.h[1]
mul v27.8h, v7.8h, v0.h[0]
addhn v24.8b, v24.8h, v31.8h
addhn v26.8b, v26.8h, v31.8h
st1 {v20.16b}, [x0], #16
addhn2 v24.16b, v25.8h, v31.8h
st1 {v22.16b}, [x6], #16
addhn2 v26.16b, v27.8h, v31.8h
st1 {v24.16b}, [x5], #16
st1 {v26.16b}, [x10], #16
// The flags tested here come from `subs w3, w3, #16` above; the vector and
// store instructions in between do not modify them.
b.gt 2b
subs w4, w4, #4
b.le 9f
// Rewind the weights pointer by the saved count (w9 holds the initial w3).
sub x8, x8, w9, uxtw
sub x8, x8, x9
add x0, x0, x1
add x6, x6, x1
add x5, x5, x1
// NOTE(review): diff hunk boundary - the instructions between this point and
// the final `ret` (including the `9:` label targeted above) are not shown.
@@ -1395,14 +1362,6 @@ function ipred_smooth_h_8bpc_neon, export=1
ret
endfunc
// Dispatch table for the `br x5` path of ipred_smooth_h_8bpc_neon.
// Each entry is the offset of a numeric label from the table base; the
// function loads entry number clz(w3) - 25 with ldrsw and adds it to the
// table address. With that indexing, w3 == 64 selects the first entry (640)
// and w3 == 4 the last (40).
// NOTE(review): the cmp/b.gt/b.eq dispatch in the function does not use this
// table; presumably the table is among the lines removed by the commit.
jumptable ipred_smooth_h_tbl
.word 640b - ipred_smooth_h_tbl
.word 320b - ipred_smooth_h_tbl
.word 160b - ipred_smooth_h_tbl
.word 80b - ipred_smooth_h_tbl
.word 40b - ipred_smooth_h_tbl
endjumptable
const padding_mask_buf
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00