mirror of
https://code.videolan.org/videolan/dav1d
synced 2026-06-11 04:03:05 +00:00
AArch64: Optimize ipred_smooth_h_8bpc_neon
Optimize ipred_smooth_h_8bpc_neon using simpler arithmetic operations.

Relative runtime after this patch on some Cortex CPUs:

ipred_smooth_h:      w4      w8     w16     w32     w64
Cortex-A55:      1.015x  0.857x  0.819x  0.835x  0.862x
Cortex-A510:     0.988x  0.860x  0.915x  0.879x  0.837x
Cortex-A520:     0.999x  0.883x  0.967x  0.929x  0.873x
Cortex-A76:      0.804x  0.637x  0.517x  0.573x  0.613x
Cortex-A78:      0.800x  0.586x  0.548x  0.639x  0.640x
Cortex-A715:     0.722x  0.642x  0.563x  0.627x  0.646x
Cortex-A725:     0.710x  0.639x  0.567x  0.622x  0.645x
Cortex-X1:       0.758x  0.570x  0.565x  0.548x  0.557x
Cortex-X3:       0.789x  0.589x  0.528x  0.563x  0.571x
Cortex-X925:     0.855x  0.739x  0.541x  0.551x  0.567x
This commit is contained in:
committed by
Martin Storsjö
co-authored by
Martin Storsjö
parent
037430193a
commit
4db1a05aad
+53
-94
@@ -1261,130 +1261,97 @@ endjumptable
|
||||
// const int width, const int height, const int a,
|
||||
// const int max_width, const int max_height);
|
||||
function ipred_smooth_h_8bpc_neon, export=1
|
||||
add x12, x2, w3, uxtw
|
||||
movrel x8, X(sm_weights)
|
||||
add x8, x8, w3, uxtw
|
||||
clz w9, w3
|
||||
movrel x5, ipred_smooth_h_tbl
|
||||
add x12, x2, w3, uxtw
|
||||
sub w9, w9, #25
|
||||
ldrsw x9, [x5, w9, uxtw #2]
|
||||
ld1r {v5.16b}, [x12] // right
|
||||
add x5, x5, x9
|
||||
ld1r {v5.16b}, [x12] // right
|
||||
movi v31.8b, #128
|
||||
add x8, x8, w3, uxtw
|
||||
add x6, x0, x1
|
||||
lsl x1, x1, #1
|
||||
br x5
|
||||
zip1 v31.16b, v31.16b, v5.16b // right*256 + rnd
|
||||
cmp w3, #8
|
||||
b.gt 160f
|
||||
b.eq 80f
|
||||
40:
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
ld1r {v7.2s}, [x8] // weights_hor
|
||||
sub x2, x2, #4
|
||||
mov x7, #-4
|
||||
ld1r {v7.2s}, [x8] // weights_hor
|
||||
uxtl v7.8h, v7.8b // weights_hor
|
||||
4:
|
||||
ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
|
||||
shll v20.8h, v5.8b, #8 // right*256
|
||||
shll v21.8h, v5.8b, #8
|
||||
zip1 v1.2s, v1.2s, v0.2s // left, flipped
|
||||
zip1 v0.2s, v3.2s, v2.2s
|
||||
usubl v0.8h, v0.8b, v5.8b // left-right
|
||||
usubl v1.8h, v1.8b, v5.8b
|
||||
mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor
|
||||
mla v21.8h, v1.8h, v7.8h
|
||||
rshrn v20.8b, v20.8h, #8
|
||||
rshrn v21.8b, v21.8h, #8
|
||||
st1 {v20.s}[0], [x0], x1
|
||||
st1 {v20.s}[1], [x6], x1
|
||||
ldr s0, [x2, #-4]! // left
|
||||
zip1 v0.8b, v0.8b, v0.8b
|
||||
zip1 v0.16b, v0.16b, v0.16b // replicate left[1..4]
|
||||
usubl v21.8h, v0.8b, v5.8b
|
||||
usubl2 v20.8h, v0.16b, v5.16b // left-right
|
||||
mul v21.8h, v21.8h, v7.8h
|
||||
mul v20.8h, v20.8h, v7.8h // right*256 + (left-right)*weights_hor
|
||||
addhn v21.8b, v21.8h, v31.8h
|
||||
addhn v20.8b, v20.8h, v31.8h
|
||||
subs w4, w4, #4
|
||||
st1 {v21.s}[0], [x0], x1
|
||||
st1 {v21.s}[1], [x6], x1
|
||||
st1 {v20.s}[1], [x0], x1
|
||||
st1 {v20.s}[0], [x6], x1
|
||||
st1 {v21.s}[1], [x0], x1
|
||||
st1 {v21.s}[0], [x6], x1
|
||||
b.gt 4b
|
||||
ret
|
||||
80:
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
ld1 {v7.8b}, [x8] // weights_hor
|
||||
sub x2, x2, #4
|
||||
mov x7, #-4
|
||||
ld1 {v7.8b}, [x8] // weights_hor
|
||||
uxtl v7.8h, v7.8b // weights_hor
|
||||
8:
|
||||
ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
|
||||
shll v20.8h, v5.8b, #8 // right*256
|
||||
shll v21.8h, v5.8b, #8
|
||||
shll v22.8h, v5.8b, #8
|
||||
shll v23.8h, v5.8b, #8
|
||||
usubl v3.8h, v3.8b, v5.8b // left-right
|
||||
usubl v2.8h, v2.8b, v5.8b
|
||||
usubl v1.8h, v1.8b, v5.8b
|
||||
ldr s0, [x2, #-4]! // left
|
||||
usubl v0.8h, v0.8b, v5.8b
|
||||
mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor
|
||||
mla v21.8h, v2.8h, v7.8h // (left flipped)
|
||||
mla v22.8h, v1.8h, v7.8h
|
||||
mla v23.8h, v0.8h, v7.8h
|
||||
rshrn v20.8b, v20.8h, #8
|
||||
rshrn v21.8b, v21.8h, #8
|
||||
rshrn v22.8b, v22.8h, #8
|
||||
rshrn v23.8b, v23.8h, #8
|
||||
mul v20.8h, v7.8h, v0.h[3] // right*256 + (left-right)*weights_hor
|
||||
mul v21.8h, v7.8h, v0.h[2] // (left flipped)
|
||||
mul v22.8h, v7.8h, v0.h[1]
|
||||
mul v23.8h, v7.8h, v0.h[0]
|
||||
addhn v20.8b, v20.8h, v31.8h
|
||||
addhn v21.8b, v21.8h, v31.8h
|
||||
addhn v22.8b, v22.8h, v31.8h
|
||||
addhn v23.8b, v23.8h, v31.8h
|
||||
subs w4, w4, #4
|
||||
st1 {v20.8b}, [x0], x1
|
||||
st1 {v21.8b}, [x6], x1
|
||||
subs w4, w4, #4
|
||||
st1 {v22.8b}, [x0], x1
|
||||
st1 {v23.8b}, [x6], x1
|
||||
b.gt 8b
|
||||
ret
|
||||
160:
|
||||
320:
|
||||
640:
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
sub x2, x2, #4
|
||||
mov x7, #-4
|
||||
// Set up pointers for four rows in parallel; x0, x6, x5, x10
|
||||
add x5, x0, x1
|
||||
add x10, x6, x1
|
||||
lsl x1, x1, #1
|
||||
sub x1, x1, w3, uxtw
|
||||
mov w9, w3
|
||||
|
||||
mov w9, w3 // x9 = uxtw(w3)
|
||||
sub x1, x1, w3, uxtw
|
||||
1:
|
||||
ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
|
||||
ldr s0, [x2, #-4]! // left
|
||||
usubl v0.8h, v0.8b, v5.8b // left-right
|
||||
usubl v1.8h, v1.8b, v5.8b
|
||||
usubl v2.8h, v2.8b, v5.8b
|
||||
usubl v3.8h, v3.8b, v5.8b
|
||||
2:
|
||||
ld1 {v7.16b}, [x8], #16 // weights_hor
|
||||
shll v20.8h, v5.8b, #8 // right*256
|
||||
shll v21.8h, v5.8b, #8
|
||||
shll v22.8h, v5.8b, #8
|
||||
shll v23.8h, v5.8b, #8
|
||||
shll v24.8h, v5.8b, #8
|
||||
shll v25.8h, v5.8b, #8
|
||||
shll v26.8h, v5.8b, #8
|
||||
shll v27.8h, v5.8b, #8
|
||||
uxtl v6.8h, v7.8b // weights_hor
|
||||
uxtl2 v7.8h, v7.16b
|
||||
mla v20.8h, v3.8h, v6.8h // right*256 + (left-right)*weights_hor
|
||||
mla v21.8h, v3.8h, v7.8h // (left flipped)
|
||||
mla v22.8h, v2.8h, v6.8h
|
||||
mla v23.8h, v2.8h, v7.8h
|
||||
mla v24.8h, v1.8h, v6.8h
|
||||
mla v25.8h, v1.8h, v7.8h
|
||||
mla v26.8h, v0.8h, v6.8h
|
||||
mla v27.8h, v0.8h, v7.8h
|
||||
rshrn v20.8b, v20.8h, #8
|
||||
rshrn2 v20.16b, v21.8h, #8
|
||||
rshrn v22.8b, v22.8h, #8
|
||||
rshrn2 v22.16b, v23.8h, #8
|
||||
rshrn v24.8b, v24.8h, #8
|
||||
rshrn2 v24.16b, v25.8h, #8
|
||||
rshrn v26.8b, v26.8h, #8
|
||||
rshrn2 v26.16b, v27.8h, #8
|
||||
mul v20.8h, v6.8h, v0.h[3] // right*256 + (left-right)*weights_hor
|
||||
mul v22.8h, v6.8h, v0.h[2]
|
||||
mul v21.8h, v7.8h, v0.h[3] // (left flipped)
|
||||
mul v23.8h, v7.8h, v0.h[2]
|
||||
subs w3, w3, #16
|
||||
addhn v20.8b, v20.8h, v31.8h
|
||||
addhn v22.8b, v22.8h, v31.8h
|
||||
addhn2 v20.16b, v21.8h, v31.8h
|
||||
addhn2 v22.16b, v23.8h, v31.8h
|
||||
mul v24.8h, v6.8h, v0.h[1]
|
||||
mul v26.8h, v6.8h, v0.h[0]
|
||||
mul v25.8h, v7.8h, v0.h[1]
|
||||
mul v27.8h, v7.8h, v0.h[0]
|
||||
addhn v24.8b, v24.8h, v31.8h
|
||||
addhn v26.8b, v26.8h, v31.8h
|
||||
st1 {v20.16b}, [x0], #16
|
||||
addhn2 v24.16b, v25.8h, v31.8h
|
||||
st1 {v22.16b}, [x6], #16
|
||||
addhn2 v26.16b, v27.8h, v31.8h
|
||||
st1 {v24.16b}, [x5], #16
|
||||
st1 {v26.16b}, [x10], #16
|
||||
b.gt 2b
|
||||
subs w4, w4, #4
|
||||
b.le 9f
|
||||
sub x8, x8, w9, uxtw
|
||||
sub x8, x8, x9
|
||||
add x0, x0, x1
|
||||
add x6, x6, x1
|
||||
add x5, x5, x1
|
||||
@@ -1395,14 +1362,6 @@ function ipred_smooth_h_8bpc_neon, export=1
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// Dispatch table for ipred_smooth_h_8bpc_neon: each entry is the offset of an
// entry label relative to the start of this table. The function computes the
// index as clz(w3) - 25, loads the entry with ldrsw, adds it to the table
// address and branches there with br. w3 is presumably the block width
// (TODO confirm against the C prototype). The 640/320/160 labels sit
// back-to-back in the function, so those three entries reach the same code.
jumptable ipred_smooth_h_tbl
|
||||
.word 640b - ipred_smooth_h_tbl  // index 0: w3 == 64 (clz = 25)
|
||||
.word 320b - ipred_smooth_h_tbl  // index 1: w3 == 32
|
||||
.word 160b - ipred_smooth_h_tbl  // index 2: w3 == 16
|
||||
.word 80b - ipred_smooth_h_tbl  // index 3: w3 == 8
|
||||
.word 40b - ipred_smooth_h_tbl  // index 4: w3 == 4 (clz = 29)
|
||||
endjumptable
|
||||
|
||||
const padding_mask_buf
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
|
||||
Reference in New Issue
Block a user