mirror of
https://code.videolan.org/videolan/dav1d
synced 2026-06-11 04:03:05 +00:00
AArch64: Optimize ipred_smooth_v_8bpc_neon
Optimize ipred_smooth_v_8bpc_neon by using simpler arithmetic operations and removing the jump table.

Relative runtime after this patch on some Cortex CPUs (values below 1.000x mean the patched code is faster):

ipred_smooth_v:    w4      w8      w16     w32     w64
Cortex-A55:      1.025x  0.847x  0.821x  0.830x  0.852x
Cortex-A510:     1.017x  0.923x  0.915x  0.883x  0.840x
Cortex-A520:     1.080x  0.972x  0.999x  0.934x  0.876x
Cortex-A76:      0.818x  0.575x  0.599x  0.723x  0.744x
Cortex-A78:      0.782x  0.571x  0.595x  0.641x  0.685x
Cortex-A715:     0.801x  0.586x  0.593x  0.651x  0.694x
Cortex-A725:     0.801x  0.579x  0.596x  0.649x  0.692x
Cortex-X1:       0.782x  0.560x  0.553x  0.623x  0.682x
Cortex-X3:       0.792x  0.594x  0.526x  0.526x  0.604x
Cortex-X925:     0.757x  0.678x  0.525x  0.554x  0.577x
This commit is contained in:
committed by
Martin Storsjö
co-authored by
Martin Storsjö
parent
4db1a05aad
commit
51b67010e2
+50
-85
@@ -1119,125 +1119,98 @@ endjumptable
|
||||
// const int width, const int height, const int a,
|
||||
// const int max_width, const int max_height);
|
||||
function ipred_smooth_v_8bpc_neon, export=1
|
||||
sub x8, x2, w4, uxtw
|
||||
movrel x7, X(sm_weights)
|
||||
add x7, x7, w4, uxtw
|
||||
clz w9, w3
|
||||
movrel x5, ipred_smooth_v_tbl
|
||||
sub x8, x2, w4, uxtw
|
||||
sub w9, w9, #25
|
||||
ldrsw x9, [x5, w9, uxtw #2]
|
||||
ld1r {v4.16b}, [x8] // bottom
|
||||
ld1r {v4.16b}, [x8] // bottom
|
||||
movi v31.8b, #128
|
||||
add x2, x2, #1
|
||||
add x5, x5, x9
|
||||
add x7, x7, w4, uxtw
|
||||
add x6, x0, x1
|
||||
lsl x1, x1, #1
|
||||
br x5
|
||||
zip1 v31.16b, v31.16b, v4.16b // bottom*256 + rnd
|
||||
cmp w3, #8
|
||||
b.gt 160f
|
||||
b.eq 80f
|
||||
40:
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
ld1r {v6.2s}, [x2] // top
|
||||
usubl v6.8h, v6.8b, v4.8b // top-bottom
|
||||
4:
|
||||
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
|
||||
shll v22.8h, v4.8b, #8 // bottom*256
|
||||
shll v23.8h, v4.8b, #8
|
||||
zip1 v16.2s, v16.2s, v17.2s // weights_ver
|
||||
zip1 v18.2s, v18.2s, v19.2s
|
||||
uxtl v16.8h, v16.8b // weights_ver
|
||||
uxtl v18.8h, v18.8b
|
||||
mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
|
||||
mla v23.8h, v6.8h, v18.8h
|
||||
rshrn v22.8b, v22.8h, #8
|
||||
rshrn v23.8b, v23.8h, #8
|
||||
ldr s7, [x7], #4 // weights_ver
|
||||
uxtl v7.8h, v7.8b // weights_ver
|
||||
zip1 v7.8h, v7.8h, v7.8h
|
||||
zip1 v16.8h, v7.8h, v7.8h
|
||||
zip2 v18.8h, v7.8h, v7.8h // splat weights_ver
|
||||
mul v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
|
||||
mul v23.8h, v6.8h, v18.8h
|
||||
addhn v22.8b, v22.8h, v31.8h
|
||||
addhn v23.8b, v23.8h, v31.8h
|
||||
subs w4, w4, #4
|
||||
st1 {v22.s}[0], [x0], x1
|
||||
st1 {v22.s}[1], [x6], x1
|
||||
subs w4, w4, #4
|
||||
st1 {v23.s}[0], [x0], x1
|
||||
st1 {v23.s}[1], [x6], x1
|
||||
b.gt 4b
|
||||
ret
|
||||
80:
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
ld1 {v6.8b}, [x2] // top
|
||||
ld1 {v6.8b}, [x2] // top
|
||||
usubl v6.8h, v6.8b, v4.8b // top-bottom
|
||||
8:
|
||||
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
|
||||
shll v24.8h, v4.8b, #8 // bottom*256
|
||||
shll v25.8h, v4.8b, #8
|
||||
shll v26.8h, v4.8b, #8
|
||||
shll v27.8h, v4.8b, #8
|
||||
uxtl v16.8h, v16.8b // weights_ver
|
||||
uxtl v17.8h, v17.8b
|
||||
uxtl v18.8h, v18.8b
|
||||
uxtl v19.8h, v19.8b
|
||||
mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
|
||||
mla v25.8h, v6.8h, v17.8h
|
||||
mla v26.8h, v6.8h, v18.8h
|
||||
mla v27.8h, v6.8h, v19.8h
|
||||
rshrn v24.8b, v24.8h, #8
|
||||
rshrn v25.8b, v25.8h, #8
|
||||
rshrn v26.8b, v26.8h, #8
|
||||
rshrn v27.8b, v27.8h, #8
|
||||
ldr s7, [x7], #4 // weights_ver
|
||||
uxtl v7.8h, v7.8b // weights_ver
|
||||
mul v24.8h, v6.8h, v7.h[0] // bottom*256 + (top-bottom)*weights_ver
|
||||
mul v25.8h, v6.8h, v7.h[1]
|
||||
mul v26.8h, v6.8h, v7.h[2]
|
||||
mul v27.8h, v6.8h, v7.h[3]
|
||||
addhn v24.8b, v24.8h, v31.8h
|
||||
addhn v25.8b, v25.8h, v31.8h
|
||||
addhn v26.8b, v26.8h, v31.8h
|
||||
addhn v27.8b, v27.8h, v31.8h
|
||||
subs w4, w4, #4
|
||||
st1 {v24.8b}, [x0], x1
|
||||
st1 {v25.8b}, [x6], x1
|
||||
subs w4, w4, #4
|
||||
st1 {v26.8b}, [x0], x1
|
||||
st1 {v27.8b}, [x6], x1
|
||||
b.gt 8b
|
||||
ret
|
||||
160:
|
||||
320:
|
||||
640:
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
// Set up pointers for four rows in parallel; x0, x6, x5, x8
|
||||
add x5, x0, x1
|
||||
add x8, x6, x1
|
||||
lsl x1, x1, #1
|
||||
mov w9, w3 // x9 = uxtw(w3)
|
||||
sub x1, x1, w3, uxtw
|
||||
mov w9, w3
|
||||
|
||||
1:
|
||||
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
|
||||
uxtl v16.8h, v16.8b // weights_ver
|
||||
uxtl v17.8h, v17.8b
|
||||
uxtl v18.8h, v18.8b
|
||||
uxtl v19.8h, v19.8b
|
||||
ldr s7, [x7], #4 // weights_ver
|
||||
uxtl v7.8h, v7.8b // weights_ver
|
||||
2:
|
||||
ld1 {v3.16b}, [x2], #16 // top
|
||||
shll v20.8h, v4.8b, #8 // bottom*256
|
||||
shll v21.8h, v4.8b, #8
|
||||
shll v22.8h, v4.8b, #8
|
||||
shll v23.8h, v4.8b, #8
|
||||
shll v24.8h, v4.8b, #8
|
||||
shll v25.8h, v4.8b, #8
|
||||
shll v26.8h, v4.8b, #8
|
||||
shll v27.8h, v4.8b, #8
|
||||
usubl v2.8h, v3.8b, v4.8b // top-bottom
|
||||
usubl2 v3.8h, v3.16b, v4.16b
|
||||
mla v20.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
|
||||
mla v21.8h, v3.8h, v16.8h
|
||||
mla v22.8h, v2.8h, v17.8h
|
||||
mla v23.8h, v3.8h, v17.8h
|
||||
mla v24.8h, v2.8h, v18.8h
|
||||
mla v25.8h, v3.8h, v18.8h
|
||||
mla v26.8h, v2.8h, v19.8h
|
||||
mla v27.8h, v3.8h, v19.8h
|
||||
rshrn v20.8b, v20.8h, #8
|
||||
rshrn2 v20.16b, v21.8h, #8
|
||||
rshrn v22.8b, v22.8h, #8
|
||||
rshrn2 v22.16b, v23.8h, #8
|
||||
rshrn v24.8b, v24.8h, #8
|
||||
rshrn2 v24.16b, v25.8h, #8
|
||||
rshrn v26.8b, v26.8h, #8
|
||||
rshrn2 v26.16b, v27.8h, #8
|
||||
mul v20.8h, v2.8h, v7.h[0] // bottom*256 + (top-bottom)*weights_ver
|
||||
mul v22.8h, v2.8h, v7.h[1]
|
||||
mul v21.8h, v3.8h, v7.h[0]
|
||||
mul v23.8h, v3.8h, v7.h[1]
|
||||
subs w3, w3, #16
|
||||
addhn v20.8b, v20.8h, v31.8h
|
||||
addhn v22.8b, v22.8h, v31.8h
|
||||
addhn2 v20.16b, v21.8h, v31.8h
|
||||
addhn2 v22.16b, v23.8h, v31.8h
|
||||
mul v24.8h, v2.8h, v7.h[2]
|
||||
mul v26.8h, v2.8h, v7.h[3]
|
||||
mul v25.8h, v3.8h, v7.h[2]
|
||||
mul v27.8h, v3.8h, v7.h[3]
|
||||
addhn v24.8b, v24.8h, v31.8h
|
||||
addhn v26.8b, v26.8h, v31.8h
|
||||
st1 {v20.16b}, [x0], #16
|
||||
addhn2 v24.16b, v25.8h, v31.8h
|
||||
st1 {v22.16b}, [x6], #16
|
||||
addhn2 v26.16b, v27.8h, v31.8h
|
||||
st1 {v24.16b}, [x5], #16
|
||||
st1 {v26.16b}, [x8], #16
|
||||
b.gt 2b
|
||||
subs w4, w4, #4
|
||||
b.le 9f
|
||||
sub x2, x2, w9, uxtw
|
||||
sub x2, x2, x9
|
||||
add x0, x0, x1
|
||||
add x6, x6, x1
|
||||
add x5, x5, x1
|
||||
@@ -1248,14 +1221,6 @@ function ipred_smooth_v_8bpc_neon, export=1
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// Dispatch table used by the jump-table variant of ipred_smooth_v_8bpc_neon
// (the commit message says this patch removes the jump table, so this block
// appears to belong to the pre-patch side of the diff).
//
// Each .word holds the offset from the table base to one of the function's
// numeric labels (640/320/160/80/40, referenced backwards with the "b"
// suffix). The dispatch sequence computes clz(w3) - 25, loads the entry with
// "ldrsw x9, [x5, w9, uxtw #2]", adds it to the table address in x5 and
// branches with "br x5". With w3 = 64 the index is 0 (label 640) and with
// w3 = 4 the index is 4 (label 40), so the entries must stay in descending
// order. w3 is presumably the block width (4th argument) -- confirm against
// the full signature comment, which is only partly visible here.
jumptable ipred_smooth_v_tbl
|
||||
.word 640b - ipred_smooth_v_tbl
|
||||
.word 320b - ipred_smooth_v_tbl
|
||||
.word 160b - ipred_smooth_v_tbl
|
||||
.word 80b - ipred_smooth_v_tbl
|
||||
.word 40b - ipred_smooth_v_tbl
|
||||
endjumptable
|
||||
|
||||
// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *const topleft,
|
||||
// const int width, const int height, const int a,
|
||||
|
||||
Reference in New Issue
Block a user