mirror of
https://code.videolan.org/videolan/dav1d
synced 2026-06-11 04:03:05 +00:00
AArch64: Optimize ipred_smooth_v_8bpc_neon further
Optimize ipred_smooth_v_8bpc_neon even further by using a vertical inner loop for the w >= 16 cases.

Relative runtime after this patch on some Cortex CPUs:

ipred_smooth_v:   w4      w8      w16     w32     w64
Cortex-A55:       0.985x  0.981x  0.810x  0.873x  0.907x
Cortex-A510:      0.966x  0.951x  0.950x  1.013x  1.047x
Cortex-A520:      0.924x  0.924x  0.890x  0.984x  1.030x
Cortex-A76:       0.978x  1.036x  0.899x  0.919x  0.918x
Cortex-A78:       0.997x  0.993x  0.986x  0.972x  0.983x
Cortex-A710:      1.002x  0.973x  0.984x  0.958x  1.002x
Cortex-A715:      1.073x  1.049x  1.005x  1.018x  1.012x
Cortex-A720:      1.001x  1.004x  0.990x  1.007x  1.008x
Cortex-A725:      1.002x  1.001x  0.985x  1.007x  1.006x
Cortex-X1:        0.996x  1.077x  0.927x  0.962x  0.970x
Cortex-X2:        1.012x  0.989x  0.881x  0.971x  0.981x
Cortex-X3:        1.006x  1.034x  0.841x  0.966x  0.962x
Cortex-X4:        1.020x  1.022x  0.915x  0.964x  0.985x
Cortex-X925:      1.000x  0.947x  0.936x  0.982x  0.996x
This commit is contained in:
+40
-46
@@ -1125,14 +1125,14 @@ function ipred_smooth_v_8bpc_neon, export=1
|
||||
movi v31.8b, #128
|
||||
add x2, x2, #1
|
||||
add x7, x7, w4, uxtw
|
||||
add x6, x0, x1
|
||||
lsl x1, x1, #1
|
||||
zip1 v31.16b, v31.16b, v4.16b // bottom*256 + rnd
|
||||
cmp w3, #8
|
||||
b.gt 160f
|
||||
b.eq 80f
|
||||
40:
|
||||
ld1r {v6.2s}, [x2] // top
|
||||
add x6, x0, x1
|
||||
lsl x1, x1, #1
|
||||
usubl v6.8h, v6.8b, v4.8b // top-bottom
|
||||
4:
|
||||
ldr s7, [x7], #4 // weights_ver
|
||||
@@ -1153,6 +1153,8 @@ function ipred_smooth_v_8bpc_neon, export=1
|
||||
ret
|
||||
80:
|
||||
ld1 {v6.8b}, [x2] // top
|
||||
add x6, x0, x1
|
||||
lsl x1, x1, #1
|
||||
usubl v6.8h, v6.8b, v4.8b // top-bottom
|
||||
8:
|
||||
ldr s7, [x7], #4 // weights_ver
|
||||
@@ -1173,51 +1175,43 @@ function ipred_smooth_v_8bpc_neon, export=1
|
||||
b.gt 8b
|
||||
ret
|
||||
160:
|
||||
// Set up pointers for four rows in parallel; x0, x6, x5, x8
|
||||
add x5, x0, x1
|
||||
add x8, x6, x1
|
||||
lsl x1, x1, #1
|
||||
mov w9, w3 // x9 = uxtw(w3)
|
||||
sub x1, x1, w3, uxtw
|
||||
1:
|
||||
ldr s7, [x7], #4 // weights_ver
|
||||
uxtl v7.8h, v7.8b // weights_ver
|
||||
2:
|
||||
ld1 {v3.16b}, [x2], #16 // top
|
||||
usubl v2.8h, v3.8b, v4.8b // top-bottom
|
||||
usubl2 v3.8h, v3.16b, v4.16b
|
||||
mul v20.8h, v2.8h, v7.h[0] // bottom*256 + (top-bottom)*weights_ver
|
||||
mul v22.8h, v2.8h, v7.h[1]
|
||||
mul v21.8h, v3.8h, v7.h[0]
|
||||
mul v23.8h, v3.8h, v7.h[1]
|
||||
subs w3, w3, #16
|
||||
ldr q7, [x2], #16 // top
|
||||
mov x5, x7
|
||||
mov w8, w4
|
||||
usubl v6.8h, v7.8b, v4.8b // top-bottom lower
|
||||
usubl2 v7.8h, v7.16b, v4.16b // top-bottom upper
|
||||
mov x9, x0
|
||||
add x10, x0, x1, lsl #1
|
||||
16:
|
||||
ldr s5, [x5], #4 // weights_ver
|
||||
uxtl v5.8h, v5.8b // weights_ver
|
||||
mul v16.8h, v6.8h, v5.h[0] // bottom*256 + (top-bottom)*weights_ver
|
||||
mul v17.8h, v6.8h, v5.h[1]
|
||||
mul v18.8h, v7.8h, v5.h[0] // bottom*256 + (top-bottom)*weights_ver
|
||||
mul v19.8h, v7.8h, v5.h[1]
|
||||
addhn v16.8b, v16.8h, v31.8h
|
||||
addhn v17.8b, v17.8h, v31.8h
|
||||
addhn2 v16.16b, v18.8h, v31.8h
|
||||
addhn2 v17.16b, v19.8h, v31.8h
|
||||
subs w8, w8, #4
|
||||
mul v20.8h, v6.8h, v5.h[2]
|
||||
mul v21.8h, v6.8h, v5.h[3]
|
||||
mul v22.8h, v7.8h, v5.h[2]
|
||||
mul v23.8h, v7.8h, v5.h[3]
|
||||
addhn v20.8b, v20.8h, v31.8h
|
||||
addhn v22.8b, v22.8h, v31.8h
|
||||
addhn2 v20.16b, v21.8h, v31.8h
|
||||
addhn2 v22.16b, v23.8h, v31.8h
|
||||
mul v24.8h, v2.8h, v7.h[2]
|
||||
mul v26.8h, v2.8h, v7.h[3]
|
||||
mul v25.8h, v3.8h, v7.h[2]
|
||||
mul v27.8h, v3.8h, v7.h[3]
|
||||
addhn v24.8b, v24.8h, v31.8h
|
||||
addhn v26.8b, v26.8h, v31.8h
|
||||
st1 {v20.16b}, [x0], #16
|
||||
addhn2 v24.16b, v25.8h, v31.8h
|
||||
st1 {v22.16b}, [x6], #16
|
||||
addhn2 v26.16b, v27.8h, v31.8h
|
||||
st1 {v24.16b}, [x5], #16
|
||||
st1 {v26.16b}, [x8], #16
|
||||
b.gt 2b
|
||||
subs w4, w4, #4
|
||||
b.le 9f
|
||||
sub x2, x2, x9
|
||||
add x0, x0, x1
|
||||
add x6, x6, x1
|
||||
add x5, x5, x1
|
||||
add x8, x8, x1
|
||||
mov w3, w9
|
||||
b 1b
|
||||
9:
|
||||
addhn v21.8b, v21.8h, v31.8h
|
||||
addhn2 v20.16b, v22.8h, v31.8h
|
||||
addhn2 v21.16b, v23.8h, v31.8h
|
||||
str q16, [x9]
|
||||
str q17, [x9, x1]
|
||||
add x9, x9, x1, lsl #2
|
||||
str q20, [x10]
|
||||
str q21, [x10, x1]
|
||||
add x10, x10, x1, lsl #2
|
||||
b.gt 16b
|
||||
sub w3, w3, #16
|
||||
add x0, x0, #16
|
||||
cbnz w3, 160b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
|
||||
Reference in New Issue
Block a user