x86: add AVX2 version of mc.resize()

resize_8bpc_c: 1637609.7
resize_8bpc_avx2: 95162.6
This commit is contained in:
Ronald S. Bultje
2020-03-31 13:19:55 +02:00
parent 862e5bc773
commit 9e36b9b001
2 changed files with 156 additions and 0 deletions
+153
View File
@@ -138,13 +138,24 @@ bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8,
; byte-shuffle index tables (16 bytes each); consumers are outside this view
bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 ; pairs (i+4, i), i = 0..7
deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 ; pairs (i, i+4), i = 0..7
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 ; bytes {0,1} x4, then bytes {2,3} x4
; Constants used by the AVX2 resize function.
;
; pb_8x0_8x8: per-128-bit-lane byte bias added to the edge-emulation shuffle
;             masks: +0 for the low qword, +8 for the high qword, so that the
;             high qword's indices select from the upper 8 bytes of the lane.
pb_8x0_8x8:     db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8
ALIGN 32
; resize_mul: lane indices 0..7, multiplied by dx to get per-pixel positions.
resize_mul:     dd 0, 1, 2, 3, 4, 5, 6, 7
; resize_shuf: 24-byte sliding window of pshufb indices, read 8 bytes at a time
;              from resize_shuf+4+offset. offset == 0 yields 0..7 (identity);
;              a negative offset repeats index 0 on the left, a positive offset
;              repeats index 7 on the right.
resize_shuf:    db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
                db 7, 7, 7, 7, 7, 7, 7, 7
; NOTE(review): the wm_* constants below are not referenced by any code in this
; view; presumably they belong to the w_mask functions - confirm at the users.
ALIGN 8
wm_420_perm64: dq 0xfedcba9876543210 ; nibbles 0x0..0xf, ascending from the least significant nibble
wm_420_sign: dd 0x01020102, 0x01010101
wm_422_sign: dd 0x80808080, 0x7f7f7f7f
wm_sign_avx512: dd 0x40804080, 0xc0c0c0c0, 0x40404040
; 16-bit constants, each stored twice so that a single dword broadcast
; (vpbroadcastd) replicates the word into every 16-bit lane.
ALIGN 4
pw_m128:        times 2 dw -128
pw_m256:        times 2 dw -256     ; pmulhrsw multiplier in resize: (64 - x) >> 7
pw_34:          times 2 dw 34
pw_258:         times 2 dw 258
pw_512:         times 2 dw 512
@@ -154,6 +165,7 @@ pw_6903: times 2 dw 6903
pw_8192: times 2 dw 8192
; 32-bit constants, intended to be broadcast to every dword lane
pd_2: dd 2
pd_32: dd 32
pd_63: dd 63 ; 0x3f: mask that keeps the 6-bit filter index in resize
pd_512: dd 512
pd_32768: dd 32768
@@ -4987,6 +4999,147 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
.end:
RET
cextern resize_filter

;------------------------------------------------------------------------------
; resize(dst, dst_stride, src, src_stride, dst_w, h, src_w, dx, mx0)
;
; Horizontal 8-tap resampler. Each .loop_x iteration produces 8 output bytes.
; For output pixel i of a row the 14-bit fixed-point source position is
;     mx = (mx0 - (4 << 14)) + i * dx
; The 8 source bytes are loaded from src + (iclip(mx, 0, (src_w-8) << 14) >> 14)
; and the filter row is resize_filter[(mx >> 8) & 63] (8 bytes per row).
; When clipping moved the load position, the loaded bytes are re-ordered with
; a pshufb mask taken from resize_shuf so that the edge pixel is replicated.
;
; In:    first 6 arguments in registers, src_w / dx / mx0 read through their
;        argument slots (src_wm / dxm / mx0m); the mx0 and src_w slots are
;        modified in place.
; Uses:  r0-r11 (12 GPRs), m0-m15.
; Notes: - every store writes 8 bytes, so up to 7 bytes past dst_w are written
;          when dst_w is not a multiple of 8.
;        - the vinserti128 loads read 16 bytes from src although only the low
;          8 are used.
;        NOTE(review): both of the above presumably rely on padding guaranteed
;        by the caller - confirm.
;------------------------------------------------------------------------------
INIT_YMM avx2
; only r0-r7 (named) and r8-r11 are referenced below, so 12 GPRs are sufficient
cglobal resize, 6, 12, 16, dst, dst_stride, src, src_stride, \
                           dst_w, h, src_w, dx, mx0
    sub          dword mx0m, 4<<14
    sub        dword src_wm, 8
    vpbroadcastd         m5, dxm
    vpbroadcastd         m8, mx0m
    vpbroadcastd         m6, src_wm

    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
    LEA                  r7, $$
%define base r7-$$

    vpbroadcastd         m3, [base+pw_m256]
    vpbroadcastd         m7, [base+pd_63]
    vbroadcasti128      m15, [base+pb_8x0_8x8]
    pmaddwd              m2, m5, [base+resize_mul]  ; dx*[0,1,2,3,4,5,6,7]
    pslld                m5, 3                      ; dx*8
    pslld                m6, 14                     ; (src_w-8)<<14
    paddd                m8, m2                     ; mx+[0..7]*dx
    pxor                 m2, m2

    ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
    ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = (src_w-8)<<14, m7 = 0x3f, m15=0,8

.loop_y:
    xor                  xd, xd
    mova                 m4, m8                     ; per-line working version of mx

.loop_x:
    pmaxsd               m0, m4, m2
    psrad                m9, m4, 8                  ; filter offset (unmasked)
    pminsd               m0, m6                     ; iclip(mx, 0, src_w-8)
    psubd                m1, m4, m0                 ; pshufb offset
    psrad                m0, 14                     ; clipped src_x offset
    psrad                m1, 14                     ; pshufb edge_emu offset
    pand                 m9, m7                     ; filter offset (masked)

    ; load source pixels - this ugly code is vpgatherdq emulation since
    ; directly using vpgatherdq on Haswell is quite a bit slower :(
    ; resulting layout (one qword of 8 source bytes per output pixel):
    ;   m12 = px0, px1 | px4, px5      m13 = px2, px3 | px6, px7
    movd                r8d, xm0
    pextrd              r9d, xm0, 1
    pextrd             r10d, xm0, 2
    pextrd             r11d, xm0, 3
    vextracti128        xm0, m0, 1
    movq               xm12, [srcq+r8]
    movq               xm13, [srcq+r10]
    movhps             xm12, [srcq+r9]
    movhps             xm13, [srcq+r11]
    movd                r8d, xm0
    pextrd              r9d, xm0, 1
    pextrd             r10d, xm0, 2
    pextrd             r11d, xm0, 3
    vinserti128         m12, [srcq+r8], 1           ; 16-byte load, high qword replaced below
    vinserti128         m13, [srcq+r10], 1
    vpbroadcastq        m10, [srcq+r9]
    vpbroadcastq        m11, [srcq+r11]
    vpblendd            m12, m12, m10, 11000000b    ; top qword <- broadcast value
    vpblendd            m13, m13, m11, 11000000b

    ; if no emulation is required, we don't need to shuffle or emulate edges
    ; this also saves 2 quasi-vpgatherdqs
    vptest               m1, m1
    jz .filter

    ; gather one 8-byte pshufb mask per pixel from resize_shuf, using the same
    ; qword layout as m12/m13; the offsets can be negative, hence the movsxd
    movd                r8d, xm1
    pextrd              r9d, xm1, 1
    pextrd             r10d, xm1, 2
    pextrd             r11d, xm1, 3
    movsxd               r8, r8d
    movsxd               r9, r9d
    movsxd              r10, r10d
    movsxd              r11, r11d
    vextracti128        xm1, m1, 1
    movq               xm14, [base+resize_shuf+4+r8]
    movq                xm0, [base+resize_shuf+4+r10]
    movhps             xm14, [base+resize_shuf+4+r9]
    movhps              xm0, [base+resize_shuf+4+r11]
    movd                r8d, xm1
    pextrd              r9d, xm1, 1
    pextrd             r10d, xm1, 2
    pextrd             r11d, xm1, 3
    movsxd               r8, r8d
    movsxd               r9, r9d
    movsxd              r10, r10d
    movsxd              r11, r11d
    vinserti128         m14, [base+resize_shuf+4+r8], 1
    vinserti128          m0, [base+resize_shuf+4+r10], 1
    vpbroadcastq        m10, [base+resize_shuf+4+r9]
    vpbroadcastq        m11, [base+resize_shuf+4+r11]
    vpblendd            m14, m14, m10, 11000000b
    vpblendd             m0, m0, m11, 11000000b

    paddb               m14, m15                    ; +8 for the high qword of each lane
    paddb                m0, m15
    pshufb              m12, m14
    pshufb              m13, m0

.filter:
    ; gather one 8-byte filter row per pixel, same qword layout as m12/m13
    movd                r8d, xm9
    pextrd              r9d, xm9, 1
    pextrd             r10d, xm9, 2
    pextrd             r11d, xm9, 3
    vextracti128        xm9, m9, 1
    movq               xm10, [base+resize_filter+r8*8]
    movq               xm11, [base+resize_filter+r10*8]
    movhps             xm10, [base+resize_filter+r9*8]
    movhps             xm11, [base+resize_filter+r11*8]
    movd                r8d, xm9
    pextrd              r9d, xm9, 1
    pextrd             r10d, xm9, 2
    pextrd             r11d, xm9, 3
    vinserti128         m10, [base+resize_filter+r8*8], 1
    vinserti128         m11, [base+resize_filter+r10*8], 1
    vpbroadcastq        m14, [base+resize_filter+r9*8]
    vpbroadcastq         m1, [base+resize_filter+r11*8]
    vpblendd            m10, m10, m14, 11000000b
    vpblendd            m11, m11, m1, 11000000b

    ; unsigned pixels * signed taps, then two horizontal adds reduce the
    ; 8 products of every pixel to one word, in pixel order 0..7
    pmaddubsw           m12, m10
    pmaddubsw           m13, m11
    phaddw              m12, m13
    vextracti128       xm13, m12, 1
    phaddsw            xm12, xm13
    ; pmulhrsw by -256 evaluates (64 - sum) >> 7
    ; NOTE(review): presumably resize_filter stores negated taps, which makes
    ; this the x=(x+64)>>7 rounding named below - confirm against the table
    pmulhrsw           xm12, xm3                    ; x=(x+64)>>7
    packuswb           xm12, xm12
    movq          [dstq+xq], xm12

    paddd                m4, m5                     ; advance mx by 8 pixels
    add                  xd, 8
    cmp                  xd, dst_wd
    jl .loop_x

    add                dstq, dst_strideq
    add                srcq, src_strideq
    dec                  hd
    jg .loop_y
    RET
; NOTE(review): PREP_BILIN and PREP_8TAP are macros defined outside this view;
; presumably they instantiate the prep functions for the instruction set
; selected by INIT_YMM - confirm at the macro definitions.
INIT_YMM avx2
PREP_BILIN
PREP_8TAP
+3
View File
@@ -115,6 +115,8 @@ decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse2);
/* Declarations of externally implemented DSP functions. The decl_*_fn macros
 * are defined outside this view; presumably each expands to a prototype with
 * the matching function-pointer signature - confirm. */
decl_emu_edge_fn(dav1d_emu_edge_avx2);
decl_emu_edge_fn(dav1d_emu_edge_ssse3);
decl_resize_fn(dav1d_resize_avx2);
COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
#define init_mc_fn(type, name, suffix) \
c->mc[type] = dav1d_put_##name##_##suffix
@@ -219,6 +221,7 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
c->emu_edge = dav1d_emu_edge_avx2;
c->resize = dav1d_resize_avx2;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))