x86: add AVX512-IceLake implementation of HBD 32x64 DCT^2

inv_txfm_add_32x64_dct_dct_0_10bpc_c:           1783.5 ( 1.00x)
inv_txfm_add_32x64_dct_dct_0_10bpc_sse4:         243.3 ( 7.33x)
inv_txfm_add_32x64_dct_dct_0_10bpc_avx2:         119.1 (14.97x)
inv_txfm_add_32x64_dct_dct_0_10bpc_avx512icl:    142.6 (12.50x)
inv_txfm_add_32x64_dct_dct_1_10bpc_c:          50422.5 ( 1.00x)
inv_txfm_add_32x64_dct_dct_1_10bpc_sse4:        2880.5 (17.50x)
inv_txfm_add_32x64_dct_dct_1_10bpc_avx2:        1423.4 (35.43x)
inv_txfm_add_32x64_dct_dct_1_10bpc_avx512icl:    741.6 (67.99x)
inv_txfm_add_32x64_dct_dct_2_10bpc_c:          50433.6 ( 1.00x)
inv_txfm_add_32x64_dct_dct_2_10bpc_sse4:        4015.1 (12.56x)
inv_txfm_add_32x64_dct_dct_2_10bpc_avx2:        1767.7 (28.53x)
inv_txfm_add_32x64_dct_dct_2_10bpc_avx512icl:    960.8 (52.49x)
inv_txfm_add_32x64_dct_dct_3_10bpc_c:          50422.2 ( 1.00x)
inv_txfm_add_32x64_dct_dct_3_10bpc_sse4:        4500.5 (11.20x)
inv_txfm_add_32x64_dct_dct_3_10bpc_avx2:        2111.7 (23.88x)
inv_txfm_add_32x64_dct_dct_3_10bpc_avx512icl:   1777.1 (28.37x)
inv_txfm_add_32x64_dct_dct_4_10bpc_c:          50444.2 ( 1.00x)
inv_txfm_add_32x64_dct_dct_4_10bpc_sse4:        5592.8 ( 9.02x)
inv_txfm_add_32x64_dct_dct_4_10bpc_avx2:        2458.1 (20.52x)
inv_txfm_add_32x64_dct_dct_4_10bpc_avx512icl:   1867.2 (27.02x)
This commit is contained in:
Ronald S. Bultje
2023-04-12 19:16:21 -04:00
parent ed997f5f12
commit 6ae5766724
3 changed files with 370 additions and 3 deletions
+1
View File
@@ -357,6 +357,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
assign_itx2_bpc_fn (R, 32, 16, 10, avx512icl);
assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl);
assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl);
assign_itx1_bpc_fn (R, 32, 64, 10, avx512icl);
}
#endif
#endif
+366
View File
@@ -176,6 +176,9 @@ cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1
cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast
cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part2
SECTION .text
@@ -3104,6 +3107,7 @@ cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
imul r6d, [cq], 181
mov [cq], eobd
or r3d, 16
.dconly3:
add r6d, 128
sar r6d, 8
imul r6d, 181
@@ -4130,4 +4134,366 @@ cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
sar r6d, 10
jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
; inv_txfm_add_dct_dct_32x64_10bpc(dst, stride, c, eob)
; Entry point and eob dispatch for the 10 bpc 32x64 DCT_DCT inverse transform.
;   eob == 0   -> .dconly
;   eob <  136 -> .fast (which itself branches to .fast2 when eob < 36)
;   eob <  543 -> .pass1_fast on the cq+64 half, then .pass1 on the cq+0 half
;   otherwise  -> .pass1 on both halves
; Every coefficient line is 128 bytes; the code below treats it as two 64-byte
; halves and calls the one at cq+64 "right" and the one at cq+0 "left".
; The trailing numbers on the spill stores below repeat the output indices
; given by the punpck comments in .pass1_end.
cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
lea r5, [o_base] ; NOTE(review): presumably the base the o() macro is relative to - confirm
test eobd, eobd
jz .dconly ; no coefficients flagged: take the DC-only shortcut
PROLOGUE 4, 7, 32, -64*40, dst, stride, c, eob
%undef cmp
; constants consumed by the pass-1 helpers
vpbroadcastd m12, [o(pd_2896)]
vpbroadcastd m13, [o(pd_2048)]
vpbroadcastd m14, [o(clip_18b_min)]
vpbroadcastd m15, [o(clip_18b_max)]
cmp eobd, 136
jl .fast
add cq, 64 ; work on the +64-byte half of each line first
cmp eobd, 543
jge .full
call .pass1_fast ; bottomright 16x16 zero
jmp .lefthalf ; r3d was set to 16*12 inside .pass1_fast
.full:
call .pass1
mov r3d, 16*28 ; start offset for .right_zero_loop (32 lines to clear)
.lefthalf:
; Park the 16 results of the first pass-1 call in lines 0-15 of the +64 half
; (cq is still offset by 64 here); they are reloaded for the second pass below.
mova [cq+128* 0], m27 ; 0
mova [cq+128* 1], m14 ; 4
mova [cq+128* 2], m28 ; 8
mova [cq+128* 3], m15 ; 12
mova [cq+128* 4], m22 ; 2
mova [cq+128* 5], m23 ; 6
mova [cq+128* 6], m24 ; 10
mova [cq+128* 7], m25 ; 14
mova [cq+128* 8], m0 ; 1
mova [cq+128* 9], m26 ; 3
mova [cq+128*10], m20 ; 5
mova [cq+128*11], m21 ; 7
mova [cq+128*12], m18 ; 9
mova [cq+128*13], m16 ; 11
mova [cq+128*14], m17 ; 13
mova [cq+128*15], m3 ; 15
sub cq, 64 ; back to the +0 half
; m14/m15 were overwritten with results in .pass1_end, so the pass-1
; constants are loaded again before the second call
vpbroadcastd m12, [o(pd_2896)]
vpbroadcastd m13, [o(pd_2048)]
vpbroadcastd m14, [o(clip_18b_min)]
vpbroadcastd m15, [o(clip_18b_max)]
call .pass1
vpbroadcastd m10, [o(pd_2048)]
lea r5, [o_base_8bpc] ; NOTE(review): presumably so o() inside the 8bpc helpers resolves - confirm
mov r4, rsp ; r4 = destination cursor for the main_part1 calls (stack area)
; Four main_part1 calls: m0/m3 come straight from the second pass-1 call,
; m1/m2 are the matching results of the first call reloaded from cq+64.
mova m1, [cq+128*15+64] ; 15 of the first call
mova m2, [cq+128* 8+64] ; 1 of the first call
call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
mova m0, m21 ; 7
mova m1, [cq+128*12+64] ; 9 of the first call
mova m2, [cq+128*11+64] ; 7 of the first call
mova m3, m18 ; 9
call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
mova m0, m20 ; 5
mova m1, [cq+128*13+64] ; 11 of the first call
mova m2, [cq+128*10+64] ; 5 of the first call
mova m3, m16 ; 11
call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
mova m0, m26 ; 3
mova m1, [cq+128*14+64] ; 13 of the first call
mova m2, [cq+128* 9+64] ; 3 of the first call
mova m3, m17 ; 13
call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
; inputs for the 32x16 8bpc odd-half helper; m14/m15 (4/12 of the second
; call) are left where .pass1_end put them
mova m0, m27 ; 0
mova m1, m28 ; 8
mova m2, [cq+128* 0+64] ; 0 of the first call
mova m3, [cq+128* 2+64] ; 8 of the first call
mova m16, [cq+128* 1+64] ; 4 of the first call
mova m17, [cq+128* 3+64] ; 12 of the first call
call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
; remaining first-call results (2/6/10/14) for the 32x32 odd-half helper
mova m26, [cq+128* 4+64]
mova m27, [cq+128* 5+64]
mova m28, [cq+128* 6+64]
mova m29, [cq+128* 7+64]
; keep m14-m21 in stack slots 32-39 across the next call; the output stage
; reloads them into m0-m7
mova [rsp+64*32], m14
mova [rsp+64*33], m15
mova [rsp+64*34], m16
mova [rsp+64*35], m17
mova [rsp+64*36], m18
mova [rsp+64*37], m19
mova [rsp+64*38], m20
mova [rsp+64*39], m21
call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
pxor m31, m31 ; zero: used to clear coefficients and as the lower clamp on output
.right_zero_loop:
; clear the +64 half, four lines per iteration, from line r3/16 down to line 0
REPX {mova [cq+r3*8+64+128*x], m31}, 0, 1, 2, 3
sub r3d, 16*4
jge .right_zero_loop
mov r3d, 16*28 ; the +0 half had all 32 lines processed: clear them all
jmp .left_zero_loop
; eob < 136: a single .pass1_fast call on the +0 half of the coefficient lines
; (cq has not been offset on this path). With eob < 36 the smaller .fast2
; variant is used instead. Continues at .end for the second pass.
.fast: ; topleft 16x16 nonzero
cmp eobd, 36
jl .fast2
call .pass1_fast ; also sets r3d = 16*12 for .left_zero_loop
vpbroadcastd m10, [o(pd_2048)]
jmp .end
; eob < 36: only the first 32 bytes (ymm width) of coefficient lines 0-7 are
; read. The loaded values are scaled as (x*m12 + m13) >> 12, with m12 = pd_2896
; and m13 = pd_2048 as loaded at function entry, before the 32x8 helper runs.
; On exit the registers are laid out the way .end expects: results 0-7 in
; m27/m0/m22/m26/m14/m20/m23/m21 and zeros in the registers that hold the
; higher-numbered results on the other paths. Falls through into .end.
.fast2: ; topleft 8x8 nonzero
movshdup m7, [o(permB)] ; qword permutation indices for the vpermq/vpermt2q below
mova ym0, [cq+128*0]
mova ym1, [cq+128*4]
mova ym4, [cq+128*2]
mova ym5, [cq+128*6]
mova ym16, [cq+128*1]
mova ym2, [cq+128*5]
mova ym3, [cq+128*3]
mova ym17, [cq+128*7]
mov r3d, 16*4 ; .left_zero_loop start offset: lines 0-7 get cleared
vpermq m0, m7, m0 ; 0 0
vpermq m1, m7, m1 ; 4 4
vpermt2q m4, m7, m5 ; 2 6
vpermt2q m16, m7, m2 ; 1 5
vpermt2q m17, m7, m3 ; 7 3
REPX {pmulld x, m12}, m0, m1, m4, m16, m17
REPX {paddd x, m13}, m0, m1, m4, m16, m17
REPX {psrad x, 12 }, m0, m1, m4, m16, m17
call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
vpbroadcastd m11, [o(pd_1)]
call m(idct_16x16_internal_10bpc).main_end2
call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
mova m10, m13 ; NOTE(review): assumes m13 still holds pd_2048 after the helpers above - confirm
punpcklqdq m27, m0, m2 ; 0
punpckhqdq m0, m2 ; 1
punpcklqdq m22, m3, m4 ; 2
punpckhqdq m26, m3, m4 ; 3
punpcklqdq m14, m5, m7 ; 4
punpckhqdq m20, m5, m7 ; 5
punpcklqdq m23, m6, m8 ; 6
punpckhqdq m21, m6, m8 ; 7
pxor m3, m3
; the registers that carry results 8-15 on the other paths are zero here
REPX {mova x, m3}, m18, m16, m17, m28, m15, m24, m25
; Second pass for the fast paths, followed by the coefficient clear and the
; output stage, which the two-half path also reaches via .left_zero_loop.
; Register contract on entry at .end (from .pass1_end / .fast2): results
; 0,1,...,15 live in m27,m0,m22,m26,m14,m20,m23,m21,m28,m18,m24,m16,m15,m17,
; m25,m3, and r3d holds the start offset for .left_zero_loop.
.end:
lea r5, [o_base_8bpc] ; NOTE(review): presumably so o() inside the 8bpc helpers resolves - confirm
mov r4, rsp ; r4 = destination cursor for the main_part1_fast calls (stack area)
; four part-1 calls; m0/m3 carry the input pair for each (1/15, 7/9, 5/11, 3/13)
call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
mova m0, m21 ; 7
mova m3, m18 ; 9
call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
mova m0, m20 ; 5
mova m3, m16 ; 11
call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
mova m0, m26 ; 3
mova m3, m17 ; 13
call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
mova m0, m27 ; 0
mova m1, m28 ; 8
call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
; keep m14-m21 in stack slots 32-39 across the next call; they are reloaded
; into m0-m7 halfway through the output stage
mova [rsp+64*32], m14
mova [rsp+64*33], m15
mova [rsp+64*34], m16
mova [rsp+64*35], m17
mova [rsp+64*36], m18
mova [rsp+64*37], m19
mova [rsp+64*38], m20
mova [rsp+64*39], m21
call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
pxor m31, m31 ; zero: used to clear coefficients and as the lower clamp on output
.left_zero_loop:
; clear the +0 half of the coefficient lines, four lines per iteration,
; from line r3/16 down to line 0
REPX {mova [cq+r3*8+128*x], m31}, 0, 1, 2, 3
sub r3d, 16*4
jge .left_zero_loop
; c/eob are no longer needed; the registers get new names for the output stage
DEFINE_ARGS dst, stride, dst2, stride32, stklo, stkhi
vpbroadcastd m30, [pixel_10bpc_max] ; upper clamp for the stored pixels
vpbroadcastd m13, [pw_2048] ; pmulhrsw factor applied in .end_sumsub_write
mov stride32q, strideq
shl stride32q, 5 ; stride32 = 32 * stride
lea stkhiq, [rsp+31*mmsize] ; last of the 32 stack slots, walked downwards
lea dst2q, [dstq+stride32q]
mov stkloq, rsp ; first stack slot, walked upwards
sub dst2q, strideq ; dst31
; Each .end_sumsub_write call consumes one (m8, m9) pair plus one stack slot
; from each end and writes four output rows.
paddsw m8, m0, m29 ; t0[idct32]
psubsw m9, m0, m29 ; t31[idct32]
call .end_sumsub_write
paddsw m8, m1, m28 ; t1[idct32]
psubsw m9, m1, m28 ; t30[idct32]
call .end_sumsub_write
paddsw m8, m2, m27 ; t2[idct32]
psubsw m9, m2, m27 ; t29[idct32]
call .end_sumsub_write
paddsw m8, m3, m26 ; t3[idct32]
psubsw m9, m3, m26 ; t28[idct32]
call .end_sumsub_write
paddsw m8, m4, m25 ; t4[idct32]
psubsw m9, m4, m25 ; t27[idct32]
call .end_sumsub_write
paddsw m8, m5, m24 ; t5[idct32]
psubsw m9, m5, m24 ; t26[idct32]
call .end_sumsub_write
paddsw m8, m6, m23 ; t6[idct32]
psubsw m9, m6, m23 ; t25[idct32]
call .end_sumsub_write
paddsw m8, m7, m22 ; t7[idct32]
psubsw m9, m7, m22 ; t24[idct32]
call .end_sumsub_write
; second group: fetch the eight values parked in stack slots 32-39
mova m0, [rsp+64*32]
mova m1, [rsp+64*33]
mova m2, [rsp+64*34]
mova m3, [rsp+64*35]
mova m4, [rsp+64*36]
mova m5, [rsp+64*37]
mova m6, [rsp+64*38]
mova m7, [rsp+64*39]
paddsw m8, m0, m21 ; t8[idct32]
psubsw m9, m0, m21 ; t23[idct32]
call .end_sumsub_write
paddsw m8, m1, m20 ; t9[idct32]
psubsw m9, m1, m20 ; t22[idct32]
call .end_sumsub_write
paddsw m8, m2, m19 ; t10[idct32]
psubsw m9, m2, m19 ; t21[idct32]
call .end_sumsub_write
paddsw m8, m3, m18 ; t11[idct32]
psubsw m9, m3, m18 ; t20[idct32]
call .end_sumsub_write
paddsw m8, m4, m17 ; t12[idct32]
psubsw m9, m4, m17 ; t19[idct32]
call .end_sumsub_write
paddsw m8, m5, m16 ; t13[idct32]
psubsw m9, m5, m16 ; t18[idct32]
call .end_sumsub_write
paddsw m8, m6, m15 ; t14[idct32]
psubsw m9, m6, m15 ; t17[idct32]
call .end_sumsub_write
paddsw m8, m7, m14 ; t15[idct32]
psubsw m9, m7, m14 ; t16[idct32]
call .end_sumsub_write
RET
; Final butterfly + store for four output rows (n = number of earlier calls).
; In:  m8  = t(n)[idct32], m9 = t(31-n)[idct32]
;      [stkloq] = t32+n, [stkhiq] = t63-n
;      m13 = pmulhrsw factor, m30 = upper clamp, m31 = zero (lower clamp)
;      dstq = row n, dst2q = row 31-n, stride32q = 32 rows
; Out: rows n, 31-n, 32+n and 63-n of dst updated; stkloq/dstq advance by
;      one slot/row, stkhiq/dst2q step back by one.
; Clobbers m8-m12.
.end_sumsub_write:
mova m10, [stkhiq] ; t63-n
mova m12, [stkloq] ; t32+n
psubsw m11, m8, m10 ; out63-n
paddsw m8, m10 ; out0 +n
psubsw m10, m9, m12 ; out32+n
paddsw m9, m12 ; out31-n
REPX {pmulhrsw x, m13}, m11, m8, m10, m9
; add the scaled residual to the existing pixels
paddw m8, [dstq]
paddw m9, [dst2q]
paddw m10, [dstq+stride32q]
paddw m11, [dst2q+stride32q]
; clamp to [0, pixel_10bpc_max]
REPX {pminsw x, m30}, m11, m8, m10, m9
REPX {pmaxsw x, m31}, m11, m8, m10, m9
mova [dstq ], m8
mova [dst2q ], m9
mova [dstq +stride32q], m10
mova [dst2q+stride32q], m11
add stkloq, mmsize
sub stkhiq, mmsize
add dstq, strideq
sub dst2q, strideq
ret
; DC-only path, reached only when eob == 0 (jz after "test eobd, eobd").
; Statement order matters: eobd is still zero when it is stored to clear the
; DC coefficient, and only afterwards is the same register OR-ed with 64.
.dconly:
DEFINE_ARGS dst, stride, c, eob
imul r6d, [cq], 181 ; r6d = dc * 181
mov [cq], eobd ; eobd == 0 here: clears the DC coefficient
or r3d, 64 ; NOTE(review): assumes r3d is the eob register (4th arg), leaving r3d = 64 - presumably the row count for the shared tail; confirm
jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly3
; First pass reading only coefficient lines 0-15 (relative to the current cq).
; In:  cq = coefficient pointer, m12 = pd_2896 (every loaded line is multiplied
;      by it); m13-m15 as set up by the caller for the helpers.
; Out: r3d = 16*12 (start offset for the caller's zeroing loop); result
;      registers as produced by .pass1_end, which this jumps into.
.pass1_fast:
pmulld m0, m12, [cq+128* 0]
pmulld m1, m12, [cq+128* 4]
pmulld m2, m12, [cq+128* 8]
pmulld m3, m12, [cq+128*12]
mov r3d, 16*12
call m(idct_8x16_internal_10bpc).main_fast_rect2
pmulld m16, m12, [cq+128* 2]
pmulld m17, m12, [cq+128* 6]
pmulld m18, m12, [cq+128*10]
pmulld m19, m12, [cq+128*14]
call m(idct_16x16_internal_10bpc).main_fast_rect2
call .pass1_load_spill ; stores m0-m7/m16-m23 to lines 0-15, loads odd lines 1-15
call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2
jmp .pass1_end
; Full first pass reading all 32 coefficient lines (relative to the current
; cq), falling through into the shared tail .pass1_end.
; In:  cq = coefficient pointer, m12 = pd_2896 (every loaded line is multiplied
;      by it); m13-m15 as set up by the caller for the helpers.
; Out (.pass1_end): sixteen result registers, numbered by the trailing
;      comments on the punpck instructions below:
;      0=m27 1=m0 2=m22 3=m26 4=m14 5=m20 6=m23 7=m21
;      8=m28 9=m18 10=m24 11=m16 12=m15 13=m17 14=m25 15=m3
;      m11 = pd_1, r4 = cq+128*8.
.pass1:
; lines 0, 4, ..., 28
pmulld m0, m12, [cq+128* 0]
pmulld m1, m12, [cq+128* 4]
pmulld m2, m12, [cq+128* 8]
pmulld m3, m12, [cq+128*12]
pmulld m4, m12, [cq+128*16]
pmulld m5, m12, [cq+128*20]
pmulld m6, m12, [cq+128*24]
pmulld m7, m12, [cq+128*28]
call m(idct_8x16_internal_10bpc).main_rect2
; lines 2, 6, ..., 30
pmulld m16, m12, [cq+128* 2]
pmulld m17, m12, [cq+128* 6]
pmulld m18, m12, [cq+128*10]
pmulld m19, m12, [cq+128*14]
pmulld m20, m12, [cq+128*18]
pmulld m21, m12, [cq+128*22]
pmulld m22, m12, [cq+128*26]
pmulld m23, m12, [cq+128*30]
call m(idct_16x16_internal_10bpc).main_rect2
call .pass1_load_spill ; stores m0-m7/m16-m23 to lines 0-15, loads odd lines 1-15 into m0-m7
; odd lines 17-31
pmulld m16, m12, [cq+128*17]
pmulld m17, m12, [cq+128*19]
pmulld m18, m12, [cq+128*21]
pmulld m19, m12, [cq+128*23]
pmulld m20, m12, [cq+128*25]
pmulld m21, m12, [cq+128*27]
pmulld m22, m12, [cq+128*29]
pmulld m23, m12, [cq+128*31]
call m(inv_txfm_add_dct_dct_32x16_10bpc).main_rect2
.pass1_end:
vpbroadcastd m11, [o(pd_1)]
lea r4, [cq+128*8]
call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end
; interleave qwords of the helper's outputs into the final register layout
punpcklqdq m27, m0, m20 ; 0
punpckhqdq m0, m20 ; 1
punpcklqdq m24, m5, m16 ; 10
punpckhqdq m16, m5, m16 ; 11
punpcklqdq m23, m3, m21 ; 6
punpckhqdq m21, m3, m21 ; 7
punpcklqdq m25, m7, m8 ; 14
punpckhqdq m3, m7, m8 ; 15
punpcklqdq m22, m15, m4 ; 2
punpckhqdq m26, m15, m4 ; 3
punpcklqdq m15, m6, m17 ; 12
punpckhqdq m17, m6, m17 ; 13
punpcklqdq m28, m14, m18 ; 8
punpckhqdq m18, m14, m18 ; 9
punpcklqdq m14, m2, m1 ; 4
punpckhqdq m20, m2, m1 ; 5
ret
; Runs the 32x16 idct16_sumsub helper, then swaps data with the coefficient
; buffer: m0-m7 go to lines 0-7 and m23..m16 (in that order) to lines 8-15,
; while odd lines 1, 3, ..., 15 are loaded, multiplied by m12, into m0-m7.
; Loads and stores are interleaved so that every odd line is read before the
; store that overwrites it - do not reorder.
.pass1_load_spill:
call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
mova [cq+128* 0], m0
pmulld m0, m12, [cq+128* 1] ; read line 1 before it is overwritten below
mova [cq+128* 1], m1
mova [cq+128* 2], m2
pmulld m1, m12, [cq+128* 3]
pmulld m2, m12, [cq+128* 5]
mova [cq+128* 3], m3
mova [cq+128* 4], m4
pmulld m3, m12, [cq+128* 7]
pmulld m4, m12, [cq+128* 9]
mova [cq+128* 5], m5
mova [cq+128* 6], m6
mova [cq+128* 7], m7
pmulld m5, m12, [cq+128*11]
pmulld m6, m12, [cq+128*13]
pmulld m7, m12, [cq+128*15]
; second group goes out in descending register order
mova [cq+128* 8], m23
mova [cq+128* 9], m22
mova [cq+128*10], m21
mova [cq+128*11], m20
mova [cq+128*12], m19
mova [cq+128*13], m18
mova [cq+128*14], m17
mova [cq+128*15], m16
ret
%endif ; ARCH_X86_64
+3 -3
View File
@@ -6090,7 +6090,7 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
sar r6d, 8+1
jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
ALIGN function_align ; bottom three-quarters are zero
.main_part1_fast:
cglobal_label .main_part1_fast
vpbroadcastd m1, [o(idct64_mul+4*0)]
vpbroadcastd m8, [o(idct64_mul+4*1)]
vpbroadcastd m2, [o(idct64_mul+4*6)]
@@ -6104,7 +6104,7 @@ ALIGN function_align ; bottom three-quarters are zero
mova m6, m3
mova m5, m2
jmp .main_part1b
.main_part1:
cglobal_label .main_part1
; idct64 steps 1-5:
; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
@@ -6163,7 +6163,7 @@ ALIGN function_align ; bottom three-quarters are zero
mova [r4+64*5], m5
add r4, 64*8
ret
.main_part2:
cglobal_label .main_part2
vpbroadcastd m11, [o(pw_1567_3784 -16*13)]
vpbroadcastd m12, [o(pw_m3784_1567 -16*13)]
lea r6, [r4+64*7]