x86: add AVX512-IceLake implementation of HBD 64x32 DCT^2

inv_txfm_add_64x32_dct_dct_0_10bpc_c:           1760.6 ( 1.00x)
inv_txfm_add_64x32_dct_dct_0_10bpc_sse4:         271.1 ( 6.49x)
inv_txfm_add_64x32_dct_dct_0_10bpc_avx2:         121.3 (14.52x)
inv_txfm_add_64x32_dct_dct_0_10bpc_avx512icl:    116.3 (15.14x)
inv_txfm_add_64x32_dct_dct_1_10bpc_c:          66507.4 ( 1.00x)
inv_txfm_add_64x32_dct_dct_1_10bpc_sse4:        3712.4 (17.91x)
inv_txfm_add_64x32_dct_dct_1_10bpc_avx2:        1830.5 (36.33x)
inv_txfm_add_64x32_dct_dct_1_10bpc_avx512icl:    805.4 (82.58x)
inv_txfm_add_64x32_dct_dct_2_10bpc_c:          66491.6 ( 1.00x)
inv_txfm_add_64x32_dct_dct_2_10bpc_sse4:        5325.3 (12.49x)
inv_txfm_add_64x32_dct_dct_2_10bpc_avx2:        2578.5 (25.79x)
inv_txfm_add_64x32_dct_dct_2_10bpc_avx512icl:   1394.5 (47.68x)
inv_txfm_add_64x32_dct_dct_3_10bpc_c:          66490.2 ( 1.00x)
inv_txfm_add_64x32_dct_dct_3_10bpc_sse4:        6418.5 (10.36x)
inv_txfm_add_64x32_dct_dct_3_10bpc_avx2:        3305.6 (20.11x)
inv_txfm_add_64x32_dct_dct_3_10bpc_avx512icl:   2571.5 (25.86x)
inv_txfm_add_64x32_dct_dct_4_10bpc_c:          66508.6 ( 1.00x)
inv_txfm_add_64x32_dct_dct_4_10bpc_sse4:        8671.2 ( 7.67x)
inv_txfm_add_64x32_dct_dct_4_10bpc_avx2:        4054.2 (16.40x)
inv_txfm_add_64x32_dct_dct_4_10bpc_avx512icl:   2691.6 (24.71x)
This commit is contained in:
Ronald S. Bultje
2023-04-18 11:01:53 -04:00
parent 0b809a9281
commit 68d7a76d08
2 changed files with 458 additions and 78 deletions
+1
View File
@@ -359,6 +359,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl);
assign_itx1_bpc_fn (R, 32, 64, 10, avx512icl);
assign_itx1_bpc_fn (R, 64, 16, 10, avx512icl);
assign_itx1_bpc_fn (R, 64, 32, 10, avx512icl);
}
#endif
#endif
+457 -78
View File
@@ -691,6 +691,9 @@ ALIGN function_align
REPX {psrad x, 12 }, m4, m5, m6, m7
ret
ALIGN function_align
.main_fast2_rect2:
REPX {paddd x, m13}, m0, m1
REPX {psrad x, 12 }, m0, m1
.main_fast2:
pmulld m0, m12
pmulld m6, m1, [o(pd_4017)] {1to16} ; t7a
@@ -1583,6 +1586,9 @@ cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
psrlq m9, m8, 8
jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2
ALIGN function_align
.main_fast2_rect2:
REPX {paddd x, m13}, m16, m17
REPX {psrad x, 12 }, m16, m17
.main_fast2:
pmulld m22, m16, [o(pd_4076)] {1to16} ; t15a
pmulld m9, m16, [o(pd_401)] {1to16} ; t8a
@@ -3380,7 +3386,10 @@ ALIGN function_align
mova [cq+64*13], m17
mova [cq+64*15], m16
ret
.main_fast2: ; bottom half is zero
.main_fast2_rect2:
REPX {paddd x, m13}, m0, m1, m2, m3
REPX {psrad x, 12 }, m0, m1, m2, m3
.main_fast2: ; bottom 3/4 is zero
pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a
pmulld m0, [o(pd_201)] {1to16} ; t16a
pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a
@@ -3643,6 +3652,18 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
vpbroadcastd m15, [o(clip_18b_max)]
call .pass1
lea r5, [o_base_8bpc]
call .pass2_start
pxor m12, m12
.right_zero_loop:
mova [cq+r6*8+64+128*3], m12
mova [cq+r6*8+64+128*2], m12
mova [cq+r6*8+64+128*1], m12
mova [cq+r6*8+64+128*0], m12
sub r6d, 16*4
jge .right_zero_loop
mov r6d, 16*28
jmp .end2
.pass2_start:
mova m4, [cq+64+128* 0]
mova m5, [cq+64+128* 1]
mova m6, [cq+64+128* 2]
@@ -3669,22 +3690,15 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
mova m19, [cq+64+128*13]
mova m20, [cq+64+128*14]
mova m21, [cq+64+128*15]
call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
pxor m12, m12
.right_zero_loop:
mova [cq+r6*8+64+128*3], m12
mova [cq+r6*8+64+128*2], m12
mova [cq+r6*8+64+128*1], m12
mova [cq+r6*8+64+128*0], m12
sub r6d, 16*4
jge .right_zero_loop
mov r6d, 16*28
jmp .end2
jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
.fast: ; topleft 16x16 nonzero
cmp eobd, 36
jl .fast2
call .pass1_fast
lea r5, [o_base_8bpc]
call .pass2_fast_start
jmp .end
.pass2_fast_start:
call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
mova [cq+128*0], m14
mova [cq+128*1], m15
@@ -3694,8 +3708,7 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
mova [cq+128*5], m19
mova [cq+128*6], m20
mova [cq+128*7], m21
call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
jmp .end
jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
.fast2: ; topleft 8x8 nonzero
movshdup m7, [o(permB)]
mova ym0, [cq+128*0]
@@ -3714,6 +3727,22 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
vpermt2q m17, m7, m3 ; 7 3
call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
call m(idct_16x16_internal_10bpc).main_end
call .pass2_fast2_start
.end:
pxor m12, m12
.end2:
call .pass2_end
.zero_loop:
mova [cq+r6*8+128*3], m12
mova [cq+r6*8+128*2], m12
mova [cq+r6*8+128*1], m12
mova [cq+r6*8+128*0], m12
sub r6d, 16*4
jge .zero_loop
WIN64_RESTORE_XMM
vzeroupper
ret
.pass2_fast2_start:
call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
lea r5, [o_base_8bpc]
punpckhqdq m22, m0, m2 ; 1
@@ -3734,10 +3763,8 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
mova [cq+128*5], m19
mova [cq+128*6], m20
mova [cq+128*7], m21
call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
.end:
pxor m12, m12
.end2:
jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
.pass2_end:
psubsw m9, m0, m29 ; out31
paddsw m0, m29 ; out0
psubsw m29, m1, m28 ; out30
@@ -3779,13 +3806,6 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
paddsw m6, m15 ; out14
psubsw m15, m7, m14 ; out16
paddsw m7, m14 ; out15
.zero_loop:
mova [cq+r6*8+128*3], m12
mova [cq+r6*8+128*2], m12
mova [cq+r6*8+128*1], m12
mova [cq+r6*8+128*0], m12
sub r6d, 16*4
jge .zero_loop
call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
pmulhrsw m0, m11, m15
pmulhrsw m1, m11, m16
@@ -3806,8 +3826,6 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
pmulhrsw m1, m11, m28
pmulhrsw m2, m11, m29
pmulhrsw m3, m11, m9
WIN64_RESTORE_XMM
vzeroupper
jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
.dconly:
imul r6d, [cq], 181
@@ -4746,27 +4764,30 @@ cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
ret
.pass1_load_spill:
mova [cq+64* 0], m0
mova [cq+64* 1], m1
mova m0, [cq+64* 2]
mova [cq+64* 2], m1
mova m1, [cq+64* 6]
mova [cq+64* 2], m2
mova [cq+64* 3], m3
mova [cq+64* 4], m4
mova [cq+64* 5], m5
mova [cq+64* 6], m6
mova [cq+64* 7], m7
mova [cq+64* 4], m2
mova [cq+64* 6], m3
mova m2, [cq+64*10]
mova m3, [cq+64*14]
mova [cq+64* 8], m23
mova [cq+64* 9], m22
mova [cq+64*10], m21
mova [cq+64*11], m20
mova [cq+64*12], m19
mova [cq+64*13], m18
mova [cq+64*14], m17
mova [cq+64* 8], m4
mova [cq+64*10], m5
mova [cq+64*12], m6
mova [cq+64*14], m7
mova [cq+64* 1], m23
mova [cq+64* 3], m22
mova [cq+64* 5], m21
mova [cq+64* 7], m20
mova [cq+64* 9], m19
mova [cq+64*11], m18
mova [cq+64*13], m17
mova [cq+64*15], m16
ret
ALIGN function_align
.main_part1_fast_rect2:
REPX {paddd x, m13}, m0, m3
REPX {psrad x, 12 }, m0, m3
.main_part1_fast:
pmulld m7, m0, [r4+4*0]{bcstd} ; t63a
pmulld m0, [r4+4*1]{bcstd} ; t32a
@@ -4904,16 +4925,16 @@ ALIGN function_align
cmp r6, r4
jl .main_part2_loop
ret
.idct64_end:
%macro IDCT64_PASS1_END 8
mova m%5, [cq+%1*64] ; t0+n [idct32] + idct64 rounding
.idct64_main_end:
%macro IDCT64_PASS1_END 9
mova m%5, [%9+%1*128] ; t0+n [idct32] + idct64 rounding
psubd m%6, m%5, m%2 ; out31-n [idct32] = t31-n [idct64]
paddd m%5, m%2 ; out0+n [idct32] = t0+n [idct64]
REPX {pmaxsd x, m14}, m%6, m%5
REPX {pminsd x, m15}, m%6, m%5
REPX {paddd x, m11}, m%6, m%5
mova m%2, [rsp+%3*64] ; t32+n [idct64]
mova m%7, [rsp+%4*64] ; t63-n [idct64]
mova m%2, [r3+%3*64] ; t32+n [idct64]
mova m%7, [r3+%4*64] ; t63-n [idct64]
psubd m%8, m%5, m%7 ; out63-n
paddd m%5, m%7 ; out0+n
psubd m%7, m%6, m%2 ; out32+n
@@ -4936,35 +4957,37 @@ ALIGN function_align
%assign %%r3 (16+%1) ; t24-n
%assign %%r4 (23-%1) ; t31-n
%assign %%c1 (%1) ; t0+n
%assign %%c2 (7-%1) ; t7-n
%assign %%c3 (15-%1) ; t8+n
%assign %%c4 (8+%1) ; t15-n
%assign %%c1 (%1) ; t0/8+n
%assign %%c2 (7-%1) ; t7/15-n
IDCT64_PASS1_END %%c1, %%r4, %%m1, %%m8, 24, 25, 26, 27 ; out0/31/32/63
IDCT64_PASS1_END %%c4, %%r1, %%m4, %%m5, 28, 29, 30, 31 ; out15/16/47/48
IDCT64_PASS1_END %%c1, %%r4, %%m1, %%m8, 24, 25, 26, 27, cq ; out0/31/32/63
IDCT64_PASS1_END %%c1, %%r1, %%m4, %%m5, 28, 29, 30, 31, r4 ; out15/16/47/48
packssdw m %+ %%r1, m24, m29
packssdw m %+ %%r4, m28, m25
packssdw m26, m31
packssdw m30, m27
mova [rsp+%%m1*mmsize], m26
mova [rsp+%%m4*mmsize], m30
IDCT64_PASS1_END %%c2, %%r3, %%m2, %%m7, 24, 25, 26, 27 ; out7/24/39/56
IDCT64_PASS1_END %%c3, %%r2, %%m3, %%m6, 28, 29, 30, 31 ; out8/23/40/55
mova [r3+%%m5*mmsize], m26
mova [r3+%%m8*mmsize], m30
IDCT64_PASS1_END %%c2, %%r3, %%m2, %%m7, 24, 25, 26, 27, cq ; out7/24/39/56
IDCT64_PASS1_END %%c2, %%r2, %%m3, %%m6, 28, 29, 30, 31, r4 ; out8/23/40/55
packssdw m %+ %%r2, m24, m29
packssdw m %+ %%r3, m28, m25
packssdw m26, m31
packssdw m30, m27
mova [rsp+%%m2*mmsize], m26
mova [rsp+%%m3*mmsize], m30
mova [r3+%%m6*mmsize], m26
mova [r3+%%m7*mmsize], m30
%endmacro
vpbroadcastd m11, [o(pd_2)]
lea r5, [o_base_8bpc]
IDCT64_PASS1_ENDx4 0
IDCT64_PASS1_ENDx4 1
IDCT64_PASS1_ENDx4 2
IDCT64_PASS1_ENDx4 3
ret
.idct64_end:
vpbroadcastd m11, [o(pd_2)]
lea r4, [cq+64]
mov r3, rsp
lea r5, [o_base_8bpc]
call .idct64_main_end
pxor m12, m12
.zero_loop:
@@ -4975,27 +4998,27 @@ ALIGN function_align
lea r3, [strideq*3]
mov r4, dstq
call .pass2
mova m0, [rsp+ 0*mmsize]
mova m1, [rsp+ 1*mmsize]
mova m2, [rsp+ 2*mmsize]
mova m3, [rsp+ 3*mmsize]
mova m4, [rsp+ 4*mmsize]
mova m5, [rsp+ 5*mmsize]
mova m6, [rsp+ 6*mmsize]
mova m7, [rsp+ 7*mmsize]
mova m16, [rsp+ 8*mmsize]
mova m17, [rsp+ 9*mmsize]
mova m18, [rsp+10*mmsize]
mova m19, [rsp+11*mmsize]
mova m20, [rsp+12*mmsize]
mova m21, [rsp+13*mmsize]
mova m22, [rsp+14*mmsize]
mova m23, [rsp+15*mmsize]
mova m0, [rsp+16*mmsize]
mova m1, [rsp+17*mmsize]
mova m2, [rsp+18*mmsize]
mova m3, [rsp+19*mmsize]
mova m4, [rsp+20*mmsize]
mova m5, [rsp+21*mmsize]
mova m6, [rsp+22*mmsize]
mova m7, [rsp+23*mmsize]
mova m16, [rsp+24*mmsize]
mova m17, [rsp+25*mmsize]
mova m18, [rsp+26*mmsize]
mova m19, [rsp+27*mmsize]
mova m20, [rsp+28*mmsize]
mova m21, [rsp+29*mmsize]
mova m22, [rsp+30*mmsize]
mova m23, [rsp+31*mmsize]
lea dstq, [r4+64]
call .pass2
RET
.pass2:
psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11
psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11
psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15
call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
@@ -5146,6 +5169,9 @@ ALIGN function_align
packssdw m27, m13
packssdw m28, m10
ret
.main_oddhalf_packed_rect2:
REPX {paddd x, m13}, m0, m1
REPX {psrad x, 12 }, m0, m1
.main_oddhalf_packed:
; m0=in1 in5, m1=in7 in3
vbroadcasti32x4 m2, [o(pd_101_501)]
@@ -5290,4 +5316,357 @@ ALIGN function_align
; m4-7=t40-47[a], m16-19=t48-55[a]
ret
cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
lea r5, [o_base]
test eobd, eobd
jz .dconly
PROLOGUE 4, 8, 32, -64*32, dst, stride, c, eob
%undef cmp
vpbroadcastd m12, [o(pd_2896)]
vpbroadcastd m13, [o(pd_2048)]
vpbroadcastd m14, [o(clip_18b_min)]
vpbroadcastd m15, [o(clip_18b_max)]
cmp eobd, 136
jl .fast
add cq, 64
cmp eobd, 543
jge .full
call .pass1_fast ; bottomright 16x16 zero
mov r7d, 16*12
jmp .lefthalf
.full:
call .pass1
mov r7d, 16*28
.lefthalf:
mova [cq+128* 0], m0
mova [cq+128* 1], m1
mova [cq+128* 2], m2
mova [cq+128* 3], m3
mova [cq+128* 4], m14
mova [cq+128* 5], m15
mova [cq+128* 6], m16
mova [cq+128* 7], m17
mova [cq+128* 8], m22
mova [cq+128* 9], m23
mova [cq+128*10], m24
mova [cq+128*11], m25
mova [cq+128*12], m26
mova [cq+128*13], m27
mova [cq+128*14], m28
mova [cq+128*15], m29
sub cq, 64
vpbroadcastd m12, [o(pd_2896)]
vpbroadcastd m13, [o(pd_2048)]
vpbroadcastd m14, [o(clip_18b_min)]
vpbroadcastd m15, [o(clip_18b_max)]
sub rsp, 16*64
call .pass1
add rsp, 16*64
lea r5, [o_base_8bpc]
call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start
mov r4, dstq
pxor m12, m12
call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
lea dstq, [r4+64]
mova m0, [rsp+16*mmsize]
mova m1, [rsp+17*mmsize]
mova m2, [rsp+18*mmsize]
mova m3, [rsp+19*mmsize]
mova m4, [rsp+20*mmsize]
mova m5, [rsp+21*mmsize]
mova m6, [rsp+22*mmsize]
mova m7, [rsp+23*mmsize]
mova m16, [rsp+24*mmsize]
mova m17, [rsp+25*mmsize]
mova m18, [rsp+26*mmsize]
mova m19, [rsp+27*mmsize]
mova m20, [rsp+28*mmsize]
mova m21, [rsp+29*mmsize]
mova m22, [rsp+30*mmsize]
mova m23, [rsp+31*mmsize]
call .transpose
mova [cq+128* 0+64], m0
mova [cq+128* 1+64], m1
mova [cq+128* 2+64], m2
mova [cq+128* 3+64], m3
mova [cq+128* 4+64], m14
mova [cq+128* 5+64], m15
mova [cq+128* 6+64], m16
mova [cq+128* 7+64], m17
mova [cq+128* 8+64], m22
mova [cq+128* 9+64], m23
mova [cq+128*10+64], m24
mova [cq+128*11+64], m25
mova [cq+128*12+64], m26
mova [cq+128*13+64], m27
mova [cq+128*14+64], m28
mova [cq+128*15+64], m29
mova m0, [rsp+ 0*mmsize]
mova m1, [rsp+ 1*mmsize]
mova m2, [rsp+ 2*mmsize]
mova m3, [rsp+ 3*mmsize]
mova m4, [rsp+ 4*mmsize]
mova m5, [rsp+ 5*mmsize]
mova m6, [rsp+ 6*mmsize]
mova m7, [rsp+ 7*mmsize]
mova m16, [rsp+ 8*mmsize]
mova m17, [rsp+ 9*mmsize]
mova m18, [rsp+10*mmsize]
mova m19, [rsp+11*mmsize]
mova m20, [rsp+12*mmsize]
mova m21, [rsp+13*mmsize]
mova m22, [rsp+14*mmsize]
mova m23, [rsp+15*mmsize]
call .transpose
call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start
pxor m12, m12
.right_zero_loop:
mova [cq+r7*8+64+128*3], m12
mova [cq+r7*8+64+128*2], m12
mova [cq+r7*8+64+128*1], m12
mova [cq+r7*8+64+128*0], m12
sub r7d, 16*4
jge .right_zero_loop
mov r7d, 16*28
jmp .end
.fast: ; topleft 16x16 nonzero
cmp eobd, 36
jl .fast2
call .pass1_fast
lea r5, [o_base_8bpc]
call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start
mov r4, dstq
pxor m12, m12
call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
lea dstq, [r4+64]
mova m0, [rsp+16*mmsize]
mova m1, [rsp+17*mmsize]
mova m2, [rsp+18*mmsize]
mova m3, [rsp+19*mmsize]
mova m4, [rsp+20*mmsize]
mova m5, [rsp+21*mmsize]
mova m6, [rsp+22*mmsize]
mova m7, [rsp+23*mmsize]
mova m16, [rsp+24*mmsize]
mova m17, [rsp+25*mmsize]
mova m18, [rsp+26*mmsize]
mova m19, [rsp+27*mmsize]
mova m20, [rsp+28*mmsize]
mova m21, [rsp+29*mmsize]
mova m22, [rsp+30*mmsize]
mova m23, [rsp+31*mmsize]
call .transpose
call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start
mov r7d, 16*12
pxor m12, m12
jmp .end
.fast2: ; topleft 8x8 nonzero
movshdup m7, [o(permB)]
mova ym0, [cq+128*1]
mova ym2, [cq+128*5]
mova ym3, [cq+128*3]
mova ym1, [cq+128*7]
vpermt2q m0, m7, m2 ; 1 5
vpermt2q m1, m7, m3 ; 7 3
REPX {pmulld x, m12}, m0, m1
call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed_rect2
mova [rsp+ 0*mmsize], m0
mova [rsp+ 1*mmsize], m1
mova [rsp+ 2*mmsize], m2
mova [rsp+ 3*mmsize], m3
mova [rsp+ 4*mmsize], m4
mova [rsp+ 5*mmsize], m5
mova [rsp+ 6*mmsize], m6
mova [rsp+ 7*mmsize], m7
mova [rsp+ 8*mmsize], m16
mova [rsp+ 9*mmsize], m17
mova [rsp+10*mmsize], m18
mova [rsp+11*mmsize], m19
mova [rsp+12*mmsize], m20
mova [rsp+13*mmsize], m21
mova [rsp+14*mmsize], m22
mova [rsp+15*mmsize], m23
movshdup m7, [o(permB)]
pmulld ym0, ym12, [cq+128*0]
pmulld ym4, ym12, [cq+128*4]
mova ym16, [cq+128*2]
mova ym5, [cq+128*6]
REPX {paddd x, ym13}, ym0, ym4
REPX {psrad x, 12 }, ym0, ym4
vpermt2q m16, m7, m5 ; 2 6
vpermq m0, m7, m0 ; 0 0
vpermq m4, m7, m4 ; 4 4
pmulld m16, m12
paddd m16, m13
psrad m16, 12
call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
vpbroadcastd m11, [o(pd_1)]
call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
mova [rsp+16*mmsize], m24
mova [rsp+17*mmsize], m25
mova [rsp+18*mmsize], m26
mova [rsp+19*mmsize], m27
mova [rsp+20*mmsize], m28
mova [rsp+21*mmsize], m29
mova [rsp+22*mmsize], m30
mova [rsp+23*mmsize], m31
vpbroadcastd m13, [o(pd_2048)]
call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start
mov r7d, 16*4
mov r4, dstq
pxor m12, m12
call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
lea dstq, [r4+64]
mova m0, [rsp+16*mmsize]
mova m1, [rsp+17*mmsize]
mova m2, [rsp+18*mmsize]
mova m3, [rsp+19*mmsize]
mova m4, [rsp+20*mmsize]
mova m5, [rsp+21*mmsize]
mova m6, [rsp+22*mmsize]
mova m7, [rsp+23*mmsize]
lea r5, [o_base]
vpbroadcastd m13, [o(pd_2048)]
call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start
pxor m12, m12
.end:
call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
.zero_loop:
mova [cq+r7*8+128*3], m12
mova [cq+r7*8+128*2], m12
mova [cq+r7*8+128*1], m12
mova [cq+r7*8+128*0], m12
sub r7d, 16*4
jge .zero_loop
RET
.dconly:
imul r6d, [cq], 181
mov [cq], eobd
or r3d, 32
add r6d, 128
sar r6d, 8
imul r6d, 181
add r6d, 384
sar r6d, 9
jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
.pass1_fast:
lea r4, [idct64_mul_16bpc]
lea r6, [rsp+4*64+gprsize]
pmulld m0, m12, [cq+128* 1]
pmulld m3, m12, [cq+128*15]
call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
pmulld m0, m12, [cq+128* 7]
pmulld m3, m12, [cq+128* 9]
call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
pmulld m0, m12, [cq+128* 5]
pmulld m3, m12, [cq+128*11]
call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
pmulld m0, m12, [cq+128* 3]
pmulld m3, m12, [cq+128*13]
call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
pmulld m0, m12, [cq+128* 0]
pmulld m1, m12, [cq+128* 8]
pmulld m16, m12, [cq+128* 4]
pmulld m17, m12, [cq+128*12]
call m(idct_8x16_internal_10bpc).main_fast2_rect2
call m(idct_16x16_internal_10bpc).main_fast2_rect2
call .pass1_load_spill
call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2_rect2
jmp .pass1_end
.pass1:
lea r4, [idct64_mul_16bpc]
lea r6, [rsp+4*64+gprsize]
pmulld m0, m12, [cq+128* 1]
pmulld m1, m12, [cq+128*31]
pmulld m2, m12, [cq+128*17]
pmulld m3, m12, [cq+128*15]
call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
pmulld m0, m12, [cq+128* 7]
pmulld m1, m12, [cq+128*25]
pmulld m2, m12, [cq+128*23]
pmulld m3, m12, [cq+128* 9]
call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
pmulld m0, m12, [cq+128* 5]
pmulld m1, m12, [cq+128*27]
pmulld m2, m12, [cq+128*21]
pmulld m3, m12, [cq+128*11]
call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
pmulld m0, m12, [cq+128* 3]
pmulld m1, m12, [cq+128*29]
pmulld m2, m12, [cq+128*19]
pmulld m3, m12, [cq+128*13]
call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
pmulld m0, m12, [cq+128* 0]
pmulld m1, m12, [cq+128* 8]
pmulld m2, m12, [cq+128*16]
pmulld m3, m12, [cq+128*24]
pmulld m16, m12, [cq+128* 4]
pmulld m17, m12, [cq+128*12]
pmulld m18, m12, [cq+128*20]
pmulld m19, m12, [cq+128*28]
call m(idct_8x16_internal_10bpc).main_fast_rect2
call m(idct_16x16_internal_10bpc).main_fast_rect2
call .pass1_load_spill
pmulld m4, m12, [cq+128*18]
pmulld m5, m12, [cq+128*22]
pmulld m6, m12, [cq+128*26]
pmulld m7, m12, [cq+128*30]
call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2
.pass1_end:
vpbroadcastd m11, [o(pd_1)]
lea r3, [rsp+gprsize]
lea r4, [cq+8*128]
call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end
; transpose one half immediately, we can transpose lower half later
.transpose:
; transpose m0-7,16-23
psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11
psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15
call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
punpckhqdq m22, m0, m20 ; 1
punpcklqdq m0, m20 ; 0
punpckhqdq m24, m2, m1 ; 5
punpcklqdq m1, m2, m1 ; 4
punpcklqdq m2, m14, m18 ; 8
punpckhqdq m26, m14, m18 ; 9
punpcklqdq m14, m15, m4 ; 2
punpckhqdq m23, m15, m4 ; 3
punpckhqdq m25, m3, m21 ; 7
punpcklqdq m15, m3, m21 ; 6
punpckhqdq m28, m6, m17 ; 13
punpcklqdq m3, m6, m17 ; 12
punpckhqdq m27, m5, m16 ; 11
punpcklqdq m16, m5, m16 ; 10
punpckhqdq m29, m7, m8 ; 15
punpcklqdq m17, m7, m8 ; 14
ret
.pass1_load_spill:
call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
mova [cq+128* 0], m0
mova [cq+128* 1], m1
pmulld m0, m12, [cq+128* 2]
pmulld m1, m12, [cq+128* 6]
mova [cq+128* 2], m2
mova [cq+128* 3], m3
pmulld m2, m12, [cq+128*10]
pmulld m3, m12, [cq+128*14]
mova [cq+128* 4], m4
mova [cq+128* 5], m5
mova [cq+128* 6], m6
mova [cq+128* 7], m7
mova [cq+128* 8], m23
mova [cq+128* 9], m22
mova [cq+128*10], m21
mova [cq+128*11], m20
mova [cq+128*12], m19
mova [cq+128*13], m18
mova [cq+128*14], m17
mova [cq+128*15], m16
ret
%endif ; ARCH_X86_64