x86: add AVX512-IceLake implementation of HBD 64x16 DCT^2

inv_txfm_add_64x16_dct_dct_0_10bpc_c:            892.0 ( 1.00x)
inv_txfm_add_64x16_dct_dct_0_10bpc_sse4:         131.5 ( 6.78x)
inv_txfm_add_64x16_dct_dct_0_10bpc_avx2:          63.4 (14.07x)
inv_txfm_add_64x16_dct_dct_0_10bpc_avx512icl:     56.8 (15.71x)
inv_txfm_add_64x16_dct_dct_1_10bpc_c:          29253.7 ( 1.00x)
inv_txfm_add_64x16_dct_dct_1_10bpc_sse4:        1639.7 (17.84x)
inv_txfm_add_64x16_dct_dct_1_10bpc_avx2:        1106.8 (26.43x)
inv_txfm_add_64x16_dct_dct_1_10bpc_avx512icl:    532.9 (54.89x)
inv_txfm_add_64x16_dct_dct_2_10bpc_c:          29249.8 ( 1.00x)
inv_txfm_add_64x16_dct_dct_2_10bpc_sse4:        3065.6 ( 9.54x)
inv_txfm_add_64x16_dct_dct_2_10bpc_avx2:        1791.0 (16.33x)
inv_txfm_add_64x16_dct_dct_2_10bpc_avx512icl:   1108.0 (26.40x)
inv_txfm_add_64x16_dct_dct_3_10bpc_c:          29269.1 ( 1.00x)
inv_txfm_add_64x16_dct_dct_3_10bpc_sse4:        3738.2 ( 7.83x)
inv_txfm_add_64x16_dct_dct_3_10bpc_avx2:        1790.9 (16.34x)
inv_txfm_add_64x16_dct_dct_3_10bpc_avx512icl:   1203.8 (24.31x)
inv_txfm_add_64x16_dct_dct_4_10bpc_c:          29337.7 ( 1.00x)
inv_txfm_add_64x16_dct_dct_4_10bpc_sse4:        3749.7 ( 7.82x)
inv_txfm_add_64x16_dct_dct_4_10bpc_avx2:        1791.0 (16.38x)
inv_txfm_add_64x16_dct_dct_4_10bpc_avx512icl:   1203.8 (24.37x)
This commit is contained in:
Ronald S. Bultje
2023-04-13 10:36:38 -04:00
parent 6ae5766724
commit 0b809a9281
3 changed files with 812 additions and 17 deletions
+1
View File
@@ -358,6 +358,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl);
assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl);
assign_itx1_bpc_fn (R, 32, 64, 10, avx512icl);
assign_itx1_bpc_fn (R, 64, 16, 10, avx512icl);
}
#endif
#endif
+1 -1
View File
@@ -98,7 +98,7 @@ clip_18b_max: dd 0x1ffff
clip_20b_min: dd -0x80000
clip_20b_max: dd 0x7ffff
idct64_mul_16bpc:
const idct64_mul_16bpc
dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
+810 -16
View File
@@ -75,7 +75,7 @@ pw_2048_m2048: times 16 dw 2048
pw_m2048_2048: times 16 dw -2048
pw_2048: times 16 dw 2048
; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-
; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-, 4=--
%macro COEF_PAIR 2-3 0 ; a, b, flags
%if %3 == 1
pd_%1_m%2: dd %1, %1, -%2, -%2
@@ -85,6 +85,10 @@ pd_%1_m%2: dd %1, %1, -%2, -%2
pd_m%1_%2: dd -%1, -%1, %2, %2
%define pd_m%1 (pd_m%1_%2 + 4*0)
%define pd_%2 (pd_m%1_%2 + 4*2)
%elif %3 == 4
pd_m%1_m%2: dd -%1, -%1, -%2, -%2
%define pd_m%1 (pd_m%1_m%2 + 4*0)
%define pd_m%2 (pd_m%1_m%2 + 4*2)
%else
pd_%1_%2: dd %1, %1, %2, %2
%define pd_%1 (pd_%1_%2 + 4*0)
@@ -96,10 +100,14 @@ dd -%2, -%2
%endif
%endmacro
COEF_PAIR 101, 501
COEF_PAIR 201, 601, 1
COEF_PAIR 201, 995
COEF_PAIR 401, 1189, 1
COEF_PAIR 401, 1931
COEF_PAIR 401, 3920
COEF_PAIR 401, 4076
COEF_PAIR 700, 301, 4
COEF_PAIR 799, 2276, 1
COEF_PAIR 799, 3406
COEF_PAIR 799, 4017
@@ -119,10 +127,13 @@ COEF_PAIR 3703, 3290
COEF_PAIR 3857, 4052
COEF_PAIR 4017, 2276
COEF_PAIR 4017, 3406
COEF_PAIR 4036, 4085
COEF_PAIR 4076, 1189
COEF_PAIR 4076, 3612
COEF_PAIR 4076, 3920
COEF_PAIR 4091, 3973
COEF_PAIR 4091, 4052
COEF_PAIR 4095, 4065
pb_32: times 4 db 32
pw_5: times 2 dw 5
@@ -146,6 +157,7 @@ pd_5793: dd 5793
cextern dup16_perm
cextern int8_permA
cextern idct64_mul_16bpc
cextern idct_8x8_internal_8bpc_avx512icl.main
cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2
cextern idct_8x16_internal_8bpc_avx512icl.main
@@ -679,6 +691,20 @@ ALIGN function_align
REPX {psrad x, 12 }, m4, m5, m6, m7
ret
ALIGN function_align
; Reduced variant that consumes only m0 and m1 as inputs.
; Expects m12 (multiplier) and m13 (rounding addend) to be preloaded by the
; caller; the 64x16 caller in this file loads them from pd_2896 and pd_2048.
; Results are left in m0-m6 and m8 (m1-m3 are copies of m0).
.main_fast2:
pmulld m0, m12
pmulld m6, m1, [o(pd_4017)] {1to16} ; t7a
pmulld m8, m1, [o(pd_799)] {1to16} ; t4a
REPX {paddd x, m13}, m0, m6, m8
REPX {psrad x, 12 }, m0, m6, m8
; m5/m4 = ((t7a +/- t4a) * m12 + m13) >> 12; m1 is reused as a temporary here
pmulld m5, m6, m12
pmulld m1, m8, m12
paddd m5, m13
psubd m4, m5, m1
paddd m5, m1
REPX {psrad x, 12 }, m4, m5
; the scaled m0 value is replicated into m1-m3
REPX {mova x, m0 }, m1, m2, m3
ret
.main_fast_rect2:
REPX {paddd x, m13}, m0, m1, m2, m3
REPX {psrad x, 12 }, m0, m1, m2, m3
@@ -1557,6 +1583,20 @@ cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
psrlq m9, m8, 8
jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2
ALIGN function_align
; Reduced variant that consumes only m16 and m17 as inputs, then joins the
; shared tail at .main3. Expects m13 to hold the rounding addend.
.main_fast2:
pmulld m22, m16, [o(pd_4076)] {1to16} ; t15a
pmulld m9, m16, [o(pd_401)] {1to16} ; t8a
pmulld m18, m17, [o(pd_1189)] {1to16} ; t11a
pmulld m17, [o(pd_3920)] {1to16} ; t12a
; rounding minus product: negates the t11a product and adds the rounding
; term in a single instruction
psubd m18, m13, m18
REPX {paddd x, m13}, m22, m9, m17
REPX {psrad x, 12 }, m18, m22, m9, m17
; duplicate the four values into the registers .main3 reads as their partners
mova m20, m9
mova m16, m18
mova m23, m22
mova m19, m17
jmp .main3
.main_fast_rect2:
REPX {paddd x, m13}, m16, m17, m18, m19
REPX {psrad x, 12 }, m16, m17, m18, m19
@@ -1590,14 +1630,15 @@ ALIGN function_align
psubd m23, m19 ; t14
psubd m19, m17, m21 ; t13
paddd m17, m21 ; t12
vpbroadcastd m11, [o(pd_3784)]
REPX {pmaxsd x, m14}, m20, m23, m16, m19
vpbroadcastd m10, [o(pd_1567)]
REPX {pminsd x, m15}, m20, m23, m16, m19
ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11
ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2
REPX {pmaxsd x, m14}, m9, m18, m22, m17
REPX {pminsd x, m15}, m9, m18, m22, m17
.main3:
vpbroadcastd m11, [o(pd_3784)]
vpbroadcastd m10, [o(pd_1567)]
ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11
ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2
paddd m21, m20, m19 ; t14
psubd m20, m19 ; t13
psubd m19, m9, m18 ; t11a
@@ -2441,6 +2482,80 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
sar r6d, 10
jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
ALIGN function_align
; Packed reduced variant: works on the input layout described in the first
; comment below and joins the shared tail at .main4.
; Expects m12 (multiplier), m13 (rounding addend) and m14/m15 (clamp min/max)
; to be preloaded by the caller.
.main_fast3:
; assuming m0=in0 in0, m4=in2 in2, and m16=in1 in3
vbroadcasti32x4 m5, [o(pd_401_4076)]
pmulld m3, m0, m12
pmulld m4, m5
REPX {paddd x, m13}, m3, m4
REPX {psrad x, 12 }, m3, m4 ; m3=idct8:t0-7, m4=t8a t15a
; t8a t15a -> t8/9 t14/15
vbroadcasti32x4 m5, [o(pd_3784_m3784)]
pshufd m7, m4, q1032
pmulld m6, m4, [o(pd_1567)]{bcstd}
pmulld m5, m7
paddd m6, m13
paddd m5, m6
psrad m5, 12 ; m5=t9a t14a
; t14a t9a -> t13/14 t9/10 [m5] & t8 15 -> t8/11a t12/15a [m4]
shufps m6, m4, m5, q1032 ; t12 t13
shufps m8, m4, m5, q3210 ; t11a t10
pmulld m9, m6, m12
pmulld m7, m8, m12
paddd m9, m13
paddd m5, m9, m7 ; t12 t13a
psubd m4, m9, m7 ; t11 t10a
REPX {psrad x, 12 }, m5, m4
; every dct16 output below is m3 plus or minus one of m6/m5/m4/m8
psubd m7, m3, m6 ; dct16 out15 out14
paddd m0, m3, m6 ; dct16 out0 out1
psubd m6, m3, m5 ; dct16 out12 out13
paddd m1, m3, m5 ; dct16 out3 out2
psubd m5, m3, m4 ; dct16 out11 out10
paddd m2, m3, m4 ; dct16 out4 out5
psubd m4, m3, m8 ; dct16 out8 out9
paddd m3, m8 ; dct16 out7 out6
REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
; idct32_bottomhalf
vbroadcasti32x4 m18, [o(pd_201_m601)]
vbroadcasti32x4 m19, [o(pd_4091_4052)]
pmulld m17, m16, m19
pmulld m16, m18
REPX {paddd x, m13}, m17, m16
REPX {psrad x, 12 }, m17, m16
; m17: t31a t24a -> t30/31 t24/25, m16: t16a t23a -> t16/17 t22/23 [step2]
vbroadcasti32x4 m10, [o(pd_799_m2276)]
vbroadcasti32x4 m11, [o(pd_4017_3406)]
pmulld m18, m17, m10
pmulld m19, m17, m11
pmulld m8, m16, m11
pmulld m9, m16, m10
REPX {paddd x, m13}, m18, m19
psubd m18, m8
paddd m19, m9
REPX {psrad x, 12 }, m18, m19
; m17=t31 t24 -> t28/31a t24/27a, m16=t16 t23 -> t16/19a t20/23a
; m18=t17a t22a -> t17/18 t21/22, m19=t30a t25a -> t29/30 t25/26
punpckhqdq m23, m17, m19 ; t24a t25 [or t27a t26]
punpcklqdq m20, m16, m18 ; t16a t17 [or t19a t18]
punpckhqdq m22, m16, m18 ; t23a t22 [or t20a t21]
punpcklqdq m16, m17, m19 ; t28a t29 [or t31a t30]
; copy each value into the register .main4 reads as its partner
mova m21, m23
mova m18, m20
mova m17, m22
mova m19, m16
jmp .main4
.main_fast2: ; bottom three-quarters are zero
vbroadcasti32x4 m8, [o(pd_799_4017)]
pmulld m8, m1 ; t4 t7
@@ -2541,8 +2656,6 @@ ALIGN function_align
punpckhqdq m23, m9 ; t27 t26a
punpckhqdq m9, m17, m18 ; t24 t25a
punpcklqdq m17, m18 ; t28 t29a
vpbroadcastd m11, [o(pd_3784)]
vpbroadcastd m10, [o(pd_1567)]
psubd m18, m16, m20 ; t19a t18
paddd m20, m16 ; t16a t17
psubd m16, m19, m17 ; t28a t29
@@ -2553,10 +2666,13 @@ ALIGN function_align
paddd m23, m9 ; t24a t25
REPX {pmaxsd x, m14}, m18, m16, m17, m21
REPX {pminsd x, m15}, m16, m18, m21, m17
ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11
ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2
REPX {pmaxsd x, m14}, m20, m22, m19, m23
REPX {pminsd x, m15}, m20, m22, m19, m23
.main4:
vpbroadcastd m11, [o(pd_3784)]
vpbroadcastd m10, [o(pd_1567)]
ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11
ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2
paddd m9, m20, m22 ; t16 t17a
psubd m20, m22 ; t23 t22a
paddd m22, m19, m23 ; t31 t30a
@@ -3174,21 +3290,22 @@ ALIGN function_align
%endmacro
IDCT32_PASS1_END 0, 23 ; 0 16, 15 31
IDCT32_PASS1_END 7, 16 ; 7 23, 8 24
IDCT32_PASS1_END 1, 22 ; 1 17, 14 30
IDCT32_PASS1_END 6, 17 ; 6 22, 9 25
IDCT32_PASS1_END 2, 21 ; 2 18, 13 29
IDCT32_PASS1_END 5, 18 ; 5 21, 10 26
IDCT32_PASS1_END 3, 20 ; 3 19, 12 28
IDCT32_PASS1_END 4, 19 ; 4 20, 11 27
.transpose_16x32:
mova m14, m13
vpermi2q m14, m0, m16
vpermt2q m0, m12, m16
IDCT32_PASS1_END 1, 22 ; 1 17, 14 30
IDCT32_PASS1_END 6, 17 ; 6 22, 9 25
mova m15, m13
vpermi2q m15, m1, m17
vpermt2q m1, m12, m17
IDCT32_PASS1_END 2, 21 ; 2 18, 13 29
IDCT32_PASS1_END 5, 18 ; 5 21, 10 26
mova m16, m13
vpermi2q m16, m2, m18
vpermt2q m2, m12, m18
IDCT32_PASS1_END 3, 20 ; 3 19, 12 28
IDCT32_PASS1_END 4, 19 ; 4 20, 11 27
mova m17, m13
vpermi2q m17, m3, m19
vpermt2q m3, m12, m19
@@ -3263,6 +3380,27 @@ ALIGN function_align
mova [cq+64*13], m17
mova [cq+64*15], m16
ret
; Reduced variant that consumes only m0-m3 as inputs, then joins the shared
; tail at .main3. Expects m13 to hold the rounding addend.
.main_fast2: ; bottom half is zero
pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a
pmulld m0, [o(pd_201)] {1to16} ; t16a
pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a
pmulld m3, [o(pd_3857)] {1to16} ; t28a
pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a
pmulld m2, [o(pd_995)] {1to16} ; t20a
pmulld m6, m1, [o(pd_601)] {1to16} ; t23a
pmulld m17, m1, [o(pd_4052)] {1to16} ; t24a
; rounding minus product: negates the t19a/t23a products while adding the
; rounding term
REPX {psubd x, m13, x}, m20, m6
REPX {paddd x, m13}, m23, m0, m3, m21, m2, m17
REPX {psrad x, 12 }, m20, m6, m23, m0, m3, m21, m2, m17
; duplicate each value into the register .main3 reads as its partner
mova m8, m0
mova m16, m23
mova m7, m20
mova m4, m3
mova m19, m2
mova m18, m21
mova m5, m6
mova m22, m17
jmp .main3
.main_fast_rect2:
call m(idct_8x16_internal_10bpc).round
.main_fast: ; bottom half is zero
@@ -3323,9 +3461,10 @@ ALIGN function_align
psubd m22, m1, m17 ; t25
paddd m17, m1 ; t24
REPX {pmaxsd x, m14}, m5, m6, m22, m17
REPX {pminsd x, m15}, m5, m6, m22, m17
.main3:
vpbroadcastd m11, [o(pd_4017)]
vpbroadcastd m10, [o(pd_799)]
REPX {pminsd x, m15}, m5, m6, m22, m17
ITX_MULSUB_2D 16, 8, 9, 1, _, 13, 10, 11 ; t17a, t30a
ITX_MULSUB_2D 3, 20, 9, 1, _, 13, 10, 11, 2 ; t29a, t18a
vpbroadcastd m11, [o(pd_2276)]
@@ -4496,4 +4635,659 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
mova [cq+128*15], m16
ret
; 64x16 DCT_DCT inverse transform + add for 10 bpc (AVX-512).
; Arguments (cglobal names): dst, stride, c (coefficient buffer, "cq"), eob.
; Dispatch on eob:
;   eob == 0   -> .dconly
;   eob <  36  -> .fast
;   eob >= 151 -> .full
;   otherwise  -> the code directly below, which reads only the coefficient
;                 vectors cq+64*0 .. cq+64*15
cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
; NOTE(review): r5 is presumably the base register that o() addresses
; against; o() and o_base are defined outside this view.
lea r5, [o_base]
test eobd, eobd
jz .dconly
; The function is declared above with 0 vector registers; PROLOGUE is issued
; again here (after the DC-only early-out) asking for 32 vector registers and
; 64*32 bytes of stack scratch. Exact x86inc semantics of the negative stack
; size are not visible here.
PROLOGUE 4, 7, 32, -64*32, dst, stride, c, eob
%undef cmp
vpbroadcastd m12, [o(pd_2896)]
vpbroadcastd m13, [o(pd_2048)]
vpbroadcastd m14, [o(clip_18b_min)]
vpbroadcastd m15, [o(clip_18b_max)]
cmp eobd, 36
jl .fast ; 8x8
cmp eobd, 151
jge .full ; 16x16
; r4 = multiplier table walked by .main_part1*, r6 = stack scratch cursor;
; both are advanced by each .main_part1_fast call.
lea r4, [idct64_mul_16bpc]
lea r6, [rsp+4*64]
; four odd-input pairs go through the reduced part 1 (only m0 and m3 are used)
mova m0, [cq+64* 1]
mova m3, [cq+64*15]
call .main_part1_fast
mova m0, [cq+64* 7]
mova m3, [cq+64* 9]
call .main_part1_fast
mova m0, [cq+64* 5]
mova m3, [cq+64*11]
call .main_part1_fast
mova m0, [cq+64* 3]
mova m3, [cq+64*13]
call .main_part1_fast
call .main_part2
; even inputs: 0/8 feed the 8x16 helper, 4/12 feed the 16x16 helper
mova m0, [cq+64* 0]
mova m1, [cq+64* 8]
mova m16, [cq+64* 4]
mova m17, [cq+64*12]
call m(idct_8x16_internal_10bpc).main_fast2
call m(idct_16x16_internal_10bpc).main_fast2
call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
; spills m0-m7/m16-m23 into cq and loads inputs 2/6/10/14 into m0-m3
call .pass1_load_spill
call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2
; r6d is the starting index for .zero_loop, which stores at cq+r6*8, so
; 12*8 starts clearing at cq+64*12 and works downwards
mov r6d, 12*8
jmp .idct64_end
; eob >= 151 path: all 32 coefficient vectors cq+64*0 .. cq+64*31 are read.
.full:
; r4 = multiplier table walked by .main_part1, r6 = stack scratch cursor;
; both are advanced by each .main_part1 call.
lea r4, [idct64_mul_16bpc]
lea r6, [rsp+4*64]
; four groups of four odd inputs go through the full part 1
mova m0, [cq+64* 1]
mova m1, [cq+64*31]
mova m2, [cq+64*17]
mova m3, [cq+64*15]
call .main_part1
mova m0, [cq+64* 7]
mova m1, [cq+64*25]
mova m2, [cq+64*23]
mova m3, [cq+64* 9]
call .main_part1
mova m0, [cq+64* 5]
mova m1, [cq+64*27]
mova m2, [cq+64*21]
mova m3, [cq+64*11]
call .main_part1
mova m0, [cq+64* 3]
mova m1, [cq+64*29]
mova m2, [cq+64*19]
mova m3, [cq+64*13]
call .main_part1
call .main_part2
; even inputs: 0/8/16/24 feed the 8x16 helper, 4/12/20/28 the 16x16 helper
mova m0, [cq+64* 0]
mova m1, [cq+64* 8]
mova m2, [cq+64*16]
mova m3, [cq+64*24]
mova m16, [cq+64* 4]
mova m17, [cq+64*12]
mova m18, [cq+64*20]
mova m19, [cq+64*28]
call m(idct_8x16_internal_10bpc).main_fast
call m(idct_16x16_internal_10bpc).main_fast
call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
; spills m0-m7/m16-m23 into cq and loads inputs 2/6/10/14 into m0-m3
call .pass1_load_spill
; remaining inputs 18/22/26/30 for the 32x16 helper
mova m4, [cq+64*18]
mova m5, [cq+64*22]
mova m6, [cq+64*26]
mova m7, [cq+64*30]
call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
; r6d is the starting index for .zero_loop, which stores at cq+r6*8, so
; 28*8 starts clearing at cq+64*28 and works downwards
mov r6d, 28*8
jmp .idct64_end
; DC-only path, reached when eob == 0. A single value derived from the first
; coefficient is added to two 64-byte vectors of every destination row.
.dconly:
imul r6d, [cq], 181
; eobd is known to be zero here (this path is taken on jz after test
; eobd, eobd), so this store clears the first coefficient
mov [cq], eobd
; r3d becomes the row counter for .dconly_loop; under x86inc argument
; numbering it aliases eobd (== 0), so the result is 16
or r3d, 16
; r6d = (dc * 181 + 640) >> 10
add r6d, 640
sar r6d, 10
.dconly2:
vpbroadcastd m3, [o(dconly_10bpc)]
; r6d = (r6d * 181 + 2176) >> 12
imul r6d, 181
add r6d, 2176
sar r6d, 12
vpbroadcastw m2, r6d
; NOTE(review): the paddsw bias followed by psubusw with the same constant
; presumably clamps the result to the valid pixel range; dconly_10bpc is
; defined outside this view, confirm there.
paddsw m2, m3
.dconly_loop:
paddsw m0, m2, [dstq+64*0]
paddsw m1, m2, [dstq+64*1]
psubusw m0, m3
psubusw m1, m3
mova [dstq+64*0], m0
mova [dstq+64*1], m1
add dstq, strideq
dec r3d
jg .dconly_loop
ret
; Spill m0-m7 to cq+64*0..7 and m23..m16 to cq+64*8..15, and fetch the
; coefficient vectors originally stored at cq+64*{2,6,10,14} into m0..m3.
; Each slot that is both read and overwritten is loaded before it is stored
; to, and each register is spilled before it is reloaded.
.pass1_load_spill:
; chain through m0 / m2: slot 0 <- m0, m0 <- slot 2, slot 2 <- m2,
; m2 <- slot 10, slot 10 <- m21
mova [cq+64* 0], m0
mova m0, [cq+64* 2]
mova [cq+64* 2], m2
mova m2, [cq+64*10]
mova [cq+64*10], m21
; chain through m1: slot 1 <- m1, m1 <- slot 6, slot 6 <- m6
mova [cq+64* 1], m1
mova m1, [cq+64* 6]
mova [cq+64* 6], m6
; chain through m3: slot 3 <- m3, m3 <- slot 14, slot 14 <- m17
mova [cq+64* 3], m3
mova m3, [cq+64*14]
mova [cq+64*14], m17
; remaining plain spills (no slot here is read by this routine)
mova [cq+64* 4], m4
mova [cq+64* 5], m5
mova [cq+64* 7], m7
mova [cq+64* 8], m23
mova [cq+64* 9], m22
mova [cq+64*11], m20
mova [cq+64*12], m19
mova [cq+64*13], m18
mova [cq+64*15], m16
ret
ALIGN function_align
; idct64 part 1 (steps 1-5), three entry points sharing the tail at
; .main_part1b:
;   .main_part1_fast  - uses only m0 and m3 as inputs
;   .main_part1_rect2 - rounds/shifts m0-m3 first, then falls into .main_part1
;   .main_part1       - uses m0-m3 as inputs
; In:  r4 = pointer to 12 dword multipliers for this call, r6 = scratch cursor,
;      m13 = rounding addend, m14/m15 = clamp min/max.
; Out: eight vectors stored at [r6-64*4] .. [r6+64*3]; r4 advanced by 4*12
;      bytes and r6 by 64*8 bytes, ready for the next call.
.main_part1_fast:
pmulld m7, m0, [r4+4*0]{bcstd} ; t63a
pmulld m0, [r4+4*1]{bcstd} ; t32a
pmulld m4, m3, [r4+4*6]{bcstd} ; t60a
pmulld m3, [r4+4*7]{bcstd} ; t35a
vpbroadcastd m10, [r4+4*8]
vpbroadcastd m11, [r4+4*9]
REPX {paddd x, m13}, m7, m0, m4, m3
REPX {psrad x, 12 }, m7, m0, m4, m3
; same result as the sum/difference stage of .main_part1 when the products
; derived from m1 and m2 are zero: the differences equal their minuends
mova m8, m0
mova m1, m7
mova m6, m3
mova m2, m4
jmp .main_part1b
.main_part1_rect2:
REPX {paddd x, m13}, m0, m1, m2, m3
REPX {psrad x, 12 }, m0, m1, m2, m3
.main_part1: ; idct64 steps 1-5
; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
pmulld m7, m0, [r4+4*0]{bcstd} ; t63a
pmulld m0, [r4+4*1]{bcstd} ; t32a
pmulld m6, m1, [r4+4*2]{bcstd} ; t62a
pmulld m1, [r4+4*3]{bcstd} ; t33a
pmulld m5, m2, [r4+4*4]{bcstd} ; t61a
pmulld m2, [r4+4*5]{bcstd} ; t34a
pmulld m4, m3, [r4+4*6]{bcstd} ; t60a
pmulld m3, [r4+4*7]{bcstd} ; t35a
; table entries 8 and 9 are the coefficient pair for the first two
; ITX_MULSUB_2D invocations below
vpbroadcastd m10, [r4+4*8]
vpbroadcastd m11, [r4+4*9]
REPX {paddd x, m13}, m7, m0, m6, m1, m5, m2, m4, m3
REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
psubd m8, m0, m1 ; t33
paddd m0, m1 ; t32
psubd m1, m7, m6 ; t62
paddd m7, m6 ; t63
psubd m6, m3, m2 ; t34
paddd m3, m2 ; t35
psubd m2, m4, m5 ; t61
paddd m4, m5 ; t60
.main_part1b:
; clamp to [m14, m15] before each multiply stage
REPX {pmaxsd x, m14}, m8, m1, m6, m2
REPX {pminsd x, m15}, m8, m1, m6, m2
ITX_MULSUB_2D 1, 8, 5, 9, _, 13, 10, 11 ; t33a, t62a
ITX_MULSUB_2D 2, 6, 5, 9, _, 13, 10, 11, 2 ; t61a, t34a
REPX {pmaxsd x, m14}, m0, m3, m7, m4
REPX {pminsd x, m15}, m0, m3, m7, m4
; table entries 10 and 11 are the coefficient pair for the second
; ITX_MULSUB_2D pair
vpbroadcastd m10, [r4+4*10]
vpbroadcastd m11, [r4+4*11]
psubd m5, m0, m3 ; t35a
paddd m0, m3 ; t32a
psubd m3, m7, m4 ; t60a
paddd m7, m4 ; t63a
psubd m4, m1, m6 ; t34
paddd m1, m6 ; t33
psubd m6, m8, m2 ; t61
paddd m8, m2 ; t62
REPX {pmaxsd x, m14}, m5, m3, m4, m6
REPX {pminsd x, m15}, m5, m3, m4, m6
ITX_MULSUB_2D 3, 5, 2, 9, _, 13, 10, 11 ; t35, t60
ITX_MULSUB_2D 6, 4, 2, 9, _, 13, 10, 11 ; t34a, t61a
REPX {pmaxsd x, m14}, m0, m7, m1, m8
REPX {pminsd x, m15}, m0, m7, m1, m8
; advance to the next group of 12 multipliers
add r4, 4*12
; store the eight results symmetrically around r6
mova [r6-64*4], m0
mova [r6+64*3], m7
mova [r6-64*3], m1
mova [r6+64*2], m8
mova [r6-64*2], m6
mova [r6+64*1], m4
mova [r6-64*1], m3
mova [r6+64*0], m5
; advance the scratch cursor past the eight vectors just written
add r6, 64*8
ret
; idct64 part 2: reworks, in place, the vectors written by the four preceding
; .main_part1* calls.
; In:  r6 = scratch cursor as left by the last .main_part1* call,
;      m12 = multiplier, m13 = rounding addend, m14/m15 = clamp min/max.
; Clobbers r4 and r6, which converge on each other one vector per iteration.
.main_part2: ; idct64 steps 6-9
; r6 walks upwards and r4 downwards; the loop ends when they meet
lea r4, [r6+64*3]
sub r6, 64*4
vpbroadcastd m10, [pd_1567]
vpbroadcastd m11, [pd_3784]
.main_part2_loop:
mova m0, [r6-64*32] ; t32a
mova m1, [r4-64*24] ; t39a
mova m2, [r4-64*32] ; t63a
mova m3, [r6-64*24] ; t56a
mova m4, [r6-64*16] ; t40a
mova m5, [r4-64* 8] ; t47a
mova m6, [r4-64*16] ; t55a
mova m7, [r6-64* 8] ; t48a
psubd m8, m0, m1 ; t39
paddd m0, m1 ; t32
psubd m1, m2, m3 ; t56
paddd m2, m3 ; t63
psubd m3, m5, m4 ; t40
paddd m5, m4 ; t47
psubd m4, m7, m6 ; t55
paddd m7, m6 ; t48
REPX {pmaxsd x, m14}, m8, m1, m3, m4
REPX {pminsd x, m15}, m8, m1, m3, m4
ITX_MULSUB_2D 1, 8, 6, 9, _, 13, 10, 11 ; t39a, t56a
ITX_MULSUB_2D 4, 3, 6, 9, _, 13, 10, 11, 2 ; t55a, t40a
REPX {pmaxsd x, m14}, m0, m2, m5, m7
REPX {pminsd x, m15}, m0, m5, m2, m7
psubd m6, m2, m7 ; t48a
paddd m2, m7 ; t63a
psubd m7, m0, m5 ; t47a
paddd m0, m5 ; t32a
psubd m5, m8, m4 ; t55
paddd m8, m4 ; t56
psubd m4, m1, m3 ; t40
paddd m1, m3 ; t39
REPX {pmaxsd x, m14}, m6, m7, m5, m4
REPX {pminsd x, m15}, m6, m7, m5, m4
; final stage: scale by m12, then sum/difference with rounding and >> 12
REPX {pmulld x, m12}, m6, m7, m5, m4
REPX {pmaxsd x, m14}, m2, m0, m8, m1
REPX {pminsd x, m15}, m2, m0, m8, m1
; the rounding term is added to one operand of each pair only, so both the
; sum and the difference carry it exactly once
paddd m6, m13
paddd m5, m13
psubd m3, m6, m7 ; t47
paddd m6, m7 ; t48
psubd m7, m5, m4 ; t40a
paddd m5, m4 ; t55a
REPX {psrad x, 12}, m3, m6, m7, m5
; write the eight results back to the same eight slots that were loaded
mova [r4-64* 8], m2
mova [r6-64*32], m0
mova [r6-64* 8], m8
mova [r4-64*32], m1
mova [r4-64*24], m3
mova [r6-64*16], m6
mova [r6-64*24], m7
mova [r4-64*16], m5
add r6, 64
sub r4, 64
cmp r6, r4
jl .main_part2_loop
ret
; End of pass 1 for the medium-eob and .full paths (entered by jmp, not call).
; Combines the values spilled to cq (by .pass1_load_spill), the values held in
; m0-m7/m16-m23 and the values in the rsp scratch area into the 64 outputs,
; packs them to words, clears the coefficient buffer, then runs pass 2 twice.
; In: r6d = starting index for .zero_loop, m14/m15 = clamp min/max.
.idct64_end:
; IDCT64_PASS1_END cq_slot, reg, rsp_slot_lo, rsp_slot_hi, tmp0..tmp3
; Produces four outputs in m%5 (out0+n), m%6 (out31-n), m%7 (out32+n) and
; m%8 (out63-n). m11 is used both as the rounding addend and as the
; per-element shift count, so with m11 = 2 each output is (x + 2) >> 2.
%macro IDCT64_PASS1_END 8
mova m%5, [cq+%1*64] ; t0+n [idct32] + idct64 rounding
psubd m%6, m%5, m%2 ; out31-n [idct32] = t31-n [idct64]
paddd m%5, m%2 ; out0+n [idct32] = t0+n [idct64]
REPX {pmaxsd x, m14}, m%6, m%5
REPX {pminsd x, m15}, m%6, m%5
REPX {paddd x, m11}, m%6, m%5
mova m%2, [rsp+%3*64] ; t32+n [idct64]
mova m%7, [rsp+%4*64] ; t63-n [idct64]
psubd m%8, m%5, m%7 ; out63-n
paddd m%5, m%7 ; out0+n
psubd m%7, m%6, m%2 ; out32+n
paddd m%6, m%2 ; out31-n
REPX {vpsravd x, m11}, m%8, m%5, m%7, m%6
%endmacro
; IDCT64_PASS1_ENDx4 n: four IDCT64_PASS1_END invocations for index n.
; %%mX are rsp scratch slot numbers, %%rX are register numbers and %%cX are
; cq slot numbers. Packed results go to registers m(%%r1)..m(%%r4) and back
; to rsp slots %%m1..%%m4.
%macro IDCT64_PASS1_ENDx4 1
%assign %%m1 %1 ; t32+n
%assign %%m2 (7-%1) ; t39-n
%assign %%m3 (8+%1) ; t40+n
%assign %%m4 (15-%1) ; t47-n
%assign %%m5 (16+%1) ; t48+n
%assign %%m6 (23-%1) ; t55-n
%assign %%m7 (24+%1) ; t56+n
%assign %%m8 (31-%1) ; t63-n
%assign %%r1 %1 ; t16+n
%assign %%r2 (7-%1) ; t23-n
%assign %%r3 (16+%1) ; t24+n
%assign %%r4 (23-%1) ; t31-n
%assign %%c1 (%1) ; t0+n
%assign %%c2 (7-%1) ; t7-n
%assign %%c3 (15-%1) ; t8+n
%assign %%c4 (8+%1) ; t15-n
IDCT64_PASS1_END %%c1, %%r4, %%m1, %%m8, 24, 25, 26, 27 ; out0/31/32/63
IDCT64_PASS1_END %%c4, %%r1, %%m4, %%m5, 28, 29, 30, 31 ; out15/16/47/48
packssdw m %+ %%r1, m24, m29
packssdw m %+ %%r4, m28, m25
packssdw m26, m31
packssdw m30, m27
mova [rsp+%%m1*mmsize], m26
mova [rsp+%%m4*mmsize], m30
IDCT64_PASS1_END %%c2, %%r3, %%m2, %%m7, 24, 25, 26, 27 ; out7/24/39/56
IDCT64_PASS1_END %%c3, %%r2, %%m3, %%m6, 28, 29, 30, 31 ; out8/23/40/55
packssdw m %+ %%r2, m24, m29
packssdw m %+ %%r3, m28, m25
packssdw m26, m31
packssdw m30, m27
mova [rsp+%%m2*mmsize], m26
mova [rsp+%%m3*mmsize], m30
%endmacro
vpbroadcastd m11, [o(pd_2)]
; NOTE(review): r5 is switched to o_base_8bpc here, presumably because the
; 8bpc helpers called from .pass2 address their constants relative to it.
lea r5, [o_base_8bpc]
IDCT64_PASS1_ENDx4 0
IDCT64_PASS1_ENDx4 1
IDCT64_PASS1_ENDx4 2
IDCT64_PASS1_ENDx4 3
; clear the coefficient buffer, four 64-byte vectors per iteration, from
; cq+r6*8 downwards until r6d goes negative
pxor m12, m12
.zero_loop:
REPX {mova [cq+r6*8+64*x], m12}, 0, 1, 2, 3
sub r6d, 8*4
jge .zero_loop
lea r3, [strideq*3]
; keep the original dst so the second .pass2 call can start 64 bytes further
; along the row
mov r4, dstq
call .pass2
; second call: reload the packed values parked in rsp slots 0-15
mova m0, [rsp+ 0*mmsize]
mova m1, [rsp+ 1*mmsize]
mova m2, [rsp+ 2*mmsize]
mova m3, [rsp+ 3*mmsize]
mova m4, [rsp+ 4*mmsize]
mova m5, [rsp+ 5*mmsize]
mova m6, [rsp+ 6*mmsize]
mova m7, [rsp+ 7*mmsize]
mova m16, [rsp+ 8*mmsize]
mova m17, [rsp+ 9*mmsize]
mova m18, [rsp+10*mmsize]
mova m19, [rsp+11*mmsize]
mova m20, [rsp+12*mmsize]
mova m21, [rsp+13*mmsize]
mova m22, [rsp+14*mmsize]
mova m23, [rsp+15*mmsize]
lea dstq, [r4+64]
call .pass2
RET
; Pass 2 for the data currently held in m0-m7/m16-m23: transpose, run the
; 8bpc column transform helpers, then scale and write to dst.
; .write is also the tail of .pass2_fast.
.pass2:
; m12/m13 are the qword permutation index vectors that .transpose_16x32
; passes to vpermt2q/vpermi2q
psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11
psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15
call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
punpckhqdq m19, m5, m16 ; 11
punpcklqdq m5, m16 ; 10
punpckhqdq m16, m2, m1 ; 5
punpcklqdq m2, m1 ; 4
punpcklqdq m1, m15, m4 ; 2
punpckhqdq m15, m4 ; 3
punpcklqdq m4, m14, m18 ; 8
punpckhqdq m18, m14, m18 ; 9
punpckhqdq m14, m0, m20 ; 1
punpcklqdq m0, m20 ; 0
punpckhqdq m20, m6, m17 ; 13
punpcklqdq m6, m17 ; 12
punpckhqdq m17, m3, m21 ; 7
punpcklqdq m3, m21 ; 6
punpckhqdq m21, m7, m8 ; 15
punpcklqdq m7, m8 ; 14
call m(inv_txfm_add_dct_dct_32x8_8bpc).main
call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
.write:
; constants for the write helpers: m11 = 2048 per word, m12 = zero,
; m13 = pixel_10bpc_max
vpbroadcastd m11, [pw_2048]
pxor m12, m12
vpbroadcastd m13, [pixel_10bpc_max]
call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
; pmulhrsw by 2048 computes (x + 8) >> 4 per word
pmulhrsw m0, m11, m14
pmulhrsw m1, m11, m15
pmulhrsw m2, m11, m16
pmulhrsw m3, m11, m17
call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
pmulhrsw m0, m11, m18
pmulhrsw m1, m11, m19
pmulhrsw m2, m11, m20
pmulhrsw m3, m11, m21
; tail call: write_32x4 returns directly to the caller of .pass2/.pass2_fast
jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
; eob < 36 path. Only the low 32 bytes (ym loads) of coefficient vectors
; cq+64*0 .. cq+64*7 are read, and two inputs are packed into each register.
.fast: ; 8x8 packed
movshdup m7, [o(permB)]
mova ym0, [cq+64*1]
mova ym2, [cq+64*5]
mova ym3, [cq+64*3]
mova ym1, [cq+64*7]
vpermt2q m0, m7, m2 ; 1 5
vpermt2q m1, m7, m3 ; 7 3
call .main_oddhalf_packed
; park the 16 result vectors in rsp slots 0-15; .main_end reads them back
mova [rsp+ 0*mmsize], m0
mova [rsp+ 1*mmsize], m1
mova [rsp+ 2*mmsize], m2
mova [rsp+ 3*mmsize], m3
mova [rsp+ 4*mmsize], m4
mova [rsp+ 5*mmsize], m5
mova [rsp+ 6*mmsize], m6
mova [rsp+ 7*mmsize], m7
mova [rsp+ 8*mmsize], m16
mova [rsp+ 9*mmsize], m17
mova [rsp+10*mmsize], m18
mova [rsp+11*mmsize], m19
mova [rsp+12*mmsize], m20
mova [rsp+13*mmsize], m21
mova [rsp+14*mmsize], m22
mova [rsp+15*mmsize], m23
; m7 was overwritten by .main_oddhalf_packed, so the permutation is reloaded
movshdup m7, [o(permB)]
mova ym0, [cq+64*0]
mova ym4, [cq+64*4]
mova ym16, [cq+64*2]
mova ym5, [cq+64*6]
vpermt2q m16, m7, m5 ; 2 6
vpermq m0, m7, m0 ; 0 0
vpermq m4, m7, m4 ; 4 4
call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
; m0-7,9,16-22 contain un-sumsub'ed dct32 output data
; zero input coefs
pxor m12, m12
REPX {mova [cq+x*64], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
; m11 = 2 serves as both rounding addend and shift count in .main_end
vpbroadcastd m11, [o(pd_2)]
call .main_end
lea r3, [strideq*3]
; keep the original dst so the second .pass2_fast call can start 64 bytes
; further along the row
mov r4, dstq
call .pass2_fast
; second call operates on the values .main_end left in m24-m31
mova m0, m24
mova m1, m25
mova m2, m26
mova m3, m27
mova m4, m28
mova m5, m29
mova m6, m30
mova m7, m31
lea dstq, [r4+64]
; the first .pass2_fast call left r5 pointing at o_base_8bpc; restore it
; before calling again
lea r5, [o_base]
call .pass2_fast
RET
; Pass 2 for the .fast path: transpose the data in m0-m7, run the reduced
; 8bpc odd-half helper, then share the output code at .write.
; Leaves r5 pointing at o_base_8bpc.
.pass2_fast:
call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
; NOTE(review): r5 is switched to o_base_8bpc only after the 10bpc transpose
; helper has returned, presumably because that helper still needs the 10bpc
; base while the 8bpc helper below needs this one.
lea r5, [o_base_8bpc]
punpckhqdq m14, m0, m2 ; 1
punpcklqdq m0, m2 ; 0
punpcklqdq m1, m3, m4 ; 2
punpckhqdq m15, m3, m4 ; 3
punpcklqdq m2, m5, m7 ; 4
punpckhqdq m16, m5, m7 ; 5
punpcklqdq m3, m6, m8 ; 6
punpckhqdq m17, m6, m8 ; 7
call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
jmp .write
; End of pass 1 for the .fast path (reached via call). Combines the values in
; m0-m7, m9 and m16-m22 with the 16 vectors .fast parked in rsp slots 0-15,
; and packs the results to words in m0-m7 and m24-m31.
; In: m11 = 2 (rounding addend and shift count), m14/m15 = clamp min/max.
.main_end:
; IDCT64_PASS1_PACKED_END a, b, tmp0, tmp1, tmp2, rsp_slot_lo, rsp_slot_hi
; Results: m%1 = out0+n, m%2 = out31-n, m%3 = out32+n, m%4 = out63-n,
; each computed as (x + m11) >> m11.
; NOTE(review): the +gprsize in the rsp offsets presumably skips the return
; address pushed by "call .main_end"; .fast stored these slots at
; rsp+k*mmsize before making the call.
%macro IDCT64_PASS1_PACKED_END 7
psubd m%5, m%1, m%2 ; out31-n [idct32] = t31-n [idct64]
paddd m%1, m%2 ; out0+n [idct32] = t0+n [idct64]
REPX {pmaxsd x, m14}, m%5, m%1
REPX {pminsd x, m15}, m%5, m%1
REPX {paddd x, m11}, m%5, m%1
mova m%2, [rsp+%6*64+gprsize] ; t32+n [idct64]
mova m%3, [rsp+%7*64+gprsize] ; t63-n [idct64]
psubd m%4, m%1, m%3 ; out63-n
paddd m%1, m%3 ; out0+n
psubd m%3, m%5, m%2 ; out32+n
paddd m%2, m%5 ; out31-n
REPX {vpsravd x, m11}, m%4, m%1, m%3, m%2
%endmacro
; each pair of invocations is followed by four packs that narrow the eight
; dword results to words
IDCT64_PASS1_PACKED_END 0, 22, 24, 10, 12, 0, 15 ; out0/1,31/30,32/33,63/62
IDCT64_PASS1_PACKED_END 7, 9, 31, 13, 12, 7, 8 ; out15/14,16/17,47/46,48/49
packssdw m0, m9
packssdw m7, m22
packssdw m24, m13
packssdw m31, m10
IDCT64_PASS1_PACKED_END 1, 21, 25, 10, 12, 1, 14 ; out3/2,28/29,35/34,60/61
IDCT64_PASS1_PACKED_END 6, 16, 30, 13, 12, 6, 9 ; out12/13,19/18,44/45,51/50
packssdw m1, m16
packssdw m6, m21
packssdw m25, m13
packssdw m30, m10
IDCT64_PASS1_PACKED_END 2, 20, 26, 10, 12, 2, 13 ; out4/5,27/26,36/37,59/58
IDCT64_PASS1_PACKED_END 5, 17, 29, 13, 12, 5, 10 ; out11/10,20/21,43/42,52/53
packssdw m2, m17
packssdw m5, m20
packssdw m26, m13
packssdw m29, m10
IDCT64_PASS1_PACKED_END 3, 19, 27, 10, 12, 3, 12 ; out7/6,24/25,39/38,56/57
IDCT64_PASS1_PACKED_END 4, 18, 28, 13, 12, 4, 11 ; out8/9,23/22,40/41,55/54
packssdw m3, m18
packssdw m4, m19
packssdw m27, m13
packssdw m28, m10
ret
; Packed idct64 odd half for the .fast path: two inputs per register (see the
; layout comment below), all steps done in registers.
; In:  m0, m1 = packed inputs, m12 = multiplier, m13 = rounding addend,
;      m14/m15 = clamp min/max.
; Out: m0-m7 and m16-m23 (16 vectors); .fast stores these to rsp slots 0-15
;      immediately after the call. m8-m11 are used as temporaries.
.main_oddhalf_packed:
; m0=in1 in5, m1=in7 in3
vbroadcasti32x4 m2, [o(pd_101_501)]
vbroadcasti32x4 m3, [o(pd_m700_m301)]
vbroadcasti32x4 m4, [o(pd_4095_4065)]
vbroadcasti32x4 m5, [o(pd_4036_4085)]
pmulld m2, m0
pmulld m3, m1
pmulld m0, m4
pmulld m1, m5
REPX {paddd x, m13}, m2, m3, m0, m1
REPX {psrad x, 12 }, m2, m3, m0, m1
; m2=t32a t40a -> t32/33 t40/41, m3=t39a t47a -> t38/39 t46/47
; m0=t63a t55a -> t62/63 t54/55, m1=t56a t48a -> t56/57 t48/49
; end of step 1-2
vbroadcasti32x4 m10, [o(pd_401_1931)]
vbroadcasti32x4 m11, [o(pd_4076_3612)]
; work on copies so m0-m3 keep the step 1-2 values for the unpacks below
mova m4, m0
mova m5, m2
ITX_MULSUB_2D 4, 5, 8, 9, _, 13, 10, 11
vbroadcasti32x4 m10, [o(pd_3166_3920)]
vbroadcasti32x4 m11, [o(pd_2598_1189)]
mova m6, m3
mova m7, m1
ITX_MULSUB_2D 7, 6, 8, 9, _, 13, 10, 11, 2
; m4=t33a t41a -> t41/42 t33/34, m5=t63a t54a -> t61/62 t53/54
; m6=t38a t46a -> t37/38 t45/46, m7=t57a t49a -> t57/58 t49/50
; and from earlier:
; m0=t63 t55 -> t60/63a t52/55a, m1=t56 t48 -> t56/59a t48/51a
; m2=t32 t40 -> t32/35a t40/43a, m3=t39 t47 -> t36/39a t44/47a
; end of step 3-4
punpcklqdq m22, m2, m4 ; t32a/33 or t35a/34
punpcklqdq m21, m3, m6 ; t36a/37 or t39a/38
punpckhqdq m18, m2, m4 ; t40a/41 or t43a/42
punpckhqdq m17, m3, m6 ; t44a/45 or t47a/46
punpckhqdq m6, m1, m7 ; t48a/49 or t51a/50
punpckhqdq m19, m0, m5 ; t52a/53 or t55a/54
punpcklqdq m8, m1, m7 ; t56a/57 or t59a/58
punpcklqdq m23, m0, m5 ; t60a/61 or t63a/62
; duplicate each repacked vector: one copy for each of the two
; interpretations listed in the "or" comments above
mova m0, m22
mova m7, m21
mova m3, m18
mova m16, m17
mova m5, m6
mova m4, m19
mova m2, m8
mova m1, m23
; m0/22/7/21,18/3/17/16,6/5/19/4,2/8/1/23: t32-63[a]
; step5
vpbroadcastd m10, [o(pd_799)]
vpbroadcastd m11, [o(pd_4017)]
ITX_MULSUB_2D 1, 22, 20, 9, _, 13, 10, 11 ; t35/34a, t60/61a
ITX_MULSUB_2D 8, 7, 20, 9, _, 13, 10, 11, 2 ; t59/58a, t36/37a
vpbroadcastd m10, [o(pd_3406)]
vpbroadcastd m11, [o(pd_2276)]
ITX_MULSUB_2D 19, 3, 20, 9, _, 13, 10, 11 ; t43/42a, t52/53a
ITX_MULSUB_2D 5, 17, 20, 9, _, 13, 10, 11, 2 ; t51/50a, t44/45a
; m0-1/7/21: t32-39[a], m18-19/17-16: t40-47[a]
; m6-5/3-4: t48-55[a], m2/8/22-23: t56-63[a]
; step6
; the clamps for each group of four are interleaved with the next group's
; sum/difference instructions
psubd m20, m0, m21 ; t39/38a
paddd m0, m21 ; t32/33a
psubd m21, m1, m7 ; t36a/37
paddd m1, m7 ; t35a/34
REPX {pmaxsd x, m14}, m20, m0, m21, m1
psubd m7, m16, m18 ; t40/41a
paddd m16, m18 ; t47/46a
REPX {pminsd x, m15}, m20, m0, m21, m1
psubd m18, m17, m19 ; t43a/42
paddd m17, m19 ; t44a/45
REPX {pmaxsd x, m14}, m7, m16, m18, m17
psubd m19, m6, m4 ; t55/54a
paddd m6, m4 ; t48/49a
REPX {pminsd x, m15}, m7, m16, m18, m17
psubd m4, m5, m3 ; t52a/53
paddd m5, m3 ; t51a/50
REPX {pmaxsd x, m14}, m19, m6, m4, m5
psubd m3, m23, m2 ; t56/57a
paddd m23, m2 ; t63/62a
REPX {pminsd x, m15}, m19, m6, m4, m5
psubd m2, m22, m8 ; t59a/58
paddd m22, m8 ; t60a/61
REPX {pmaxsd x, m14}, m3, m23, m2, m22
REPX {pminsd x, m15}, m3, m23, m2, m22
; m0-1: t32-35[a], m17-16: t44-47[a], m6-5: t48-51[a], m22-23: t60-63[a]
; m21-20: t36-39[a], m7/18: t40-43[a], m4/19: t52-55[a], m3-2: t56-59[a]
; step7
vpbroadcastd m10, [o(pd_1567)]
vpbroadcastd m11, [o(pd_3784)]
ITX_MULSUB_2D 2, 21, 8, 9, _, 13, 10, 11 ; t36/37a, t59/58a
ITX_MULSUB_2D 3, 20, 8, 9, _, 13, 10, 11 ; t39a/38, t56a/57
ITX_MULSUB_2D 19, 7, 8, 9, _, 13, 10, 11, 2 ; t55a/54, t40a/41
ITX_MULSUB_2D 4, 18, 8, 9, _, 13, 10, 11, 2 ; t52/53a, t43/42a
; m0-3: t32-39[a], m7,18-16: t40-47[a], m6-4,19: t48-55[a], m20-23: t56-63[a]
; step8
psubd m8, m0, m16 ; t47a/46
paddd m0, m16 ; t32a/33
psubd m16, m1, m17 ; t44/45a
paddd m1, m17 ; t35/34a
REPX {pmaxsd x, m14}, m8, m0, m16, m1
psubd m17, m2, m18 ; t43a/42
paddd m2, m18 ; t36a/37
REPX {pminsd x, m15}, m8, m0, m16, m1
psubd m18, m3, m7 ; t40/41a
paddd m3, m7 ; t39/38a
REPX {pmaxsd x, m14}, m17, m2, m18, m3
psubd m7, m23, m6 ; t48a/49
paddd m23, m6 ; t63a/62
REPX {pminsd x, m15}, m17, m2, m18, m3
psubd m6, m22, m5 ; t51/50a
paddd m22, m5 ; t60/61a
REPX {pmaxsd x, m14}, m7, m23, m6, m22
psubd m5, m21, m4 ; t52a/53
paddd m21, m4 ; t59a/58
REPX {pminsd x, m15}, m7, m23, m6, m22
psubd m4, m20, m19 ; t55/54a
paddd m20, m19 ; t56/57a
REPX {pmaxsd x, m14}, m5, m21, m4, m20
REPX {pminsd x, m15}, m5, m21, m4, m20
; m0-3=t32-39[a], m18-16,8: t40-47[a], m7-4=t48-55[a], m20-23=t56-63[a]
; step9
; scale by m12; the rounding term is added to one operand of each pair only,
; so both the sum and the difference carry it exactly once
REPX {pmulld x, m12}, m4, m18, m5, m17, m6, m16, m7, m8
REPX {paddd x, m13}, m4, m5, m6, m7
paddd m19, m4, m18 ; t55a/54
psubd m4, m18 ; t40a/41
paddd m18, m5, m17 ; t52/53a
psubd m5, m17 ; t43/42a
paddd m17, m6, m16 ; t51a/50
psubd m6, m16 ; t44a/45
paddd m16, m7, m8 ; t48/49a
psubd m7, m8 ; t47/46a
REPX {psrad x, 12 }, m19, m4, m18, m5, m17, m6, m16, m7
; m4-7=t40-47[a], m16-19=t48-55[a]
ret
%endif ; ARCH_X86_64