mirror of
https://code.videolan.org/videolan/dav1d
synced 2026-06-11 04:03:05 +00:00
x86: add AVX512-IceLake implementation of HBD 64x16 DCT^2
inv_txfm_add_64x16_dct_dct_0_10bpc_c: 892.0 ( 1.00x)
inv_txfm_add_64x16_dct_dct_0_10bpc_sse4: 131.5 ( 6.78x)
inv_txfm_add_64x16_dct_dct_0_10bpc_avx2: 63.4 (14.07x)
inv_txfm_add_64x16_dct_dct_0_10bpc_avx512icl: 56.8 (15.71x)
inv_txfm_add_64x16_dct_dct_1_10bpc_c: 29253.7 ( 1.00x)
inv_txfm_add_64x16_dct_dct_1_10bpc_sse4: 1639.7 (17.84x)
inv_txfm_add_64x16_dct_dct_1_10bpc_avx2: 1106.8 (26.43x)
inv_txfm_add_64x16_dct_dct_1_10bpc_avx512icl: 532.9 (54.89x)
inv_txfm_add_64x16_dct_dct_2_10bpc_c: 29249.8 ( 1.00x)
inv_txfm_add_64x16_dct_dct_2_10bpc_sse4: 3065.6 ( 9.54x)
inv_txfm_add_64x16_dct_dct_2_10bpc_avx2: 1791.0 (16.33x)
inv_txfm_add_64x16_dct_dct_2_10bpc_avx512icl: 1108.0 (26.40x)
inv_txfm_add_64x16_dct_dct_3_10bpc_c: 29269.1 ( 1.00x)
inv_txfm_add_64x16_dct_dct_3_10bpc_sse4: 3738.2 ( 7.83x)
inv_txfm_add_64x16_dct_dct_3_10bpc_avx2: 1790.9 (16.34x)
inv_txfm_add_64x16_dct_dct_3_10bpc_avx512icl: 1203.8 (24.31x)
inv_txfm_add_64x16_dct_dct_4_10bpc_c: 29337.7 ( 1.00x)
inv_txfm_add_64x16_dct_dct_4_10bpc_sse4: 3749.7 ( 7.82x)
inv_txfm_add_64x16_dct_dct_4_10bpc_avx2: 1791.0 (16.38x)
inv_txfm_add_64x16_dct_dct_4_10bpc_avx512icl: 1203.8 (24.37x)
This commit is contained in:
@@ -358,6 +358,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
|
||||
assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl);
|
||||
assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl);
|
||||
assign_itx1_bpc_fn (R, 32, 64, 10, avx512icl);
|
||||
assign_itx1_bpc_fn (R, 64, 16, 10, avx512icl);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -98,7 +98,7 @@ clip_18b_max: dd 0x1ffff
|
||||
clip_20b_min: dd -0x80000
|
||||
clip_20b_max: dd 0x7ffff
|
||||
|
||||
idct64_mul_16bpc:
|
||||
const idct64_mul_16bpc
|
||||
dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
|
||||
dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
|
||||
dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
|
||||
|
||||
+810
-16
@@ -75,7 +75,7 @@ pw_2048_m2048: times 16 dw 2048
|
||||
pw_m2048_2048: times 16 dw -2048
|
||||
pw_2048: times 16 dw 2048
|
||||
|
||||
; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-
|
||||
; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-, 4 = --
|
||||
%macro COEF_PAIR 2-3 0 ; a, b, flags
|
||||
%if %3 == 1
|
||||
pd_%1_m%2: dd %1, %1, -%2, -%2
|
||||
@@ -85,6 +85,10 @@ pd_%1_m%2: dd %1, %1, -%2, -%2
|
||||
pd_m%1_%2: dd -%1, -%1, %2, %2
|
||||
%define pd_m%1 (pd_m%1_%2 + 4*0)
|
||||
%define pd_%2 (pd_m%1_%2 + 4*2)
|
||||
%elif %3 == 4
|
||||
pd_m%1_m%2: dd -%1, -%1, -%2, -%2
|
||||
%define pd_m%1 (pd_m%1_m%2 + 4*0)
|
||||
%define pd_m%2 (pd_m%1_m%2 + 4*2)
|
||||
%else
|
||||
pd_%1_%2: dd %1, %1, %2, %2
|
||||
%define pd_%1 (pd_%1_%2 + 4*0)
|
||||
@@ -96,10 +100,14 @@ dd -%2, -%2
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
COEF_PAIR 101, 501
|
||||
COEF_PAIR 201, 601, 1
|
||||
COEF_PAIR 201, 995
|
||||
COEF_PAIR 401, 1189, 1
|
||||
COEF_PAIR 401, 1931
|
||||
COEF_PAIR 401, 3920
|
||||
COEF_PAIR 401, 4076
|
||||
COEF_PAIR 700, 301, 4
|
||||
COEF_PAIR 799, 2276, 1
|
||||
COEF_PAIR 799, 3406
|
||||
COEF_PAIR 799, 4017
|
||||
@@ -119,10 +127,13 @@ COEF_PAIR 3703, 3290
|
||||
COEF_PAIR 3857, 4052
|
||||
COEF_PAIR 4017, 2276
|
||||
COEF_PAIR 4017, 3406
|
||||
COEF_PAIR 4036, 4085
|
||||
COEF_PAIR 4076, 1189
|
||||
COEF_PAIR 4076, 3612
|
||||
COEF_PAIR 4076, 3920
|
||||
COEF_PAIR 4091, 3973
|
||||
COEF_PAIR 4091, 4052
|
||||
COEF_PAIR 4095, 4065
|
||||
|
||||
pb_32: times 4 db 32
|
||||
pw_5: times 2 dw 5
|
||||
@@ -146,6 +157,7 @@ pd_5793: dd 5793
|
||||
|
||||
cextern dup16_perm
|
||||
cextern int8_permA
|
||||
cextern idct64_mul_16bpc
|
||||
cextern idct_8x8_internal_8bpc_avx512icl.main
|
||||
cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2
|
||||
cextern idct_8x16_internal_8bpc_avx512icl.main
|
||||
@@ -679,6 +691,20 @@ ALIGN function_align
|
||||
REPX {psrad x, 12 }, m4, m5, m6, m7
|
||||
ret
|
||||
ALIGN function_align
|
||||
; Reduced-input entry point: only m0 and m1 are read as inputs.
; m12 is used as the multiplier and m13 as the rounding term added before
; each arithmetic shift right by 12 (the 64x16 caller shown in this file
; loads them from pd_2896 and pd_2048).
; On return m0-m3 all hold the same rounded product derived from the m0
; input, m4/m5 hold the rounded difference/sum of the m12-scaled t7a and t4a,
; and m6/m8 still hold t7a/t4a.
.main_fast2:
|
||||
pmulld m0, m12
|
||||
pmulld m6, m1, [o(pd_4017)] {1to16} ; t7a
|
||||
pmulld m8, m1, [o(pd_799)] {1to16} ; t4a
|
||||
REPX {paddd x, m13}, m0, m6, m8
|
||||
REPX {psrad x, 12 }, m0, m6, m8
|
||||
; scale t7a and t4a by m12; rounding is added once (to m5) and is therefore
; carried into both the difference (m4) and the sum (m5) below
pmulld m5, m6, m12
|
||||
pmulld m1, m8, m12
|
||||
paddd m5, m13
|
||||
psubd m4, m5, m1
|
||||
paddd m5, m1
|
||||
REPX {psrad x, 12 }, m4, m5
|
||||
; replicate the m0 result into m1-m3
REPX {mova x, m0 }, m1, m2, m3
|
||||
ret
|
||||
.main_fast_rect2:
|
||||
REPX {paddd x, m13}, m0, m1, m2, m3
|
||||
REPX {psrad x, 12 }, m0, m1, m2, m3
|
||||
@@ -1557,6 +1583,20 @@ cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
|
||||
psrlq m9, m8, 8
|
||||
jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2
|
||||
ALIGN function_align
|
||||
; Reduced-input entry point: only m16 and m17 are read as inputs.
; Each input is multiplied by two broadcast constants to form
; t8a/t15a (from m16) and t11a/t12a (from m17), rounded with m13 and
; shifted right by 12. The four results are then duplicated into
; m20/m16/m23/m19 before joining the shared code at .main3.
.main_fast2:
|
||||
pmulld m22, m16, [o(pd_4076)] {1to16} ; t15a
|
||||
pmulld m9, m16, [o(pd_401)] {1to16} ; t8a
|
||||
pmulld m18, m17, [o(pd_1189)] {1to16} ; t11a
|
||||
pmulld m17, [o(pd_3920)] {1to16} ; t12a
|
||||
; m18 = m13 - m18: negates the pd_1189 product and adds the rounding term
; in a single instruction
psubd m18, m13, m18
|
||||
REPX {paddd x, m13}, m22, m9, m17
|
||||
REPX {psrad x, 12 }, m18, m22, m9, m17
|
||||
|
||||
; duplicate the four values into the registers that .main3 consumes
mova m20, m9
|
||||
mova m16, m18
|
||||
mova m23, m22
|
||||
mova m19, m17
|
||||
jmp .main3
|
||||
.main_fast_rect2:
|
||||
REPX {paddd x, m13}, m16, m17, m18, m19
|
||||
REPX {psrad x, 12 }, m16, m17, m18, m19
|
||||
@@ -1590,14 +1630,15 @@ ALIGN function_align
|
||||
psubd m23, m19 ; t14
|
||||
psubd m19, m17, m21 ; t13
|
||||
paddd m17, m21 ; t12
|
||||
vpbroadcastd m11, [o(pd_3784)]
|
||||
REPX {pmaxsd x, m14}, m20, m23, m16, m19
|
||||
vpbroadcastd m10, [o(pd_1567)]
|
||||
REPX {pminsd x, m15}, m20, m23, m16, m19
|
||||
ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11
|
||||
ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2
|
||||
REPX {pmaxsd x, m14}, m9, m18, m22, m17
|
||||
REPX {pminsd x, m15}, m9, m18, m22, m17
|
||||
.main3:
|
||||
vpbroadcastd m11, [o(pd_3784)]
|
||||
vpbroadcastd m10, [o(pd_1567)]
|
||||
ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11
|
||||
ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2
|
||||
paddd m21, m20, m19 ; t14
|
||||
psubd m20, m19 ; t13
|
||||
psubd m19, m9, m18 ; t11a
|
||||
@@ -2441,6 +2482,80 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
|
||||
sar r6d, 10
|
||||
jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
|
||||
ALIGN function_align
|
||||
; Packed reduced-input entry point. Reads m0, m4 and m16 (layout described
; in the comment below), uses m12 as multiplier, m13 as rounding term and
; m14/m15 as clamp bounds, and finishes by jumping to .main4.
; Before the jump, m0-m7 hold the clamped dct16 sums/differences and
; m16-m23 hold the t16-t31 terms (each value duplicated into two registers
; by the mova group at the end).
.main_fast3:
|
||||
; assuming m0=in0 in0, m4=in2 in2, and m16=in1 in3
|
||||
vbroadcasti32x4 m5, [o(pd_401_4076)]
|
||||
pmulld m3, m0, m12
|
||||
pmulld m4, m5
|
||||
REPX {paddd x, m13}, m3, m4
|
||||
REPX {psrad x, 12 }, m3, m4 ; m3=idct8:t0-7, m4=t8a t15a
|
||||
|
||||
; t8a t15a -> t8/9 t14/15
|
||||
|
||||
vbroadcasti32x4 m5, [o(pd_3784_m3784)]
|
||||
; m7 = m4 with its 64-bit halves swapped inside every 128-bit lane
pshufd m7, m4, q1032
|
||||
pmulld m6, m4, [o(pd_1567)]{bcstd}
|
||||
pmulld m5, m7
|
||||
paddd m6, m13
|
||||
paddd m5, m6
|
||||
psrad m5, 12 ; m5=t9a t14a
|
||||
|
||||
; t14a t9a -> t13/14 t9/10 [m5] & t8 15 -> t8/11a t12/15a [m4]
|
||||
|
||||
shufps m6, m4, m5, q1032 ; t12 t13
|
||||
shufps m8, m4, m5, q3210 ; t11a t10
|
||||
pmulld m9, m6, m12
|
||||
pmulld m7, m8, m12
|
||||
; rounding is added once to m9 and carried into both the sum and difference
paddd m9, m13
|
||||
paddd m5, m9, m7 ; t12 t13a
|
||||
psubd m4, m9, m7 ; t11 t10a
|
||||
REPX {psrad x, 12 }, m5, m4
|
||||
|
||||
psubd m7, m3, m6 ; dct16 out15 out14
|
||||
paddd m0, m3, m6 ; dct16 out0 out1
|
||||
psubd m6, m3, m5 ; dct16 out12 out13
|
||||
paddd m1, m3, m5 ; dct16 out3 out2
|
||||
psubd m5, m3, m4 ; dct16 out11 out10
|
||||
paddd m2, m3, m4 ; dct16 out4 out5
|
||||
psubd m4, m3, m8 ; dct16 out8 out9
|
||||
paddd m3, m8 ; dct16 out7 out6
|
||||
; clamp all eight dct16 registers to [m14, m15]
REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
|
||||
REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
|
||||
; idct32_bottomhalf
|
||||
vbroadcasti32x4 m18, [o(pd_201_m601)]
|
||||
vbroadcasti32x4 m19, [o(pd_4091_4052)]
|
||||
pmulld m17, m16, m19
|
||||
pmulld m16, m18
|
||||
REPX {paddd x, m13}, m17, m16
|
||||
REPX {psrad x, 12 }, m17, m16
|
||||
|
||||
; m17: t31a t24a -> t30/31 t24/25, m16: t16a t23a -> t16/17 t22/23 [step2]
|
||||
|
||||
vbroadcasti32x4 m10, [o(pd_799_m2276)]
|
||||
vbroadcasti32x4 m11, [o(pd_4017_3406)]
|
||||
pmulld m18, m17, m10
|
||||
pmulld m19, m17, m11
|
||||
pmulld m8, m16, m11
|
||||
pmulld m9, m16, m10
|
||||
REPX {paddd x, m13}, m18, m19
|
||||
psubd m18, m8
|
||||
paddd m19, m9
|
||||
REPX {psrad x, 12 }, m18, m19
|
||||
|
||||
; m17=t31 t24 -> t28/31a t24/27a, m16=t16 t23 -> t16/19a t20/23a
|
||||
; m18=t17a t22a -> t17/18 t21/22, m19=t30a t25a -> t29/30 t25/26
|
||||
|
||||
punpckhqdq m23, m17, m19 ; t24a t25 [or t27a t26]
|
||||
punpcklqdq m20, m16, m18 ; t16a t17 [or t19a t18]
|
||||
punpckhqdq m22, m16, m18 ; t23a t22 [or t20a t21]
|
||||
punpcklqdq m16, m17, m19 ; t28a t29 [or t31a t30]
|
||||
; duplicate each combined register so both roles named in the bracketed
; comments above are available to .main4
mova m21, m23
|
||||
mova m18, m20
|
||||
mova m17, m22
|
||||
mova m19, m16
|
||||
|
||||
jmp .main4
|
||||
.main_fast2: ; bottom three-quarters are zero
|
||||
vbroadcasti32x4 m8, [o(pd_799_4017)]
|
||||
pmulld m8, m1 ; t4 t7
|
||||
@@ -2541,8 +2656,6 @@ ALIGN function_align
|
||||
punpckhqdq m23, m9 ; t27 t26a
|
||||
punpckhqdq m9, m17, m18 ; t24 t25a
|
||||
punpcklqdq m17, m18 ; t28 t29a
|
||||
vpbroadcastd m11, [o(pd_3784)]
|
||||
vpbroadcastd m10, [o(pd_1567)]
|
||||
psubd m18, m16, m20 ; t19a t18
|
||||
paddd m20, m16 ; t16a t17
|
||||
psubd m16, m19, m17 ; t28a t29
|
||||
@@ -2553,10 +2666,13 @@ ALIGN function_align
|
||||
paddd m23, m9 ; t24a t25
|
||||
REPX {pmaxsd x, m14}, m18, m16, m17, m21
|
||||
REPX {pminsd x, m15}, m16, m18, m21, m17
|
||||
ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11
|
||||
ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2
|
||||
REPX {pmaxsd x, m14}, m20, m22, m19, m23
|
||||
REPX {pminsd x, m15}, m20, m22, m19, m23
|
||||
.main4:
|
||||
vpbroadcastd m11, [o(pd_3784)]
|
||||
vpbroadcastd m10, [o(pd_1567)]
|
||||
ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11
|
||||
ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2
|
||||
paddd m9, m20, m22 ; t16 t17a
|
||||
psubd m20, m22 ; t23 t22a
|
||||
paddd m22, m19, m23 ; t31 t30a
|
||||
@@ -3174,21 +3290,22 @@ ALIGN function_align
|
||||
%endmacro
|
||||
IDCT32_PASS1_END 0, 23 ; 0 16, 15 31
|
||||
IDCT32_PASS1_END 7, 16 ; 7 23, 8 24
|
||||
IDCT32_PASS1_END 1, 22 ; 1 17, 14 30
|
||||
IDCT32_PASS1_END 6, 17 ; 6 22, 9 25
|
||||
IDCT32_PASS1_END 2, 21 ; 2 18, 13 29
|
||||
IDCT32_PASS1_END 5, 18 ; 5 21, 10 26
|
||||
IDCT32_PASS1_END 3, 20 ; 3 19, 12 28
|
||||
IDCT32_PASS1_END 4, 19 ; 4 20, 11 27
|
||||
.transpose_16x32:
|
||||
mova m14, m13
|
||||
vpermi2q m14, m0, m16
|
||||
vpermt2q m0, m12, m16
|
||||
IDCT32_PASS1_END 1, 22 ; 1 17, 14 30
|
||||
IDCT32_PASS1_END 6, 17 ; 6 22, 9 25
|
||||
mova m15, m13
|
||||
vpermi2q m15, m1, m17
|
||||
vpermt2q m1, m12, m17
|
||||
IDCT32_PASS1_END 2, 21 ; 2 18, 13 29
|
||||
IDCT32_PASS1_END 5, 18 ; 5 21, 10 26
|
||||
mova m16, m13
|
||||
vpermi2q m16, m2, m18
|
||||
vpermt2q m2, m12, m18
|
||||
IDCT32_PASS1_END 3, 20 ; 3 19, 12 28
|
||||
IDCT32_PASS1_END 4, 19 ; 4 20, 11 27
|
||||
mova m17, m13
|
||||
vpermi2q m17, m3, m19
|
||||
vpermt2q m3, m12, m19
|
||||
@@ -3263,6 +3380,27 @@ ALIGN function_align
|
||||
mova [cq+64*13], m17
|
||||
mova [cq+64*15], m16
|
||||
ret
|
||||
; Reduced-input entry point: only m0-m3 are read as inputs.
; Each input is multiplied by two broadcast constants, giving the eight
; terms named in the trailing comments; the pd_1380 and pd_601 products are
; negated while rounding. Every result is then copied into a second
; register before joining the shared code at .main3.
.main_fast2: ; bottom half is zero
|
||||
pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a
|
||||
pmulld m0, [o(pd_201)] {1to16} ; t16a
|
||||
pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a
|
||||
pmulld m3, [o(pd_3857)] {1to16} ; t28a
|
||||
pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a
|
||||
pmulld m2, [o(pd_995)] {1to16} ; t20a
|
||||
pmulld m6, m1, [o(pd_601)] {1to16} ; t23a
|
||||
pmulld m17, m1, [o(pd_4052)] {1to16} ; t24a
|
||||
; x = m13 - x: negate the product and add the rounding term in one step
REPX {psubd x, m13, x}, m20, m6
|
||||
REPX {paddd x, m13}, m23, m0, m3, m21, m2, m17
|
||||
REPX {psrad x, 12 }, m20, m6, m23, m0, m3, m21, m2, m17
|
||||
; duplicate the eight values into the partner registers read at .main3
mova m8, m0
|
||||
mova m16, m23
|
||||
mova m7, m20
|
||||
mova m4, m3
|
||||
mova m19, m2
|
||||
mova m18, m21
|
||||
mova m5, m6
|
||||
mova m22, m17
|
||||
jmp .main3
|
||||
.main_fast_rect2:
|
||||
call m(idct_8x16_internal_10bpc).round
|
||||
.main_fast: ; bottom half is zero
|
||||
@@ -3323,9 +3461,10 @@ ALIGN function_align
|
||||
psubd m22, m1, m17 ; t25
|
||||
paddd m17, m1 ; t24
|
||||
REPX {pmaxsd x, m14}, m5, m6, m22, m17
|
||||
REPX {pminsd x, m15}, m5, m6, m22, m17
|
||||
.main3:
|
||||
vpbroadcastd m11, [o(pd_4017)]
|
||||
vpbroadcastd m10, [o(pd_799)]
|
||||
REPX {pminsd x, m15}, m5, m6, m22, m17
|
||||
ITX_MULSUB_2D 16, 8, 9, 1, _, 13, 10, 11 ; t17a, t30a
|
||||
ITX_MULSUB_2D 3, 20, 9, 1, _, 13, 10, 11, 2 ; t29a, t18a
|
||||
vpbroadcastd m11, [o(pd_2276)]
|
||||
@@ -4496,4 +4635,659 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
|
||||
mova [cq+128*15], m16
|
||||
ret
|
||||
|
||||
; inv_txfm_add_dct_dct_64x16_10bpc(dst, stride, c, eob)
; Entry point and eob dispatch:
;   eob == 0          -> .dconly (taken before the stack frame is set up)
;   eob <  36         -> .fast
;   eob >= 151        -> .full
;   otherwise         -> falls through to the path below, which reads only
;                        coefficient rows 0-15 of cq
; Register roles established here and relied on by the helpers:
;   m12 = pd_2896, m13 = pd_2048 (rounding term), m14/m15 = clip_18b_min/max
cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
|
||||
%undef cmp
|
||||
lea r5, [o_base]
|
||||
test eobd, eobd
|
||||
jz .dconly
|
||||
|
||||
; second prologue: requests 32 vector registers and a 64*32-byte stack area
; (negative-size form; see x86inc.asm for its exact semantics)
PROLOGUE 4, 7, 32, -64*32, dst, stride, c, eob
|
||||
%undef cmp
|
||||
vpbroadcastd m12, [o(pd_2896)]
|
||||
vpbroadcastd m13, [o(pd_2048)]
|
||||
vpbroadcastd m14, [o(clip_18b_min)]
|
||||
vpbroadcastd m15, [o(clip_18b_max)]
|
||||
cmp eobd, 36
|
||||
jl .fast ; 8x8
|
||||
cmp eobd, 151
|
||||
jge .full ; 16x16
|
||||
; r4 walks the idct64 multiplier table (advanced by .main_part1b),
; r6 points into the stack scratch area (advanced by 64*8 per call)
lea r4, [idct64_mul_16bpc]
|
||||
lea r6, [rsp+4*64]
|
||||
; four .main_part1_fast calls, each fed two odd coefficient rows
mova m0, [cq+64* 1]
|
||||
mova m3, [cq+64*15]
|
||||
call .main_part1_fast
|
||||
mova m0, [cq+64* 7]
|
||||
mova m3, [cq+64* 9]
|
||||
call .main_part1_fast
|
||||
mova m0, [cq+64* 5]
|
||||
mova m3, [cq+64*11]
|
||||
call .main_part1_fast
|
||||
mova m0, [cq+64* 3]
|
||||
mova m3, [cq+64*13]
|
||||
call .main_part1_fast
|
||||
call .main_part2
|
||||
; rows 0, 8, 4, 12 feed the reduced 8x16/16x16 helpers
mova m0, [cq+64* 0]
|
||||
mova m1, [cq+64* 8]
|
||||
mova m16, [cq+64* 4]
|
||||
mova m17, [cq+64*12]
|
||||
call m(idct_8x16_internal_10bpc).main_fast2
|
||||
call m(idct_16x16_internal_10bpc).main_fast2
|
||||
call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
|
||||
; spill the results to cq and pick up rows 2, 6, 10, 14 in m0-m3
call .pass1_load_spill
|
||||
call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2
|
||||
; r6d is the start index for .zero_loop in .idct64_end; 12*8 makes it
; clear rows 15 down to 0 of cq
mov r6d, 12*8
|
||||
jmp .idct64_end
|
||||
; .full: path taken when eob >= 151. Same structure as the fall-through
; path in the entry block, but reads all 32 coefficient rows of cq: the odd
; rows go through four .main_part1 calls (four rows each), and the even rows
; through the non-reduced .main_fast helpers.
.full:
|
||||
; r4 = idct64 multiplier table cursor, r6 = stack scratch cursor
lea r4, [idct64_mul_16bpc]
|
||||
lea r6, [rsp+4*64]
|
||||
mova m0, [cq+64* 1]
|
||||
mova m1, [cq+64*31]
|
||||
mova m2, [cq+64*17]
|
||||
mova m3, [cq+64*15]
|
||||
call .main_part1
|
||||
mova m0, [cq+64* 7]
|
||||
mova m1, [cq+64*25]
|
||||
mova m2, [cq+64*23]
|
||||
mova m3, [cq+64* 9]
|
||||
call .main_part1
|
||||
mova m0, [cq+64* 5]
|
||||
mova m1, [cq+64*27]
|
||||
mova m2, [cq+64*21]
|
||||
mova m3, [cq+64*11]
|
||||
call .main_part1
|
||||
mova m0, [cq+64* 3]
|
||||
mova m1, [cq+64*29]
|
||||
mova m2, [cq+64*19]
|
||||
mova m3, [cq+64*13]
|
||||
call .main_part1
|
||||
call .main_part2
|
||||
; rows 0/8/16/24 -> m0-m3, rows 4/12/20/28 -> m16-m19
mova m0, [cq+64* 0]
|
||||
mova m1, [cq+64* 8]
|
||||
mova m2, [cq+64*16]
|
||||
mova m3, [cq+64*24]
|
||||
mova m16, [cq+64* 4]
|
||||
mova m17, [cq+64*12]
|
||||
mova m18, [cq+64*20]
|
||||
mova m19, [cq+64*28]
|
||||
call m(idct_8x16_internal_10bpc).main_fast
|
||||
call m(idct_16x16_internal_10bpc).main_fast
|
||||
call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
|
||||
; spill to cq rows 0-15 and reload rows 2, 6, 10, 14 into m0-m3
call .pass1_load_spill
|
||||
; rows 18/22/26/30 are untouched by the spill and are read directly
mova m4, [cq+64*18]
|
||||
mova m5, [cq+64*22]
|
||||
mova m6, [cq+64*26]
|
||||
mova m7, [cq+64*30]
|
||||
call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
|
||||
; start index for .zero_loop: 28*8 makes it clear rows 31 down to 0 of cq
mov r6d, 28*8
|
||||
jmp .idct64_end
|
||||
; .dconly: DC-only path, reached from the entry block when eob == 0.
; Scales the single coefficient at [cq] with two (x*181 + bias) >> shift
; steps, then adds the resulting value to 64 pixels (two 64-byte vectors of
; 16-bit samples) on each of 16 rows of dst.
; .dconly2 expects r6d = intermediate DC value and r3d = row count.
.dconly:
|
||||
imul r6d, [cq], 181
|
||||
; eob is zero on this path, so this stores 0 and clears the DC coefficient
mov [cq], eobd
|
||||
; r3d holds eob (the 4th argument), which is zero here, so the OR sets the
; row counter to 16
or r3d, 16
|
||||
add r6d, 640
|
||||
sar r6d, 10
|
||||
.dconly2:
|
||||
vpbroadcastd m3, [o(dconly_10bpc)]
|
||||
imul r6d, 181
|
||||
add r6d, 2176
|
||||
sar r6d, 12
|
||||
vpbroadcastw m2, r6d
|
||||
; pre-add the dconly_10bpc bias to the DC value; together with the
; saturating add/sub in the loop this clamps the result
; (NOTE(review): the bias value itself is defined outside this view)
paddsw m2, m3
|
||||
.dconly_loop:
|
||||
paddsw m0, m2, [dstq+64*0]
|
||||
paddsw m1, m2, [dstq+64*1]
|
||||
psubusw m0, m3
|
||||
psubusw m1, m3
|
||||
mova [dstq+64*0], m0
|
||||
mova [dstq+64*1], m1
|
||||
add dstq, strideq
|
||||
dec r3d
|
||||
jg .dconly_loop
|
||||
ret
|
||||
; .pass1_load_spill: writes m0-m7 to cq rows 0-7 and m23..m16 (in that
; descending order) to cq rows 8-15, while loading cq rows 2, 6, 10 and 14
; into m0-m3. Each load is placed before the store that overwrites the same
; row, so the original coefficients are read before being replaced.
.pass1_load_spill:
|
||||
mova [cq+64* 0], m0
|
||||
mova [cq+64* 1], m1
|
||||
; m0/m1 were just spilled, so they can take rows 2 and 6 before those rows
; are overwritten below
mova m0, [cq+64* 2]
|
||||
mova m1, [cq+64* 6]
|
||||
mova [cq+64* 2], m2
|
||||
mova [cq+64* 3], m3
|
||||
mova [cq+64* 4], m4
|
||||
mova [cq+64* 5], m5
|
||||
mova [cq+64* 6], m6
|
||||
mova [cq+64* 7], m7
|
||||
; likewise, rows 10 and 14 are fetched before being overwritten
mova m2, [cq+64*10]
|
||||
mova m3, [cq+64*14]
|
||||
mova [cq+64* 8], m23
|
||||
mova [cq+64* 9], m22
|
||||
mova [cq+64*10], m21
|
||||
mova [cq+64*11], m20
|
||||
mova [cq+64*12], m19
|
||||
mova [cq+64*13], m18
|
||||
mova [cq+64*14], m17
|
||||
mova [cq+64*15], m16
|
||||
ret
|
||||
ALIGN function_align
|
||||
; .main_part1_fast: two-input variant of .main_part1. Only m0 and m3 are
; read; the multipliers at table slots r4+4*2..4*5 (used by .main_part1 for
; m1 and m2) are skipped. The four products are rounded with m13, shifted
; right by 12, and copied into m8/m1/m6/m2 in place of the sums and
; differences .main_part1 would compute, before joining .main_part1b.
.main_part1_fast:
|
||||
pmulld m7, m0, [r4+4*0]{bcstd} ; t63a
|
||||
pmulld m0, [r4+4*1]{bcstd} ; t32a
|
||||
pmulld m4, m3, [r4+4*6]{bcstd} ; t60a
|
||||
pmulld m3, [r4+4*7]{bcstd} ; t35a
|
||||
; m10/m11: coefficient pair for the first ITX_MULSUB_2D pair in .main_part1b
vpbroadcastd m10, [r4+4*8]
|
||||
vpbroadcastd m11, [r4+4*9]
|
||||
REPX {paddd x, m13}, m7, m0, m4, m3
|
||||
REPX {psrad x, 12 }, m7, m0, m4, m3
|
||||
mova m8, m0
|
||||
mova m1, m7
|
||||
mova m6, m3
|
||||
mova m2, m4
|
||||
jmp .main_part1b
|
||||
; .main_part1_rect2 / .main_part1 / .main_part1b
; Inputs: m0-m3 (four coefficient rows), r4 = cursor into the multiplier
; table (12 dwords consumed per call), r6 = cursor into the scratch area.
; .main_part1_rect2 first rounds (m13) and shifts m0-m3 right by 12, then
; falls into .main_part1. .main_part1b is the join point shared with
; .main_part1_fast.
; Outputs: eight vectors stored at [r6-64*4] .. [r6+64*3]; on return r4 has
; advanced by 4*12 bytes and r6 by 64*8 bytes.
; Clamping uses m14 (min) and m15 (max) after each add/sub stage.
.main_part1_rect2:
|
||||
REPX {paddd x, m13}, m0, m1, m2, m3
|
||||
REPX {psrad x, 12 }, m0, m1, m2, m3
|
||||
.main_part1: ; idct64 steps 1-5
|
||||
; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
|
||||
; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
|
||||
; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
|
||||
; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
|
||||
pmulld m7, m0, [r4+4*0]{bcstd} ; t63a
|
||||
pmulld m0, [r4+4*1]{bcstd} ; t32a
|
||||
pmulld m6, m1, [r4+4*2]{bcstd} ; t62a
|
||||
pmulld m1, [r4+4*3]{bcstd} ; t33a
|
||||
pmulld m5, m2, [r4+4*4]{bcstd} ; t61a
|
||||
pmulld m2, [r4+4*5]{bcstd} ; t34a
|
||||
pmulld m4, m3, [r4+4*6]{bcstd} ; t60a
|
||||
pmulld m3, [r4+4*7]{bcstd} ; t35a
|
||||
vpbroadcastd m10, [r4+4*8]
|
||||
vpbroadcastd m11, [r4+4*9]
|
||||
REPX {paddd x, m13}, m7, m0, m6, m1, m5, m2, m4, m3
|
||||
REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
|
||||
psubd m8, m0, m1 ; t33
|
||||
paddd m0, m1 ; t32
|
||||
psubd m1, m7, m6 ; t62
|
||||
paddd m7, m6 ; t63
|
||||
psubd m6, m3, m2 ; t34
|
||||
paddd m3, m2 ; t35
|
||||
psubd m2, m4, m5 ; t61
|
||||
paddd m4, m5 ; t60
|
||||
.main_part1b:
|
||||
REPX {pmaxsd x, m14}, m8, m1, m6, m2
|
||||
REPX {pminsd x, m15}, m8, m1, m6, m2
|
||||
ITX_MULSUB_2D 1, 8, 5, 9, _, 13, 10, 11 ; t33a, t62a
|
||||
ITX_MULSUB_2D 2, 6, 5, 9, _, 13, 10, 11, 2 ; t61a, t34a
|
||||
REPX {pmaxsd x, m14}, m0, m3, m7, m4
|
||||
REPX {pminsd x, m15}, m0, m3, m7, m4
|
||||
; next coefficient pair (table slots 10 and 11) for the second rotation stage
vpbroadcastd m10, [r4+4*10]
|
||||
vpbroadcastd m11, [r4+4*11]
|
||||
psubd m5, m0, m3 ; t35a
|
||||
paddd m0, m3 ; t32a
|
||||
psubd m3, m7, m4 ; t60a
|
||||
paddd m7, m4 ; t63a
|
||||
psubd m4, m1, m6 ; t34
|
||||
paddd m1, m6 ; t33
|
||||
psubd m6, m8, m2 ; t61
|
||||
paddd m8, m2 ; t62
|
||||
REPX {pmaxsd x, m14}, m5, m3, m4, m6
|
||||
REPX {pminsd x, m15}, m5, m3, m4, m6
|
||||
ITX_MULSUB_2D 3, 5, 2, 9, _, 13, 10, 11 ; t35, t60
|
||||
ITX_MULSUB_2D 6, 4, 2, 9, _, 13, 10, 11 ; t34a, t61a
|
||||
REPX {pmaxsd x, m14}, m0, m7, m1, m8
|
||||
REPX {pminsd x, m15}, m0, m7, m1, m8
|
||||
; advance the multiplier cursor past the 12 dwords used by this call
add r4, 4*12
|
||||
; store the eight results around r6 (four slots below, four at/above)
mova [r6-64*4], m0
|
||||
mova [r6+64*3], m7
|
||||
mova [r6-64*3], m1
|
||||
mova [r6+64*2], m8
|
||||
mova [r6-64*2], m6
|
||||
mova [r6+64*1], m4
|
||||
mova [r6-64*1], m3
|
||||
mova [r6+64*0], m5
|
||||
; move the scratch cursor to the next group of eight slots
add r6, 64*8
|
||||
ret
|
||||
; .main_part2: in-place update of the scratch area filled by the four
; preceding .main_part1* calls. On entry r6 is the cursor left by the last
; of those calls. r6 and r4 are set up as two pointers that move towards
; each other by one 64-byte slot per iteration; the loop ends once r6 is no
; longer below r4. Each iteration loads eight vectors at fixed negative
; offsets from r6/r4, combines them, and stores eight results back to the
; same set of addresses (in a different assignment).
; Uses m12 as multiplier, m13 as rounding term, m14/m15 as clamp bounds;
; clobbers m0-m11 and r4/r6.
.main_part2: ; idct64 steps 6-9
|
||||
lea r4, [r6+64*3]
|
||||
sub r6, 64*4
|
||||
; loop-invariant coefficient pair for the ITX_MULSUB_2D rotations
vpbroadcastd m10, [pd_1567]
|
||||
vpbroadcastd m11, [pd_3784]
|
||||
.main_part2_loop:
|
||||
mova m0, [r6-64*32] ; t32a
|
||||
mova m1, [r4-64*24] ; t39a
|
||||
mova m2, [r4-64*32] ; t63a
|
||||
mova m3, [r6-64*24] ; t56a
|
||||
mova m4, [r6-64*16] ; t40a
|
||||
mova m5, [r4-64* 8] ; t47a
|
||||
mova m6, [r4-64*16] ; t55a
|
||||
mova m7, [r6-64* 8] ; t48a
|
||||
psubd m8, m0, m1 ; t39
|
||||
paddd m0, m1 ; t32
|
||||
psubd m1, m2, m3 ; t56
|
||||
paddd m2, m3 ; t63
|
||||
psubd m3, m5, m4 ; t40
|
||||
paddd m5, m4 ; t47
|
||||
psubd m4, m7, m6 ; t55
|
||||
paddd m7, m6 ; t48
|
||||
REPX {pmaxsd x, m14}, m8, m1, m3, m4
|
||||
REPX {pminsd x, m15}, m8, m1, m3, m4
|
||||
ITX_MULSUB_2D 1, 8, 6, 9, _, 13, 10, 11 ; t39a, t56a
|
||||
ITX_MULSUB_2D 4, 3, 6, 9, _, 13, 10, 11, 2 ; t55a, t40a
|
||||
REPX {pmaxsd x, m14}, m0, m2, m5, m7
|
||||
REPX {pminsd x, m15}, m0, m5, m2, m7
|
||||
psubd m6, m2, m7 ; t48a
|
||||
paddd m2, m7 ; t63a
|
||||
psubd m7, m0, m5 ; t47a
|
||||
paddd m0, m5 ; t32a
|
||||
psubd m5, m8, m4 ; t55
|
||||
paddd m8, m4 ; t56
|
||||
psubd m4, m1, m3 ; t40
|
||||
paddd m1, m3 ; t39
|
||||
REPX {pmaxsd x, m14}, m6, m7, m5, m4
|
||||
REPX {pminsd x, m15}, m6, m7, m5, m4
|
||||
; scale the four difference terms by m12 before the final sum/difference
REPX {pmulld x, m12}, m6, m7, m5, m4
|
||||
REPX {pmaxsd x, m14}, m2, m0, m8, m1
|
||||
REPX {pminsd x, m15}, m2, m0, m8, m1
|
||||
; rounding is added to one operand of each pair only, so each sum and each
; difference below receives it exactly once
paddd m6, m13
|
||||
paddd m5, m13
|
||||
psubd m3, m6, m7 ; t47
|
||||
paddd m6, m7 ; t48
|
||||
psubd m7, m5, m4 ; t40a
|
||||
paddd m5, m4 ; t55a
|
||||
REPX {psrad x, 12}, m3, m6, m7, m5
|
||||
mova [r4-64* 8], m2
|
||||
mova [r6-64*32], m0
|
||||
mova [r6-64* 8], m8
|
||||
mova [r4-64*32], m1
|
||||
mova [r4-64*24], m3
|
||||
mova [r6-64*16], m6
|
||||
mova [r6-64*24], m7
|
||||
mova [r4-64*16], m5
|
||||
; step the two pointers towards each other
add r6, 64
|
||||
sub r4, 64
|
||||
cmp r6, r4
|
||||
jl .main_part2_loop
|
||||
ret
|
||||
; .idct64_end: common tail of the .full and mid-eob paths.
; Expects: values spilled to cq rows 0-15 by .pass1_load_spill, values in
; m0-m7/m16-m23 left by the preceding helper call, idct64 terms in stack
; slots 0-31, and r6d = start index for .zero_loop (12*8 or 28*8).
; Steps: combine everything into 16-bit packed outputs, clear the used rows
; of cq, then run .pass2 twice: first on the register-resident half writing
; at dst, then on the half stored in stack slots 0-15 writing at dst+64.
.idct64_end:
|
||||
; IDCT64_PASS1_END c_row, reg, slot_lo, slot_hi, tmp0, tmp1, tmp2, tmp3
; Combines m%2 with the vector spilled at cq row %1 and with the two stack
; slots %3/%4, producing four outputs in m%5, m%6, m%7, m%8.
; m11 is used both as the addend before the shift and as the per-lane
; shift count (it is loaded from pd_2 below).
%macro IDCT64_PASS1_END 8
|
||||
mova m%5, [cq+%1*64] ; t0+n [idct32] + idct64 rounding
|
||||
psubd m%6, m%5, m%2 ; out31-n [idct32] = t31-n [idct64]
|
||||
paddd m%5, m%2 ; out0+n [idct32] = t0+n [idct64]
|
||||
REPX {pmaxsd x, m14}, m%6, m%5
|
||||
REPX {pminsd x, m15}, m%6, m%5
|
||||
REPX {paddd x, m11}, m%6, m%5
|
||||
mova m%2, [rsp+%3*64] ; t32+n [idct64]
|
||||
mova m%7, [rsp+%4*64] ; t63-n [idct64]
|
||||
psubd m%8, m%5, m%7 ; out63-n
|
||||
paddd m%5, m%7 ; out0+n
|
||||
psubd m%7, m%6, m%2 ; out32+n
|
||||
paddd m%6, m%2 ; out31-n
|
||||
REPX {vpsravd x, m11}, m%8, m%5, m%7, m%6
|
||||
%endmacro
|
||||
|
||||
; IDCT64_PASS1_ENDx4 n (n = 0..3)
; Runs IDCT64_PASS1_END four times using m24-m31 as temporaries and packs
; the results to 16 bits. Two packed vectors per pair stay in registers
; (numbers %%r1..%%r4, i.e. within m0-m7 and m16-m23); the other two are
; written to stack slots %%m1..%%m4 (within slots 0-15), overwriting the
; idct64 terms stored there.
%macro IDCT64_PASS1_ENDx4 1
|
||||
%assign %%m1 %1 ; t32+n
|
||||
%assign %%m2 (7-%1) ; t39-n
|
||||
%assign %%m3 (8+%1) ; t40+n
|
||||
%assign %%m4 (15-%1) ; t47-n
|
||||
%assign %%m5 (16+%1) ; t48+n
|
||||
%assign %%m6 (23-%1) ; t55-n
|
||||
%assign %%m7 (24+%1) ; t56+n
|
||||
%assign %%m8 (31-%1) ; t63-n
|
||||
|
||||
%assign %%r1 %1 ; t16+n
|
||||
%assign %%r2 (7-%1) ; t23-n
|
||||
%assign %%r3 (16+%1) ; t24-n
|
||||
%assign %%r4 (23-%1) ; t31-n
|
||||
|
||||
%assign %%c1 (%1) ; t0+n
|
||||
%assign %%c2 (7-%1) ; t7-n
|
||||
%assign %%c3 (15-%1) ; t8+n
|
||||
%assign %%c4 (8+%1) ; t15-n
|
||||
|
||||
IDCT64_PASS1_END %%c1, %%r4, %%m1, %%m8, 24, 25, 26, 27 ; out0/31/32/63
|
||||
IDCT64_PASS1_END %%c4, %%r1, %%m4, %%m5, 28, 29, 30, 31 ; out15/16/47/48
|
||||
packssdw m %+ %%r1, m24, m29
|
||||
packssdw m %+ %%r4, m28, m25
|
||||
packssdw m26, m31
|
||||
packssdw m30, m27
|
||||
mova [rsp+%%m1*mmsize], m26
|
||||
mova [rsp+%%m4*mmsize], m30
|
||||
IDCT64_PASS1_END %%c2, %%r3, %%m2, %%m7, 24, 25, 26, 27 ; out7/24/39/56
|
||||
IDCT64_PASS1_END %%c3, %%r2, %%m3, %%m6, 28, 29, 30, 31 ; out8/23/40/55
|
||||
packssdw m %+ %%r2, m24, m29
|
||||
packssdw m %+ %%r3, m28, m25
|
||||
packssdw m26, m31
|
||||
packssdw m30, m27
|
||||
mova [rsp+%%m2*mmsize], m26
|
||||
mova [rsp+%%m3*mmsize], m30
|
||||
%endmacro
|
||||
|
||||
vpbroadcastd m11, [o(pd_2)]
|
||||
; switch r5 to the 8bpc constant base for the 8bpc routines .pass2 calls;
; after this, o() no longer addresses the 10bpc constants
lea r5, [o_base_8bpc]
|
||||
IDCT64_PASS1_ENDx4 0
|
||||
IDCT64_PASS1_ENDx4 1
|
||||
IDCT64_PASS1_ENDx4 2
|
||||
IDCT64_PASS1_ENDx4 3
|
||||
|
||||
pxor m12, m12
|
||||
; clear the coefficient buffer four 64-byte rows at a time, starting at
; byte offset r6*8 and stepping down until offset 0 has been cleared
.zero_loop:
|
||||
REPX {mova [cq+r6*8+64*x], m12}, 0, 1, 2, 3
|
||||
sub r6d, 8*4
|
||||
jge .zero_loop
|
||||
|
||||
lea r3, [strideq*3]
|
||||
; keep the original dst so the second call can be positioned at dst+64
mov r4, dstq
|
||||
call .pass2
|
||||
; reload the packed vectors stored to the stack by IDCT64_PASS1_ENDx4
mova m0, [rsp+ 0*mmsize]
|
||||
mova m1, [rsp+ 1*mmsize]
|
||||
mova m2, [rsp+ 2*mmsize]
|
||||
mova m3, [rsp+ 3*mmsize]
|
||||
mova m4, [rsp+ 4*mmsize]
|
||||
mova m5, [rsp+ 5*mmsize]
|
||||
mova m6, [rsp+ 6*mmsize]
|
||||
mova m7, [rsp+ 7*mmsize]
|
||||
mova m16, [rsp+ 8*mmsize]
|
||||
mova m17, [rsp+ 9*mmsize]
|
||||
mova m18, [rsp+10*mmsize]
|
||||
mova m19, [rsp+11*mmsize]
|
||||
mova m20, [rsp+12*mmsize]
|
||||
mova m21, [rsp+13*mmsize]
|
||||
mova m22, [rsp+14*mmsize]
|
||||
mova m23, [rsp+15*mmsize]
|
||||
lea dstq, [r4+64]
|
||||
call .pass2
|
||||
RET
|
||||
; .pass2: second pass for one half of the block held in m0-m7/m16-m23.
; Transposes via the 32x16 10bpc helper (m12/m13 are loaded with the
; permutation indices it uses), regroups the qwords into rows 0-15, and runs
; the 8bpc 32x8 main and 32x16 odd-half routines.
; Constants here are addressed directly instead of through o(); the callers
; have r5 pointing at o_base_8bpc at this point.
; .write: shared output stage (also entered from .pass2_fast). Uses
; m11 = pw_2048 for pmulhrsw scaling, m12 = 0 and m13 = pixel_10bpc_max,
; and finishes with a tail jump, so write_32x4's ret returns to the caller.
.pass2:
|
||||
psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11
|
||||
psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15
|
||||
call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
|
||||
|
||||
punpckhqdq m19, m5, m16 ; 11
|
||||
punpcklqdq m5, m16 ; 10
|
||||
punpckhqdq m16, m2, m1 ; 5
|
||||
punpcklqdq m2, m1 ; 4
|
||||
punpcklqdq m1, m15, m4 ; 2
|
||||
punpckhqdq m15, m4 ; 3
|
||||
punpcklqdq m4, m14, m18 ; 8
|
||||
punpckhqdq m18, m14, m18 ; 9
|
||||
punpckhqdq m14, m0, m20 ; 1
|
||||
punpcklqdq m0, m20 ; 0
|
||||
punpckhqdq m20, m6, m17 ; 13
|
||||
punpcklqdq m6, m17 ; 12
|
||||
punpckhqdq m17, m3, m21 ; 7
|
||||
punpcklqdq m3, m21 ; 6
|
||||
punpckhqdq m21, m7, m8 ; 15
|
||||
punpcklqdq m7, m8 ; 14
|
||||
|
||||
call m(inv_txfm_add_dct_dct_32x8_8bpc).main
|
||||
call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
|
||||
.write:
|
||||
vpbroadcastd m11, [pw_2048]
|
||||
pxor m12, m12
|
||||
vpbroadcastd m13, [pixel_10bpc_max]
|
||||
call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
|
||||
; m14-m17 are scaled into m0-m3 for the next four-row write
pmulhrsw m0, m11, m14
|
||||
pmulhrsw m1, m11, m15
|
||||
pmulhrsw m2, m11, m16
|
||||
pmulhrsw m3, m11, m17
|
||||
call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
|
||||
; m18-m21 are scaled into m0-m3 for the final four-row write
pmulhrsw m0, m11, m18
|
||||
pmulhrsw m1, m11, m19
|
||||
pmulhrsw m2, m11, m20
|
||||
pmulhrsw m3, m11, m21
|
||||
jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
|
||||
; .fast: path taken when eob < 36. Only the low 256 bits (ymm) of
; coefficient rows 0-7 are read, and the same region is zeroed afterwards.
; Odd rows 1/5/7/3 are packed pairwise and processed by
; .main_oddhalf_packed, whose 16 result vectors are parked in stack slots
; 0-15; even rows 0/4/2/6 go through the 32x8 .main_fast3 helper.
; .main_end then combines both halves, and .pass2_fast is run twice:
; first at dst with m0-m7, then at dst+64 with m24-m31 copied into m0-m7.
.fast: ; 8x8 packed
|
||||
movshdup m7, [o(permB)]
|
||||
mova ym0, [cq+64*1]
|
||||
mova ym2, [cq+64*5]
|
||||
mova ym3, [cq+64*3]
|
||||
mova ym1, [cq+64*7]
|
||||
vpermt2q m0, m7, m2 ; 1 5
|
||||
vpermt2q m1, m7, m3 ; 7 3
|
||||
call .main_oddhalf_packed
|
||||
; park the 16 results in stack slots 0-15; .main_end reads them back
mova [rsp+ 0*mmsize], m0
|
||||
mova [rsp+ 1*mmsize], m1
|
||||
mova [rsp+ 2*mmsize], m2
|
||||
mova [rsp+ 3*mmsize], m3
|
||||
mova [rsp+ 4*mmsize], m4
|
||||
mova [rsp+ 5*mmsize], m5
|
||||
mova [rsp+ 6*mmsize], m6
|
||||
mova [rsp+ 7*mmsize], m7
|
||||
mova [rsp+ 8*mmsize], m16
|
||||
mova [rsp+ 9*mmsize], m17
|
||||
mova [rsp+10*mmsize], m18
|
||||
mova [rsp+11*mmsize], m19
|
||||
mova [rsp+12*mmsize], m20
|
||||
mova [rsp+13*mmsize], m21
|
||||
mova [rsp+14*mmsize], m22
|
||||
mova [rsp+15*mmsize], m23
|
||||
|
||||
; m7 was overwritten by the call above, so the permutation is reloaded
movshdup m7, [o(permB)]
|
||||
mova ym0, [cq+64*0]
|
||||
mova ym4, [cq+64*4]
|
||||
mova ym16, [cq+64*2]
|
||||
mova ym5, [cq+64*6]
|
||||
vpermt2q m16, m7, m5 ; 2 6
|
||||
vpermq m0, m7, m0 ; 0 0
|
||||
vpermq m4, m7, m4 ; 4 4
|
||||
call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
|
||||
; m0-7,9,16-22 contain un-sumsub'ed dct32 output data
|
||||
|
||||
; zero input coefs
|
||||
pxor m12, m12
|
||||
REPX {mova [cq+x*64], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
; m11 is the addend and shift count used inside .main_end
vpbroadcastd m11, [o(pd_2)]
|
||||
call .main_end
|
||||
lea r3, [strideq*3]
|
||||
; keep the original dst so the second call can be positioned at dst+64
mov r4, dstq
|
||||
call .pass2_fast
|
||||
; second half: the packed results .main_end left in m24-m31
mova m0, m24
|
||||
mova m1, m25
|
||||
mova m2, m26
|
||||
mova m3, m27
|
||||
mova m4, m28
|
||||
mova m5, m29
|
||||
mova m6, m30
|
||||
mova m7, m31
|
||||
lea dstq, [r4+64]
|
||||
; .pass2_fast left r5 at o_base_8bpc; point it back at o_base before
; calling it again
lea r5, [o_base]
|
||||
call .pass2_fast
|
||||
RET
|
||||
; .pass2_fast: second pass for the .fast path, operating on m0-m7.
; Transposes via the 32x8 10bpc helper, switches r5 to the 8bpc constant
; base, regroups the qwords into rows 0-7 (even rows in m0-m3, odd rows in
; m14-m17), runs the 8bpc 32x16 .main_oddhalf_fast routine and jumps to the
; shared .write stage. Leaves r5 = o_base_8bpc.
.pass2_fast:
|
||||
call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
|
||||
lea r5, [o_base_8bpc]
|
||||
punpckhqdq m14, m0, m2 ; 1
|
||||
punpcklqdq m0, m2 ; 0
|
||||
punpcklqdq m1, m3, m4 ; 2
|
||||
punpckhqdq m15, m3, m4 ; 3
|
||||
punpcklqdq m2, m5, m7 ; 4
|
||||
punpckhqdq m16, m5, m7 ; 5
|
||||
punpcklqdq m3, m6, m8 ; 6
|
||||
punpckhqdq m17, m6, m8 ; 7
|
||||
call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
|
||||
jmp .write
|
||||
; .main_end: final combine for the .fast path (reached via call).
; Expects the packed dct32 data left in registers by .main_fast3, the 16
; vectors stored by .fast in stack slots 0-15, m11 = addend/shift count and
; m14/m15 = clamp bounds. Returns 16-bit packed results in m0-m7 and
; m24-m31.
.main_end:
|
||||
|
||||
; IDCT64_PASS1_PACKED_END a, b, t0, t1, t2, slot_lo, slot_hi
; Forms sum/difference of m%1 and m%2, clamps, adds m11, then combines with
; stack slots %6/%7. The stack offsets include gprsize because this code
; runs inside a call, so the return address sits at [rsp].
; Results end up in m%1, m%2, m%3 and m%4; m%5 is a temporary.
%macro IDCT64_PASS1_PACKED_END 7
|
||||
psubd m%5, m%1, m%2 ; out31-n [idct32] = t31-n [idct64]
|
||||
paddd m%1, m%2 ; out0+n [idct32] = t0+n [idct64]
|
||||
REPX {pmaxsd x, m14}, m%5, m%1
|
||||
REPX {pminsd x, m15}, m%5, m%1
|
||||
REPX {paddd x, m11}, m%5, m%1
|
||||
mova m%2, [rsp+%6*64+gprsize] ; t32+n [idct64]
|
||||
mova m%3, [rsp+%7*64+gprsize] ; t63-n [idct64]
|
||||
psubd m%4, m%1, m%3 ; out63-n
|
||||
paddd m%1, m%3 ; out0+n
|
||||
psubd m%3, m%5, m%2 ; out32+n
|
||||
paddd m%2, m%5 ; out31-n
|
||||
REPX {vpsravd x, m11}, m%4, m%1, m%3, m%2
|
||||
%endmacro
|
||||
|
||||
; the macro is run in pairs; each pair is followed by four packssdw that
; narrow the eight 32-bit result vectors into four 16-bit vectors
IDCT64_PASS1_PACKED_END 0, 22, 24, 10, 12, 0, 15 ; out0/1,31/30,32/33,63/62
|
||||
IDCT64_PASS1_PACKED_END 7, 9, 31, 13, 12, 7, 8 ; out15/14,16/17,47/46,48/49
|
||||
packssdw m0, m9
|
||||
packssdw m7, m22
|
||||
packssdw m24, m13
|
||||
packssdw m31, m10
|
||||
IDCT64_PASS1_PACKED_END 1, 21, 25, 10, 12, 1, 14 ; out3/2,28/29,35/34,60/61
|
||||
IDCT64_PASS1_PACKED_END 6, 16, 30, 13, 12, 6, 9 ; out12/13,19/18,44/45,51/50
|
||||
packssdw m1, m16
|
||||
packssdw m6, m21
|
||||
packssdw m25, m13
|
||||
packssdw m30, m10
|
||||
IDCT64_PASS1_PACKED_END 2, 20, 26, 10, 12, 2, 13 ; out4/5,27/26,36/37,59/58
|
||||
IDCT64_PASS1_PACKED_END 5, 17, 29, 13, 12, 5, 10 ; out11/10,20/21,43/42,52/53
|
||||
packssdw m2, m17
|
||||
packssdw m5, m20
|
||||
packssdw m26, m13
|
||||
packssdw m29, m10
|
||||
IDCT64_PASS1_PACKED_END 3, 19, 27, 10, 12, 3, 12 ; out7/6,24/25,39/38,56/57
|
||||
IDCT64_PASS1_PACKED_END 4, 18, 28, 13, 12, 4, 11 ; out8/9,23/22,40/41,55/54
|
||||
packssdw m3, m18
|
||||
packssdw m4, m19
|
||||
packssdw m27, m13
|
||||
packssdw m28, m10
|
||||
ret
|
||||
; .main_oddhalf_packed: odd-half computation for the .fast path, working on
; packed register pairs (two terms per register, as the trailing comments
; note with the a/b notation).
; Inputs: m0 and m1 (layout in the comment below); m12 = multiplier,
; m13 = rounding term, m14/m15 = clamp bounds.
; Outputs: 16 vectors in m0-m7 and m16-m23 (see the final register-map
; comments). Clobbers m8-m11 as temporaries and coefficient registers.
.main_oddhalf_packed:
|
||||
; m0=in1 in5, m1=in7 in3
|
||||
vbroadcasti32x4 m2, [o(pd_101_501)]
|
||||
vbroadcasti32x4 m3, [o(pd_m700_m301)]
|
||||
vbroadcasti32x4 m4, [o(pd_4095_4065)]
|
||||
vbroadcasti32x4 m5, [o(pd_4036_4085)]
|
||||
pmulld m2, m0
|
||||
pmulld m3, m1
|
||||
pmulld m0, m4
|
||||
pmulld m1, m5
|
||||
REPX {paddd x, m13}, m2, m3, m0, m1
|
||||
REPX {psrad x, 12 }, m2, m3, m0, m1
|
||||
|
||||
; m2=t32a t40a -> t32/33 t40/41, m3=t39a t47a -> t38/39 t46/47
|
||||
; m0=t63a t55a -> t62/63 t54/55, m1=t56a t48a -> t56/57 t48/49
|
||||
; end of step 1-2
|
||||
|
||||
vbroadcasti32x4 m10, [o(pd_401_1931)]
|
||||
vbroadcasti32x4 m11, [o(pd_4076_3612)]
|
||||
; work on copies so m0-m3 stay intact for the unpack stage below
mova m4, m0
|
||||
mova m5, m2
|
||||
ITX_MULSUB_2D 4, 5, 8, 9, _, 13, 10, 11
|
||||
vbroadcasti32x4 m10, [o(pd_3166_3920)]
|
||||
vbroadcasti32x4 m11, [o(pd_2598_1189)]
|
||||
mova m6, m3
|
||||
mova m7, m1
|
||||
ITX_MULSUB_2D 7, 6, 8, 9, _, 13, 10, 11, 2
|
||||
|
||||
; m4=t33a t41a -> t41/42 t33/34, m5=t63a t54a -> t61/62 t53/54
|
||||
; m6=t38a t46a -> t37/38 t45/46, m7=t57a t49a -> t57/58 t49/50
|
||||
; and from earlier:
|
||||
; m0=t63 t55 -> t60/63a t52/55a, m1=t56 t48 -> t56/59a t48/51a
|
||||
; m2=t32 t40 -> t32/35a t40/43a, m3=t39 t47 -> t36/39a t44/47a
|
||||
; end of step 3-4
|
||||
|
||||
punpcklqdq m22, m2, m4 ; t32a/33 or t35a/34
|
||||
punpcklqdq m21, m3, m6 ; t36a/37 or t39a/38
|
||||
punpckhqdq m18, m2, m4 ; t40a/41 or t43a/42
|
||||
punpckhqdq m17, m3, m6 ; t44a/45 or t47a/46
|
||||
punpckhqdq m6, m1, m7 ; t48a/49 or t51a/50
|
||||
punpckhqdq m19, m0, m5 ; t52a/53 or t55a/54
|
||||
punpcklqdq m8, m1, m7 ; t56a/57 or t59a/58
|
||||
punpcklqdq m23, m0, m5 ; t60a/61 or t63a/62
|
||||
; duplicate each unpacked register so both roles listed in the "or"
; comments above have their own copy
mova m0, m22
|
||||
mova m7, m21
|
||||
mova m3, m18
|
||||
mova m16, m17
|
||||
mova m5, m6
|
||||
mova m4, m19
|
||||
mova m2, m8
|
||||
mova m1, m23
|
||||
; m0/22/7/21,18/3/17/16,6/5/19/4,2/8/1/23: t32-63[a]
|
||||
|
||||
; step5
|
||||
vpbroadcastd m10, [o(pd_799)]
|
||||
vpbroadcastd m11, [o(pd_4017)]
|
||||
ITX_MULSUB_2D 1, 22, 20, 9, _, 13, 10, 11 ; t35/34a, t60/61a
|
||||
ITX_MULSUB_2D 8, 7, 20, 9, _, 13, 10, 11, 2 ; t59/58a, t36/37a
|
||||
vpbroadcastd m10, [o(pd_3406)]
|
||||
vpbroadcastd m11, [o(pd_2276)]
|
||||
ITX_MULSUB_2D 19, 3, 20, 9, _, 13, 10, 11 ; t43/42a, t52/53a
|
||||
ITX_MULSUB_2D 5, 17, 20, 9, _, 13, 10, 11, 2 ; t51/50a, t44/45a
|
||||
; m0-1/7/21: t32-39[a], m18-19/17-16: t40-47[a]
|
||||
; m6-5/3-4: t48-55[a], m2/8/22-23: t56-63[a]
|
||||
|
||||
; step6
; (the clamps of each group are interleaved with the add/sub of the next)
|
||||
psubd m20, m0, m21 ; t39/38a
|
||||
paddd m0, m21 ; t32/33a
|
||||
psubd m21, m1, m7 ; t36a/37
|
||||
paddd m1, m7 ; t35a/34
|
||||
REPX {pmaxsd x, m14}, m20, m0, m21, m1
|
||||
psubd m7, m16, m18 ; t40/41a
|
||||
paddd m16, m18 ; t47/46a
|
||||
REPX {pminsd x, m15}, m20, m0, m21, m1
|
||||
psubd m18, m17, m19 ; t43a/42
|
||||
paddd m17, m19 ; t44a/45
|
||||
REPX {pmaxsd x, m14}, m7, m16, m18, m17
|
||||
psubd m19, m6, m4 ; t55/54a
|
||||
paddd m6, m4 ; t48/49a
|
||||
REPX {pminsd x, m15}, m7, m16, m18, m17
|
||||
psubd m4, m5, m3 ; t52a/53
|
||||
paddd m5, m3 ; t51a/50
|
||||
REPX {pmaxsd x, m14}, m19, m6, m4, m5
|
||||
psubd m3, m23, m2 ; t56/57a
|
||||
paddd m23, m2 ; t63/62a
|
||||
REPX {pminsd x, m15}, m19, m6, m4, m5
|
||||
psubd m2, m22, m8 ; t59a/58
|
||||
paddd m22, m8 ; t60a/61
|
||||
REPX {pmaxsd x, m14}, m3, m23, m2, m22
|
||||
REPX {pminsd x, m15}, m3, m23, m2, m22
|
||||
; m0-1: t32-35[a], m17-16: t44-47[a], m6-5: t48-51[a], m22-23: t60-63[a]
|
||||
; m21-20: t36-39[a], m7/18: t40-43[a], m4/19: t52-55[a], m3-2: t56-59[a]
|
||||
|
||||
; step7
|
||||
vpbroadcastd m10, [o(pd_1567)]
|
||||
vpbroadcastd m11, [o(pd_3784)]
|
||||
ITX_MULSUB_2D 2, 21, 8, 9, _, 13, 10, 11 ; t36/37a, t59/58a
|
||||
ITX_MULSUB_2D 3, 20, 8, 9, _, 13, 10, 11 ; t39a/38, t56a/57
|
||||
ITX_MULSUB_2D 19, 7, 8, 9, _, 13, 10, 11, 2 ; t55a/54, t40a/41
|
||||
ITX_MULSUB_2D 4, 18, 8, 9, _, 13, 10, 11, 2 ; t52/53a, t43/42a
|
||||
; m0-3: t32-39[a], m7,18-16: t40-47[a], m6-4,19: t48-55[a], m20-23: t56-63[a]
|
||||
|
||||
; step8
|
||||
psubd m8, m0, m16 ; t47a/46
|
||||
paddd m0, m16 ; t32a/33
|
||||
psubd m16, m1, m17 ; t44/45a
|
||||
paddd m1, m17 ; t35/34a
|
||||
REPX {pmaxsd x, m14}, m8, m0, m16, m1
|
||||
psubd m17, m2, m18 ; t43a/42
|
||||
paddd m2, m18 ; t36a/37
|
||||
REPX {pminsd x, m15}, m8, m0, m16, m1
|
||||
psubd m18, m3, m7 ; t40/41a
|
||||
paddd m3, m7 ; t39/38a
|
||||
REPX {pmaxsd x, m14}, m17, m2, m18, m3
|
||||
psubd m7, m23, m6 ; t48a/49
|
||||
paddd m23, m6 ; t63a/62
|
||||
REPX {pminsd x, m15}, m17, m2, m18, m3
|
||||
psubd m6, m22, m5 ; t51/50a
|
||||
paddd m22, m5 ; t60/61a
|
||||
REPX {pmaxsd x, m14}, m7, m23, m6, m22
|
||||
psubd m5, m21, m4 ; t52a/53
|
||||
paddd m21, m4 ; t59a/58
|
||||
REPX {pminsd x, m15}, m7, m23, m6, m22
|
||||
psubd m4, m20, m19 ; t55/54a
|
||||
paddd m20, m19 ; t56/57a
|
||||
REPX {pmaxsd x, m14}, m5, m21, m4, m20
|
||||
REPX {pminsd x, m15}, m5, m21, m4, m20
|
||||
; m0-3=t32-39[a], m18-16,8: t40-47[a], m7-4=t48-55[a], m20-23=t56-63[a]
|
||||
|
||||
; step9
; scale all eight terms by m12; rounding is added to one operand of each
; pair only, so every sum and difference below receives it exactly once
|
||||
REPX {pmulld x, m12}, m4, m18, m5, m17, m6, m16, m7, m8
|
||||
REPX {paddd x, m13}, m4, m5, m6, m7
|
||||
paddd m19, m4, m18 ; t55a/54
|
||||
psubd m4, m18 ; t40a/41
|
||||
paddd m18, m5, m17 ; t52/53a
|
||||
psubd m5, m17 ; t43/42a
|
||||
paddd m17, m6, m16 ; t51a/50
|
||||
psubd m6, m16 ; t44a/45
|
||||
paddd m16, m7, m8 ; t48/49a
|
||||
psubd m7, m8 ; t47/46a
|
||||
REPX {psrad x, 12 }, m19, m4, m18, m5, m17, m6, m16, m7
|
||||
; m4-7=t40-47[a], m16-19=t48-55[a]
|
||||
ret
|
||||
|
||||
%endif ; ARCH_X86_64
|
||||
|
||||
Reference in New Issue
Block a user