x86/itx: add eob-based fast path to 4x16 hbd/sse4 itx

2026-06-11 04:03:05 +00:00 · 2021-07-28 09:10:14 -04:00
parent e7228e8013
commit a804d43004
1 changed files with 99 additions and 31 deletions
@@ -103,6 +103,10 @@ cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end
 cextern idct_8x4_internal_8bpc_ssse3.main
 cextern iadst_8x4_internal_8bpc_ssse3.main

+tbl_4x16_2d: db 0, 13, 29, 45
+tbl_4x16_h: db 0, 16, 32, 48
+tbl_4x16_v: db 0, 4, 8, 12
+
 SECTION .text

 %macro REPX 2-*
@@ -264,8 +268,12 @@ cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2
    jz %%end
 %endif
    lea                tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
+%ifnum %3
 %if %3
    add                eobd, %3
+%endif
+%else
+    lea                  r5, [o(%3)]
 %endif
    call %%p1
    RET
@@ -274,13 +282,17 @@ cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2
    ; Jump to the 1st txfm function if we're not taking the fast path, which
    ; in turn performs an indirect jump to the 2nd txfm function.
    lea                tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
+%ifnum %3
+%if %3
+    add                eobd, %3
+%endif
+%else
+    lea                  r5, [o(%3)]
+%endif
 %ifidn %1_%2, dct_dct
    test               eobd, eobd
    jnz %%p1
 %else
-%if %3
-    add                eobd, %3
-%endif
    ; jump to the 1st txfm function unless it's located directly after this
    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
 ALIGN function_align
@@ -875,8 +887,8 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    mova                 m4, [o(pw_4096)]
    jmp m(idct_4x8_internal_16bpc).end

-%macro INV_TXFM_4X16_FN 2 ; type1, type2
-    INV_TXFM_FN          %1, %2, 0, 4x16
+%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2
+    INV_TXFM_FN          %1, %2, tbl_4x16_%3, 4x16
 %ifidn %1_%2, dct_dct
    imul                r5d, [cq], 2896
    mov                [cq], eobd ; 0
@@ -888,18 +900,32 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
 %endmacro

 INV_TXFM_4X16_FN dct, dct
-INV_TXFM_4X16_FN dct, identity
+INV_TXFM_4X16_FN dct, identity, v
 INV_TXFM_4X16_FN dct, adst
 INV_TXFM_4X16_FN dct, flipadst

 cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+    mov                 r5m, r6d
+%endif
+    mov                 r6d, 4
+.zero_loop:
+    dec                 r6d
+    cmp                eobb, byte [r5+r6]
+    jl .zero_loop
+    mov                 r5d, r6d
+    shl                 r5d, 4
+%if ARCH_X86_32
+    ; restore pic-ptr
+    mov                  r6, r5m
+%endif
    mova                 m5, [o(pd_2048)]
-    mov                 r3d, 48
 .loop_pass1:
-    mova                 m0, [cq+64*0+r3]
-    mova                 m1, [cq+64*1+r3]
-    mova                 m2, [cq+64*2+r3]
-    mova                 m3, [cq+64*3+r3]
+    mova                 m0, [cq+64*0+r5]
+    mova                 m1, [cq+64*1+r5]
+    mova                 m2, [cq+64*2+r5]
+    mova                 m3, [cq+64*3+r5]
    call m(idct_4x4_internal_16bpc).pass1_main
    pcmpeqd              m3, m3
    REPX      {psubd x, m3}, m0, m1, m4, m2
@@ -910,11 +936,11 @@ cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    punpcklwd            m0, m4
    punpckhwd            m1, m0, m2
    punpcklwd            m0, m2
-    test                r3d, r3d
+    test                r5d, r5d
    jz .end_pass1
-    mova       [cq+64*0+r3], m0
-    mova       [cq+64*1+r3], m1
-    sub                 r3d, 16
+    mova       [cq+64*0+r5], m0
+    mova       [cq+64*1+r5], m1
+    sub                 r5d, 16
    jmp .loop_pass1
 .end_pass1:
    mova                 m2, [cq+64*0+16]
@@ -985,10 +1011,24 @@ cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
 INV_TXFM_4X16_FN adst, dct
 INV_TXFM_4X16_FN adst, adst
 INV_TXFM_4X16_FN adst, flipadst
-INV_TXFM_4X16_FN adst, identity
+INV_TXFM_4X16_FN adst, identity, v

 cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
-    mov                 r5d, 48
+%undef cmp
+%if ARCH_X86_32
+    mov                 r5m, r6d
+%endif
+    mov                 r6d, 4
+.zero_loop:
+    dec                 r6d
+    cmp                eobb, byte [r6+r5]
+    jl .zero_loop
+    mov                 r5d, r6d
+    shl                 r5d, 4
+%if ARCH_X86_32
+    ; restore pic-ptr
+    mov                  r6, r5m
+%endif
 .loop_pass1:
    mova                 m5, [cq+64*0+r5]
    lea                  r3, [cq+64*1+r5]
@@ -1044,10 +1084,24 @@ cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
 INV_TXFM_4X16_FN flipadst, dct
 INV_TXFM_4X16_FN flipadst, adst
 INV_TXFM_4X16_FN flipadst, flipadst
-INV_TXFM_4X16_FN flipadst, identity
+INV_TXFM_4X16_FN flipadst, identity, v

 cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
-    mov                 r5d, 48
+%undef cmp
+%if ARCH_X86_32
+    mov                 r5m, r6d
+%endif
+    mov                 r6d, 4
+.zero_loop:
+    dec                 r6d
+    cmp                eobb, byte [r5+r6]
+    jl .zero_loop
+    mov                 r5d, r6d
+    shl                 r5d, 4
+%if ARCH_X86_32
+    ; restore pic-ptr
+    mov                  r6, r5m
+%endif
 .loop_pass1:
    mova                 m5, [cq+64*0+r5]
    lea                  r3, [cq+64*1+r5]
@@ -1100,20 +1154,34 @@ cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    punpckhqdq           m2, m7, m5
    jmp m(idct_4x16_internal_16bpc).end

-INV_TXFM_4X16_FN identity, dct
-INV_TXFM_4X16_FN identity, adst
-INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, dct, h
+INV_TXFM_4X16_FN identity, adst, h
+INV_TXFM_4X16_FN identity, flipadst, h
 INV_TXFM_4X16_FN identity, identity

 cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+    mov                 r5m, r6d
+%endif
+    mov                 r6d, 4
+.zero_loop:
+    dec                 r6d
+    cmp                eobb, byte [r5+r6]
+    jl .zero_loop
+    mov                 r5d, r6d
+    shl                 r5d, 4
+%if ARCH_X86_32
+    ; restore pic-ptr
+    mov                  r6, r5m
+%endif
    mova                 m5, [o(pd_6144)]
    mova                 m4, [o(pd_5793)]
-    mov                 r3d, 48
 .loop_pass1:
-    pmulld               m0, m4, [cq+64*0+r3]
-    pmulld               m1, m4, [cq+64*1+r3]
-    pmulld               m2, m4, [cq+64*2+r3]
-    pmulld               m3, m4, [cq+64*3+r3]
+    pmulld               m0, m4, [cq+64*0+r5]
+    pmulld               m1, m4, [cq+64*1+r5]
+    pmulld               m2, m4, [cq+64*2+r5]
+    pmulld               m3, m4, [cq+64*3+r5]
    REPX      {paddd x, m5}, m0, m1, m2, m3
    REPX      {psrad x, 13}, m0, m1, m2, m3
    packssdw             m0, m1
@@ -1122,11 +1190,11 @@ cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    punpcklwd            m0, m2
    punpckhwd            m1, m0, m3
    punpcklwd            m0, m3
-    test                r3d, r3d
+    test                r5d, r5d
    jz m(idct_4x16_internal_16bpc).end_pass1
-    mova       [cq+64*0+r3], m0
-    mova       [cq+64*1+r3], m1
-    sub                 r3d, 16
+    mova       [cq+64*0+r5], m0
+    mova       [cq+64*1+r5], m1
+    sub                 r5d, 16
    jmp .loop_pass1
 .pass2:
    mova          [cq+16*4], m0