swscale/x86/ops: use plain ret instruction

The original intent here was probably to make the ops code agnostic to which operation is actually last in the list, but the very existence of a split between CONTINUE and FINISH already implies that we hard-code the assumption that the final operation is a write op. So we can greatly simplify this by using a call/ret pair, instead of awkwardly exporting the return label and then jumping back to it. This actually collapses FINISH down into just a plain RET, since the op kernels do not set up any extra stack frame of their own. Signed-off-by: Niklas Haas <git@haasn.dev>
2026-06-11 08:13:06 +00:00 · 2026-04-11 16:30:15 +00:00
parent f7ca6f7481
commit c29465bcb6
3 changed files with 10 additions and 51 deletions
@@ -1006,9 +1006,6 @@ static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
 #define ASSIGN_PROCESS_FUNC(NAME)                               \
    do {                                                        \
        SWS_DECL_FUNC(NAME);                                    \
-        void NAME##_return(void);                               \
-        ret = ff_sws_op_chain_append(chain, NAME##_return,      \
-                                     NULL, &(SwsOpPriv) {0});   \
        out->func = NAME;                                       \
    } while (0)

@@ -26,12 +26,7 @@
 ; function is responsible for the block loop, as well as initializing the
 ; plane pointers. It will jump directly into the first operation kernel,
 ; and each operation kernel will jump directly into the next one, with the
-; final kernel jumping back into the sws_process return point. (See label
-; `sws_process.return` in ops_int.asm)
-;
-; To handle the jump back to the return point, we append an extra address
-; corresponding to the correct sws_process.return label into the SwsOpChain,
-; and have the WRITE kernel jump into it as usual. (See the FINISH macro)
+; final kernel returning back into the entry point.
 ;
 ; Inside an operation chain, we use a custom calling convention to preserve
 ; registers between kernels. The exact register allocation is found further
@@ -291,19 +286,6 @@ endstruc
    CONTINUE tmp0q
 %endmacro

-; Final macro to end the operation chain, used by WRITE kernels to jump back
-; to the process function return point. Very similar to CONTINUE, but skips
-; incrementing the implq pointer, and also clears AVX registers to avoid
-; phantom dependencies between loop iterations.
-%macro FINISH 1 ; reg
-    %if vzeroupper_required
-        ; we may jump back into an SSE read, so always zero upper regs here
-        vzeroupper
-    %endif
-    jmp %1
-    annotate_function_size
-%endmacro
-
 ; Helper for inline conditionals; used to conditionally include single lines
 %macro IF 2+ ; cond, body
    %if %1
@@ -93,27 +93,12 @@ IF %1 > 3,  mov in3q,  [execq + SwsOpExec.in3]
 IF %1 > 1,  mov out1q, [execq + SwsOpExec.out1]
 IF %1 > 2,  mov out2q, [execq + SwsOpExec.out2]
 IF %1 > 3,  mov out3q, [execq + SwsOpExec.out3]
-            jmp [rsp] ; call into op chain
-
-; Declare a separate global label for the return point, so that we can append
-; it to the list of op function pointers from the C code, effectively ensuring
-; that we end up here again after the op chain finishes processing a line.
-; (See also: cglobal_label in x86inc.asm)
-%if FORMAT_ELF
-    global current_function %+ _return:function hidden
-%elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
-    global current_function %+ _return:private_extern
-%else
-    global current_function %+ _return
-%endif
-align function_align
-current_function %+ _return:
-
-            ; op chain always returns back here
+.loop:
+            call [rsp] ; call into op chain
            mov implq, [rsp + 8]
            inc bxd
            cmp bxd, [rsp + 20]
-            jne .continue
+            jne .loop
            ; end of line
            inc yd
            cmp yd, [rsp + 24]
@@ -131,7 +116,7 @@ IF %1 > 3,  add out3q, [execq + SwsOpExec.out_bump3]
            ; conditionally apply y bump (if non-NULL)
            mov tmp0q, [execq + SwsOpExec.in_bump_y]
            test tmp0q, tmp0q
-            jz .continue
+            jz .loop
            movsxd tmp0q, [tmp0q + yq * 4 - 4] ; load (signed) y bump
 %if %1 > 3
            mov tmp1q, tmp0q
@@ -150,8 +135,7 @@ IF %1 > 3,  add out3q, [execq + SwsOpExec.out_bump3]
 %endif
            imul tmp0q, [execq + SwsOpExec.in_stride0]
            add in0q, tmp0q
-.continue:
-            jmp [rsp]
+            jmp .loop
 .end:
            add rsp, 32
            RET
@@ -271,7 +255,6 @@ IF %1 > 3,  add in3q, mmsize * (1 + V2)

 %macro write_planar 1 ; elems
 op write_planar%1
-            LOAD_CONT tmp0q
            movu [out0q], mx
 IF %1 > 1,  movu [out1q], my
 IF %1 > 2,  movu [out2q], mz
@@ -286,7 +269,7 @@ IF %1 > 3,  movu [out3q + mmsize], mw2
 IF %1 > 1,  add out1q, mmsize * (1 + V2)
 IF %1 > 2,  add out2q, mmsize * (1 + V2)
 IF %1 > 3,  add out3q, mmsize * (1 + V2)
-            FINISH tmp0q
+            RET
 %endmacro

 %macro read_packed2 1 ; depth
@@ -325,7 +308,6 @@ IF %1 < 32, VBROADCASTI128 m12, [read%1_unpack2]
 %macro write_packed2 1 ; depth
 op write%1_packed2
 IF %1 < 32, VBROADCASTI128 m12, [write%1_pack2]
-            LOAD_CONT tmp0q
 %if cpuflag(avx2)
            vpermq mx, mx, q3120       ; { X0 X2 | X1 X3 }
            vpermq my, my, q3120       ; { Y0 Y2 | Y1 Y3 }
@@ -352,7 +334,7 @@ IF %1 < 32, VBROADCASTI128 m12, [write%1_pack2]
 IF V2,      movu [out0q + 2*mmsize], m10
 IF V2,      movu [out0q + 3*mmsize], m11
            add out0q, mmsize * (2 + V2 * 2)
-            FINISH tmp0q
+            RET
 %endmacro

 ; helper macro reused for both 3 and 4 component packed reads
@@ -433,11 +415,10 @@ IF1 V2,     read_packed_inner mx2, my2, mz2, mw2, in0q + %1 * mmsize, %1, %2
 %macro write_packed 2 ; num, depth
 op write%2_packed%1
 IF %2 < 32, VBROADCASTI128 m12, [write%2_pack%1]
-            LOAD_CONT tmp0q
            write_packed_inner mx, my, mz, mw, out0q, %1, %2
 IF1 V2,     write_packed_inner mx2, my2, mz2, mw2, out0q + %1 * mmsize, %1, %2
            add out0q, %1 * mmsize * (1 + V2)
-            FINISH tmp0q
+            RET
 %endmacro

 %macro rw_packed 1 ; depth
@@ -512,9 +493,8 @@ IF V2,  pshufb mx2, m8
 IF V2,  pmovmskb tmp1d, mx2
        mov [out0q],     tmp0d
 IF V2,  mov [out0q + (mmsize >> 3)], tmp1d
-        LOAD_CONT tmp0q
        add out0q, (mmsize >> 3) * (1 + V2)
-        FINISH tmp0q
+        RET
 %endmacro

 ;--------------------------