swscale/x86/ops: use plain ret instruction

The original intent here was probably to make the ops code agnostic to
which operation is actually last in the list, but the existence of a
divergence between CONTINUE and FINISH already implies that we hard-code
the assumption that the final operation is a write op.

So we can massively simplify this by using a call/ret pair instead of
awkwardly exporting the return label and then jumping back to it. This
collapses FINISH down into just a plain RET, since the op kernels do not
set up any stack frame of their own.

Signed-off-by: Niklas Haas <git@haasn.dev>
This commit is contained in:
Niklas Haas
2026-04-11 16:30:15 +00:00
committed by Niklas Haas
co-authored by Niklas Haas
parent f7ca6f7481
commit c29465bcb6
3 changed files with 10 additions and 51 deletions
-3
View File
@@ -1006,9 +1006,6 @@ static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
/*
 * Select the process entry point NAME for the compiled op.
 *
 * As written here, the macro:
 *   - declares NAME via SWS_DECL_FUNC,
 *   - declares the companion symbol NAME##_return as `void (void)`,
 *   - appends NAME##_return to `chain` through ff_sws_op_chain_append(),
 *     passing NULL and a zero-initialized SwsOpPriv, and stores the result
 *     in `ret`,
 *   - stores NAME in out->func.
 *
 * `chain`, `ret` and `out` are not parameters; they must be in scope at the
 * expansion site. The result of the append is only assigned to `ret`, not
 * checked here; out->func is set regardless.
 * NOTE(review): the meaning of the NULL argument (presumably a free
 * callback) is not visible in this hunk - confirm against
 * ff_sws_op_chain_append().
 */
#define ASSIGN_PROCESS_FUNC(NAME) \
do { \
SWS_DECL_FUNC(NAME); \
void NAME##_return(void); \
ret = ff_sws_op_chain_append(chain, NAME##_return, \
NULL, &(SwsOpPriv) {0}); \
out->func = NAME; \
} while (0)
+1 -19
View File
@@ -26,12 +26,7 @@
; function is responsible for the block loop, as well as initializing the
; plane pointers. It will jump directly into the first operation kernel,
; and each operation kernel will jump directly into the next one, with the
; final kernel jumping back into the sws_process return point. (See label
; `sws_process.return` in ops_int.asm)
;
; To handle the jump back to the return point, we append an extra address
; corresponding to the correct sws_process.return label into the SwsOpChain,
; and have the WRITE kernel jump into it as usual. (See the FINISH macro)
; final kernel returning back into the entry point.
;
; Inside an operation chain, we use a custom calling convention to preserve
; registers between kernels. The exact register allocation is found further
@@ -291,19 +286,6 @@ endstruc
CONTINUE tmp0q
%endmacro
; Final macro to end the operation chain, used by WRITE kernels to jump back
; to the process function return point. Very similar to CONTINUE, but skips
; incrementing the implq pointer, and also clears AVX registers to avoid
; phantom dependencies between loop iterations.
;
; Argument:
;   %1 - register holding the address to jump to. The write kernels shown
;        alongside this macro pass tmp0q, after invoking LOAD_CONT tmp0q.
;
; The transfer is a plain `jmp`, so no return address is pushed; control
; continues at whatever address %1 holds.
%macro FINISH 1 ; reg
%if vzeroupper_required
; we may jump back into an SSE read, so always zero upper regs here
vzeroupper
%endif
jmp %1 ; leave the kernel; nothing is pushed on the stack
annotate_function_size ; NOTE(review): helper defined elsewhere, presumably emits symbol size info - confirm
%endmacro
; Helper for inline conditionals; used to conditionally include single lines
%macro IF 2+ ; cond, body
%if %1
+9 -29
View File
@@ -93,27 +93,12 @@ IF %1 > 3, mov in3q, [execq + SwsOpExec.in3]
IF %1 > 1, mov out1q, [execq + SwsOpExec.out1]
IF %1 > 2, mov out2q, [execq + SwsOpExec.out2]
IF %1 > 3, mov out3q, [execq + SwsOpExec.out3]
jmp [rsp] ; call into op chain
; Declare a separate global label for the return point, so that we can append
; it to the list of op function pointers from the C code, effectively ensuring
; that we end up here again after the op chain finishes processing a line.
; (See also: cglobal_label in x86inc.asm)
%if FORMAT_ELF
global current_function %+ _return:function hidden
%elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
global current_function %+ _return:private_extern
%else
global current_function %+ _return
%endif
align function_align
current_function %+ _return:
; op chain always returns back here
.loop:
call [rsp] ; call into op chain
mov implq, [rsp + 8]
inc bxd
cmp bxd, [rsp + 20]
jne .continue
jne .loop
; end of line
inc yd
cmp yd, [rsp + 24]
@@ -131,7 +116,7 @@ IF %1 > 3, add out3q, [execq + SwsOpExec.out_bump3]
; conditionally apply y bump (if non-NULL)
mov tmp0q, [execq + SwsOpExec.in_bump_y]
test tmp0q, tmp0q
jz .continue
jz .loop
movsxd tmp0q, [tmp0q + yq * 4 - 4] ; load (signed) y bump
%if %1 > 3
mov tmp1q, tmp0q
@@ -150,8 +135,7 @@ IF %1 > 3, add out3q, [execq + SwsOpExec.out_bump3]
%endif
imul tmp0q, [execq + SwsOpExec.in_stride0]
add in0q, tmp0q
.continue:
jmp [rsp]
jmp .loop
.end:
add rsp, 32
RET
@@ -271,7 +255,6 @@ IF %1 > 3, add in3q, mmsize * (1 + V2)
%macro write_planar 1 ; elems
op write_planar%1
LOAD_CONT tmp0q
movu [out0q], mx
IF %1 > 1, movu [out1q], my
IF %1 > 2, movu [out2q], mz
@@ -286,7 +269,7 @@ IF %1 > 3, movu [out3q + mmsize], mw2
IF %1 > 1, add out1q, mmsize * (1 + V2)
IF %1 > 2, add out2q, mmsize * (1 + V2)
IF %1 > 3, add out3q, mmsize * (1 + V2)
FINISH tmp0q
RET
%endmacro
%macro read_packed2 1 ; depth
@@ -325,7 +308,6 @@ IF %1 < 32, VBROADCASTI128 m12, [read%1_unpack2]
%macro write_packed2 1 ; depth
op write%1_packed2
IF %1 < 32, VBROADCASTI128 m12, [write%1_pack2]
LOAD_CONT tmp0q
%if cpuflag(avx2)
vpermq mx, mx, q3120 ; { X0 X2 | X1 X3 }
vpermq my, my, q3120 ; { Y0 Y2 | Y1 Y3 }
@@ -352,7 +334,7 @@ IF %1 < 32, VBROADCASTI128 m12, [write%1_pack2]
IF V2, movu [out0q + 2*mmsize], m10
IF V2, movu [out0q + 3*mmsize], m11
add out0q, mmsize * (2 + V2 * 2)
FINISH tmp0q
RET
%endmacro
; helper macro reused for both 3 and 4 component packed reads
@@ -433,11 +415,10 @@ IF1 V2, read_packed_inner mx2, my2, mz2, mw2, in0q + %1 * mmsize, %1, %2
%macro write_packed 2 ; num, depth
op write%2_packed%1
IF %2 < 32, VBROADCASTI128 m12, [write%2_pack%1]
LOAD_CONT tmp0q
write_packed_inner mx, my, mz, mw, out0q, %1, %2
IF1 V2, write_packed_inner mx2, my2, mz2, mw2, out0q + %1 * mmsize, %1, %2
add out0q, %1 * mmsize * (1 + V2)
FINISH tmp0q
RET
%endmacro
%macro rw_packed 1 ; depth
@@ -512,9 +493,8 @@ IF V2, pshufb mx2, m8
IF V2, pmovmskb tmp1d, mx2
mov [out0q], tmp0d
IF V2, mov [out0q + (mmsize >> 3)], tmp1d
LOAD_CONT tmp0q
add out0q, (mmsize >> 3) * (1 + V2)
FINISH tmp0q
RET
%endmacro
;--------------------------