swscale/aarch64/ops: use plain ret instruction

Use a call/ret pair instead of awkwardly exporting and then jumping
back to the return label.

This is similar to c29465bcb6, but for aarch64.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Ramiro Polla <ramiro.polla@gmail.com>
This commit is contained in:
Ramiro Polla
2026-06-10 01:47:10 +02:00
parent 061dc9ab6d
commit 19250a1846
6 changed files with 73 additions and 78 deletions
+3 -9
View File
@@ -221,7 +221,7 @@ static int aarch64_compile(SwsContext *ctx, const SwsOpList *ops,
goto error;
}
/* Look up process/process_return functions. */
/* Look up process function. */
const SwsOp *read = ff_sws_op_list_input(&rest);
const SwsOp *write = ff_sws_op_list_output(&rest);
const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
@@ -230,19 +230,13 @@ static int aarch64_compile(SwsContext *ctx, const SwsOpList *ops,
for (int i = 0; i < FFMAX(read_planes, write_planes); i++)
MASK_SET(mask, i, 1);
SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS, .mask = mask };
SwsAArch64OpImplParams return_params = { .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = mask };
SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS, .mask = mask };
SwsFuncPtr process_func = ff_sws_aarch64_lookup(&process_params);
SwsFuncPtr return_func = ff_sws_aarch64_lookup(&return_params);
if (!process_func || !return_func) {
if (!process_func) {
ret = AVERROR(ENOTSUP);
goto error;
}
ret = ff_sws_op_chain_append(chain, return_func, NULL, &(SwsOpPriv) { 0 });
if (ret < 0)
goto error;
out->func = (SwsOpFunc) process_func;
out->cpu_flags = chain->cpu_flags;
+69 -55
View File
@@ -260,14 +260,14 @@ static void asmgen_epilogue(SwsAArch64Context *s, const RasmOp *regs, unsigned n
}
/*********************************************************************/
/* Callee-saved registers (r19-r28). */
#define MAX_SAVED_REGS 10
/* Callee-saved registers (r19-r28, fp, and lr). */
#define MAX_SAVED_REGS 12
static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], unsigned *count,
RasmOp gpr)
{
const int n = a64op_gpr_n(gpr);
if (n >= 19 && n <= 28)
if (n >= 19 && n <= 30)
regs[(*count)++] = gpr;
}
@@ -276,6 +276,7 @@ static unsigned clobbered_gprs(const SwsAArch64Context *s,
RasmOp regs[MAX_SAVED_REGS])
{
unsigned count = 0;
clobber_gpr(regs, &count, a64op_lr());
LOOP_MASK(p, i) {
clobber_gpr(regs, &count, s->in[i]);
clobber_gpr(regs, &count, s->out[i]);
@@ -292,9 +293,8 @@ static void asmgen_process(SwsAArch64Context *s, const SwsAArch64OpImplParams *p
char buf[64];
/**
* The process/process_return functions for aarch64 work similarly
* to the x86 backend. The description in x86/ops_include.asm mostly
* holds as well here.
* The process function for aarch64 works similarly to the x86 backend.
* The description in x86/ops_include.asm mostly holds as well here.
*/
aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
@@ -329,49 +329,38 @@ static void asmgen_process(SwsAArch64Context *s, const SwsAArch64OpImplParams *p
i_ldr(r, s->out_bump[i], a64op_off(s->exec, offsetof_exec_out_bump + (i * sizeof(ptrdiff_t))));
}
/* Reset x and jump to first kernel. */
i_mov(r, s->bx, s->bx_start); CMT("bx = bx_start;");
i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
i_br (r, s->op0_func); CMT("jump to op0_func");
}
int first_row = rasm_new_label(r, NULL);
int next_row = rasm_new_label(r, NULL);
int next_block = rasm_new_label(r, NULL);
static void asmgen_process_return(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
char func_name[128];
/* Jump to first row (skips padding). */
i_b (r, rasm_op_label(first_row)); CMT("goto first_row;");
aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
rasm_func_begin(r, func_name, true, true);
/* Reset impl to first kernel. */
i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
/* Perform horizontal loop. */
int loop = rasm_new_label(r, NULL);
i_add(r, s->bx, s->bx, IMM(1)); CMT("bx += 1;");
i_cmp(r, s->bx, s->bx_end); CMT("if (bx != bx_end)");
i_bne(r, loop); CMT(" goto loop;");
/* Perform vertical loop. */
int end = rasm_new_label(r, NULL);
i_add(r, s->y, s->y, IMM(1)); CMT("y += 1;");
i_cmp(r, s->y, s->y_end); CMT("if (y == y_end)");
i_beq(r, end); CMT(" goto end;");
/* Perform padding and reset x, preparing for next row. */
/* Perform padding, preparing for next row. */
rasm_add_label(r, next_row); CMT("next_row:");
LOOP_MASK(p, i) { i_add(r, s->in[i], s->in[i], s->in_bump[i]); CMTF("in[%u] += in_bump[%u];", i, i); }
LOOP_MASK(p, i) { i_add(r, s->out[i], s->out[i], s->out_bump[i]); CMTF("out[%u] += out_bump[%u];", i, i); }
/* First row (reset x). */
rasm_add_label(r, first_row); CMT("first_row:");
i_mov(r, s->bx, s->bx_start); CMT("bx = bx_start;");
/* Loop back or end of function. */
rasm_add_label(r, loop); CMT("loop:");
i_br (r, s->op0_func); CMT("jump to op0_func");
rasm_add_label(r, end); CMT("end:");
/* Reset impl and call first kernel. */
rasm_add_label(r, next_block); CMT("next_block:");
i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
i_blr(r, s->op0_func); CMT("op0_func();");
/* Perform horizontal loop. */
i_add(r, s->bx, s->bx, IMM(1)); CMT("bx += 1;");
i_cmp(r, s->bx, s->bx_end); CMT("if (bx != bx_end)");
i_bne(r, next_block); CMT(" goto next_block;");
/* Perform vertical loop. */
i_add(r, s->y, s->y, IMM(1)); CMT("y += 1;");
i_cmp(r, s->y, s->y_end); CMT("if (y != y_end)");
i_bne(r, next_row); CMT(" goto next_row;");
/* Function epilogue */
RasmOp saved_regs[MAX_SAVED_REGS];
unsigned nsaved = clobbered_gprs(s, p, saved_regs);
if (nsaved)
asmgen_epilogue(s, saved_regs, nsaved);
@@ -1367,9 +1356,28 @@ static void asmgen_op_cps(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
bool is_read = false;
bool is_write = false;
switch (p->op) {
case AARCH64_SWS_OP_READ_BIT:
case AARCH64_SWS_OP_READ_NIBBLE:
case AARCH64_SWS_OP_READ_PACKED:
case AARCH64_SWS_OP_READ_PLANAR:
is_read = true;
break;
case AARCH64_SWS_OP_WRITE_BIT:
case AARCH64_SWS_OP_WRITE_NIBBLE:
case AARCH64_SWS_OP_WRITE_PACKED:
case AARCH64_SWS_OP_WRITE_PLANAR:
is_write = true;
break;
default:
break;
}
char func_name[128];
aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
rasm_func_begin(r, func_name, true, true);
rasm_func_begin(r, func_name, true, !is_read);
/**
* Set up vector register dimensions and reshape all vectors
@@ -1416,14 +1424,18 @@ static void asmgen_op_cps(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
break;
}
/* Load continuation address and increment impl pointer. */
RasmNode *node = rasm_set_current_node(r, s->load_cont_node);
RasmOp impl_post = a64op_post(s->impl, sizeof_impl);
i_ldr(r, s->cont, impl_post); CMT("SwsFuncPtr cont = (impl++)->cont;");
rasm_set_current_node(r, node);
/* Common end for CPS functions. */
i_br (r, s->cont); CMT("jump to cont");
if (is_write) {
/* Write functions return directly. */
i_ret(r);
} else {
/* Load continuation address and increment impl pointer. */
RasmNode *node = rasm_set_current_node(r, s->load_cont_node);
RasmOp impl_post = a64op_post(s->impl, sizeof_impl);
i_ldr(r, s->cont, impl_post); CMT("SwsFuncPtr cont = (impl++)->cont;");
rasm_set_current_node(r, node);
/* Common end for remaining CPS functions. */
i_br (r, s->cont); CMT("jump to cont");
}
}
static void asmgen_op(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
@@ -1432,9 +1444,6 @@ static void asmgen_op(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
case AARCH64_SWS_OP_PROCESS:
asmgen_process(s, p);
break;
case AARCH64_SWS_OP_PROCESS_RETURN:
asmgen_process_return(s, p);
break;
default:
asmgen_op_cps(s, p);
break;
@@ -1561,9 +1570,11 @@ static int asmgen(void)
/**
* The entry point of the SwsOpFunc is the `process` function. The
* first kernel function is called from `process`, and subsequent
* kernel functions are chained by directly branching to the next
* operation, using a continuation-passing style design. The exit
* point of the SwsOpFunc is the `process_return` function.
* operation, using a continuation-passing style design. The last
* operation must be a write operation, which returns from the call
* to the `process` function.
*
* The GPRs used by the entire call-chain are listed below.
*
@@ -1586,6 +1597,9 @@ static int asmgen(void)
* The read/write data pointers and padding values first use up the
* remaining free caller-saved registers, and only then are the
* callee-saved registers (r19-r28) used.
*
* The Link Register (r30) is used when calling the first kernel,
* so it must be saved.
*/
/* SwsOpFunc arguments. */
-4
View File
@@ -7,10 +7,6 @@
{ .op = AARCH64_SWS_OP_PROCESS, .mask = 0x0011 },
{ .op = AARCH64_SWS_OP_PROCESS, .mask = 0x0111 },
{ .op = AARCH64_SWS_OP_PROCESS, .mask = 0x1111 },
{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x0001 },
{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x0011 },
{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x0111 },
{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x1111 },
{ .op = AARCH64_SWS_OP_READ_BIT, .block_size = 8, .type = AARCH64_PIXEL_U8, .mask = 0x0001 },
{ .op = AARCH64_SWS_OP_READ_BIT, .block_size = 16, .type = AARCH64_PIXEL_U8, .mask = 0x0001 },
{ .op = AARCH64_SWS_OP_READ_NIBBLE, .block_size = 8, .type = AARCH64_PIXEL_U8, .mask = 0x0001 },
-3
View File
@@ -77,7 +77,6 @@ static const char *aarch64_pixel_type_name(SwsAArch64PixelType fmt)
static const char op_types[AARCH64_SWS_OP_TYPE_NB][32] = {
[AARCH64_SWS_OP_NONE ] = "AARCH64_SWS_OP_NONE",
[AARCH64_SWS_OP_PROCESS ] = "AARCH64_SWS_OP_PROCESS",
[AARCH64_SWS_OP_PROCESS_RETURN] = "AARCH64_SWS_OP_PROCESS_RETURN",
[AARCH64_SWS_OP_READ_BIT ] = "AARCH64_SWS_OP_READ_BIT",
[AARCH64_SWS_OP_READ_NIBBLE ] = "AARCH64_SWS_OP_READ_NIBBLE",
[AARCH64_SWS_OP_READ_PACKED ] = "AARCH64_SWS_OP_READ_PACKED",
@@ -114,7 +113,6 @@ static const char *aarch64_op_type(SwsAArch64OpType op)
static const char op_type_names[AARCH64_SWS_OP_TYPE_NB][16] = {
[AARCH64_SWS_OP_NONE ] = "none",
[AARCH64_SWS_OP_PROCESS ] = "process",
[AARCH64_SWS_OP_PROCESS_RETURN] = "process_return",
[AARCH64_SWS_OP_READ_BIT ] = "read_bit",
[AARCH64_SWS_OP_READ_NIBBLE ] = "read_nibble",
[AARCH64_SWS_OP_READ_PACKED ] = "read_packed",
@@ -326,7 +324,6 @@ static const ParamField field_dither_size_log2 = { PARAM_FIELD(dither.size_log2)
#define MAX_LEVELS 8
static const ParamField *op_fields[AARCH64_SWS_OP_TYPE_NB][MAX_LEVELS] = {
[AARCH64_SWS_OP_PROCESS ] = { &field_op, &field_mask },
[AARCH64_SWS_OP_PROCESS_RETURN] = { &field_op, &field_mask },
[AARCH64_SWS_OP_READ_BIT ] = { &field_op, &field_block_size, &field_type, &field_mask },
[AARCH64_SWS_OP_READ_NIBBLE ] = { &field_op, &field_block_size, &field_type, &field_mask },
[AARCH64_SWS_OP_READ_PACKED ] = { &field_op, &field_block_size, &field_type, &field_mask },
-1
View File
@@ -38,7 +38,6 @@ typedef enum SwsAArch64PixelType {
typedef enum SwsAArch64OpType {
AARCH64_SWS_OP_NONE = 0,
AARCH64_SWS_OP_PROCESS,
AARCH64_SWS_OP_PROCESS_RETURN,
AARCH64_SWS_OP_READ_BIT,
AARCH64_SWS_OP_READ_NIBBLE,
AARCH64_SWS_OP_READ_PACKED,
+1 -6
View File
@@ -72,7 +72,7 @@ error:
return ret;
}
/* Collect the parameters for the process/process_return functions. */
/* Collect the parameters for the process function. */
static int aarch64_collect_process(const SwsOpList *ops, struct AVTreeNode **root)
{
const SwsOp *read = ff_sws_op_list_input(ops);
@@ -89,11 +89,6 @@ static int aarch64_collect_process(const SwsOpList *ops, struct AVTreeNode **roo
.mask = mask,
};
ret = aarch64_collect_op(&params, root);
if (ret < 0)
return ret;
params.op = AARCH64_SWS_OP_PROCESS_RETURN;
ret = aarch64_collect_op(&params, root);
if (ret < 0)
return ret;