mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2026-06-11 08:13:06 +00:00
swscale/aarch64/ops: use plain ret instruction
Use a call/ret pair instead of awkwardly exporting and then jumping
back to the return label.
This is similar to c29465bcb6, but for aarch64.
Sponsored-by: Sovereign Tech Fund
Signed-off-by: Ramiro Polla <ramiro.polla@gmail.com>
This commit is contained in:
@@ -221,7 +221,7 @@ static int aarch64_compile(SwsContext *ctx, const SwsOpList *ops,
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Look up process/process_return functions. */
|
||||
/* Look up process function. */
|
||||
const SwsOp *read = ff_sws_op_list_input(&rest);
|
||||
const SwsOp *write = ff_sws_op_list_output(&rest);
|
||||
const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
|
||||
@@ -230,19 +230,13 @@ static int aarch64_compile(SwsContext *ctx, const SwsOpList *ops,
|
||||
for (int i = 0; i < FFMAX(read_planes, write_planes); i++)
|
||||
MASK_SET(mask, i, 1);
|
||||
|
||||
SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS, .mask = mask };
|
||||
SwsAArch64OpImplParams return_params = { .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = mask };
|
||||
SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS, .mask = mask };
|
||||
SwsFuncPtr process_func = ff_sws_aarch64_lookup(&process_params);
|
||||
SwsFuncPtr return_func = ff_sws_aarch64_lookup(&return_params);
|
||||
if (!process_func || !return_func) {
|
||||
if (!process_func) {
|
||||
ret = AVERROR(ENOTSUP);
|
||||
goto error;
|
||||
}
|
||||
|
||||
ret = ff_sws_op_chain_append(chain, return_func, NULL, &(SwsOpPriv) { 0 });
|
||||
if (ret < 0)
|
||||
goto error;
|
||||
|
||||
out->func = (SwsOpFunc) process_func;
|
||||
out->cpu_flags = chain->cpu_flags;
|
||||
|
||||
|
||||
@@ -260,14 +260,14 @@ static void asmgen_epilogue(SwsAArch64Context *s, const RasmOp *regs, unsigned n
|
||||
}
|
||||
|
||||
/*********************************************************************/
|
||||
/* Callee-saved registers (r19-r28). */
|
||||
#define MAX_SAVED_REGS 10
|
||||
/* Callee-saved registers (r19-r28, fp, and lr). */
|
||||
#define MAX_SAVED_REGS 12
|
||||
|
||||
static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], unsigned *count,
|
||||
RasmOp gpr)
|
||||
{
|
||||
const int n = a64op_gpr_n(gpr);
|
||||
if (n >= 19 && n <= 28)
|
||||
if (n >= 19 && n <= 30)
|
||||
regs[(*count)++] = gpr;
|
||||
}
|
||||
|
||||
@@ -276,6 +276,7 @@ static unsigned clobbered_gprs(const SwsAArch64Context *s,
|
||||
RasmOp regs[MAX_SAVED_REGS])
|
||||
{
|
||||
unsigned count = 0;
|
||||
clobber_gpr(regs, &count, a64op_lr());
|
||||
LOOP_MASK(p, i) {
|
||||
clobber_gpr(regs, &count, s->in[i]);
|
||||
clobber_gpr(regs, &count, s->out[i]);
|
||||
@@ -292,9 +293,8 @@ static void asmgen_process(SwsAArch64Context *s, const SwsAArch64OpImplParams *p
|
||||
char buf[64];
|
||||
|
||||
/**
|
||||
* The process/process_return functions for aarch64 work similarly
|
||||
* to the x86 backend. The description in x86/ops_include.asm mostly
|
||||
* holds as well here.
|
||||
* The process function for aarch64 works similarly to the x86 backend.
|
||||
* The description in x86/ops_include.asm mostly holds as well here.
|
||||
*/
|
||||
|
||||
aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
|
||||
@@ -329,49 +329,38 @@ static void asmgen_process(SwsAArch64Context *s, const SwsAArch64OpImplParams *p
|
||||
i_ldr(r, s->out_bump[i], a64op_off(s->exec, offsetof_exec_out_bump + (i * sizeof(ptrdiff_t))));
|
||||
}
|
||||
|
||||
/* Reset x and jump to first kernel. */
|
||||
i_mov(r, s->bx, s->bx_start); CMT("bx = bx_start;");
|
||||
i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
|
||||
i_br (r, s->op0_func); CMT("jump to op0_func");
|
||||
}
|
||||
int first_row = rasm_new_label(r, NULL);
|
||||
int next_row = rasm_new_label(r, NULL);
|
||||
int next_block = rasm_new_label(r, NULL);
|
||||
|
||||
static void asmgen_process_return(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
|
||||
{
|
||||
RasmContext *r = s->rctx;
|
||||
char func_name[128];
|
||||
/* Jump to first row (skips padding). */
|
||||
i_b (r, rasm_op_label(first_row)); CMT("goto first_row;");
|
||||
|
||||
aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
|
||||
|
||||
rasm_func_begin(r, func_name, true, true);
|
||||
|
||||
/* Reset impl to first kernel. */
|
||||
i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
|
||||
|
||||
/* Perform horizontal loop. */
|
||||
int loop = rasm_new_label(r, NULL);
|
||||
i_add(r, s->bx, s->bx, IMM(1)); CMT("bx += 1;");
|
||||
i_cmp(r, s->bx, s->bx_end); CMT("if (bx != bx_end)");
|
||||
i_bne(r, loop); CMT(" goto loop;");
|
||||
|
||||
/* Perform vertical loop. */
|
||||
int end = rasm_new_label(r, NULL);
|
||||
i_add(r, s->y, s->y, IMM(1)); CMT("y += 1;");
|
||||
i_cmp(r, s->y, s->y_end); CMT("if (y == y_end)");
|
||||
i_beq(r, end); CMT(" goto end;");
|
||||
|
||||
/* Perform padding and reset x, preparing for next row. */
|
||||
/* Perform padding, preparing for next row. */
|
||||
rasm_add_label(r, next_row); CMT("next_row:");
|
||||
LOOP_MASK(p, i) { i_add(r, s->in[i], s->in[i], s->in_bump[i]); CMTF("in[%u] += in_bump[%u];", i, i); }
|
||||
LOOP_MASK(p, i) { i_add(r, s->out[i], s->out[i], s->out_bump[i]); CMTF("out[%u] += out_bump[%u];", i, i); }
|
||||
|
||||
/* First row (reset x). */
|
||||
rasm_add_label(r, first_row); CMT("first_row:");
|
||||
i_mov(r, s->bx, s->bx_start); CMT("bx = bx_start;");
|
||||
|
||||
/* Loop back or end of function. */
|
||||
rasm_add_label(r, loop); CMT("loop:");
|
||||
i_br (r, s->op0_func); CMT("jump to op0_func");
|
||||
rasm_add_label(r, end); CMT("end:");
|
||||
/* Reset impl and call first kernel. */
|
||||
rasm_add_label(r, next_block); CMT("next_block:");
|
||||
i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
|
||||
i_blr(r, s->op0_func); CMT("op0_func();");
|
||||
|
||||
/* Perform horizontal loop. */
|
||||
i_add(r, s->bx, s->bx, IMM(1)); CMT("bx += 1;");
|
||||
i_cmp(r, s->bx, s->bx_end); CMT("if (bx != bx_end)");
|
||||
i_bne(r, next_block); CMT(" goto next_block;");
|
||||
|
||||
/* Perform vertical loop. */
|
||||
i_add(r, s->y, s->y, IMM(1)); CMT("y += 1;");
|
||||
i_cmp(r, s->y, s->y_end); CMT("if (y != y_end)");
|
||||
i_bne(r, next_row); CMT(" goto next_row;");
|
||||
|
||||
/* Function epilogue */
|
||||
RasmOp saved_regs[MAX_SAVED_REGS];
|
||||
unsigned nsaved = clobbered_gprs(s, p, saved_regs);
|
||||
if (nsaved)
|
||||
asmgen_epilogue(s, saved_regs, nsaved);
|
||||
|
||||
@@ -1367,9 +1356,28 @@ static void asmgen_op_cps(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
|
||||
{
|
||||
RasmContext *r = s->rctx;
|
||||
|
||||
bool is_read = false;
|
||||
bool is_write = false;
|
||||
switch (p->op) {
|
||||
case AARCH64_SWS_OP_READ_BIT:
|
||||
case AARCH64_SWS_OP_READ_NIBBLE:
|
||||
case AARCH64_SWS_OP_READ_PACKED:
|
||||
case AARCH64_SWS_OP_READ_PLANAR:
|
||||
is_read = true;
|
||||
break;
|
||||
case AARCH64_SWS_OP_WRITE_BIT:
|
||||
case AARCH64_SWS_OP_WRITE_NIBBLE:
|
||||
case AARCH64_SWS_OP_WRITE_PACKED:
|
||||
case AARCH64_SWS_OP_WRITE_PLANAR:
|
||||
is_write = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
char func_name[128];
|
||||
aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
|
||||
rasm_func_begin(r, func_name, true, true);
|
||||
rasm_func_begin(r, func_name, true, !is_read);
|
||||
|
||||
/**
|
||||
* Set up vector register dimensions and reshape all vectors
|
||||
@@ -1416,14 +1424,18 @@ static void asmgen_op_cps(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
|
||||
break;
|
||||
}
|
||||
|
||||
/* Load continuation address and increment impl pointer. */
|
||||
RasmNode *node = rasm_set_current_node(r, s->load_cont_node);
|
||||
RasmOp impl_post = a64op_post(s->impl, sizeof_impl);
|
||||
i_ldr(r, s->cont, impl_post); CMT("SwsFuncPtr cont = (impl++)->cont;");
|
||||
rasm_set_current_node(r, node);
|
||||
|
||||
/* Common end for CPS functions. */
|
||||
i_br (r, s->cont); CMT("jump to cont");
|
||||
if (is_write) {
|
||||
/* Write functions return directly. */
|
||||
i_ret(r);
|
||||
} else {
|
||||
/* Load continuation address and increment impl pointer. */
|
||||
RasmNode *node = rasm_set_current_node(r, s->load_cont_node);
|
||||
RasmOp impl_post = a64op_post(s->impl, sizeof_impl);
|
||||
i_ldr(r, s->cont, impl_post); CMT("SwsFuncPtr cont = (impl++)->cont;");
|
||||
rasm_set_current_node(r, node);
|
||||
/* Common end for remaining CPS functions. */
|
||||
i_br (r, s->cont); CMT("jump to cont");
|
||||
}
|
||||
}
|
||||
|
||||
static void asmgen_op(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
|
||||
@@ -1432,9 +1444,6 @@ static void asmgen_op(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
|
||||
case AARCH64_SWS_OP_PROCESS:
|
||||
asmgen_process(s, p);
|
||||
break;
|
||||
case AARCH64_SWS_OP_PROCESS_RETURN:
|
||||
asmgen_process_return(s, p);
|
||||
break;
|
||||
default:
|
||||
asmgen_op_cps(s, p);
|
||||
break;
|
||||
@@ -1561,9 +1570,11 @@ static int asmgen(void)
|
||||
|
||||
/**
|
||||
* The entry point of the SwsOpFunc is the `process` function. The
|
||||
* first kernel function is called from `process`, and subsequent
|
||||
* kernel functions are chained by directly branching to the next
|
||||
* operation, using a continuation-passing style design. The exit
|
||||
* point of the SwsOpFunc is the `process_return` function.
|
||||
* operation, using a continuation-passing style design. The last
|
||||
* operation must be a write operation, which returns from the call
|
||||
* to the `process` function.
|
||||
*
|
||||
* The GPRs used by the entire call-chain are listed below.
|
||||
*
|
||||
@@ -1586,6 +1597,9 @@ static int asmgen(void)
|
||||
* The read/write data pointers and padding values first use up the
|
||||
* remaining free caller-saved registers, and only then are the
|
||||
* callee-saved registers (r19-r28) used.
|
||||
*
|
||||
* The Link Register (r30) is used when calling the first kernel,
|
||||
* so it must be saved.
|
||||
*/
|
||||
|
||||
/* SwsOpFunc arguments. */
|
||||
|
||||
@@ -7,10 +7,6 @@
|
||||
{ .op = AARCH64_SWS_OP_PROCESS, .mask = 0x0011 },
|
||||
{ .op = AARCH64_SWS_OP_PROCESS, .mask = 0x0111 },
|
||||
{ .op = AARCH64_SWS_OP_PROCESS, .mask = 0x1111 },
|
||||
{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x0001 },
|
||||
{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x0011 },
|
||||
{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x0111 },
|
||||
{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x1111 },
|
||||
{ .op = AARCH64_SWS_OP_READ_BIT, .block_size = 8, .type = AARCH64_PIXEL_U8, .mask = 0x0001 },
|
||||
{ .op = AARCH64_SWS_OP_READ_BIT, .block_size = 16, .type = AARCH64_PIXEL_U8, .mask = 0x0001 },
|
||||
{ .op = AARCH64_SWS_OP_READ_NIBBLE, .block_size = 8, .type = AARCH64_PIXEL_U8, .mask = 0x0001 },
|
||||
|
||||
@@ -77,7 +77,6 @@ static const char *aarch64_pixel_type_name(SwsAArch64PixelType fmt)
|
||||
static const char op_types[AARCH64_SWS_OP_TYPE_NB][32] = {
|
||||
[AARCH64_SWS_OP_NONE ] = "AARCH64_SWS_OP_NONE",
|
||||
[AARCH64_SWS_OP_PROCESS ] = "AARCH64_SWS_OP_PROCESS",
|
||||
[AARCH64_SWS_OP_PROCESS_RETURN] = "AARCH64_SWS_OP_PROCESS_RETURN",
|
||||
[AARCH64_SWS_OP_READ_BIT ] = "AARCH64_SWS_OP_READ_BIT",
|
||||
[AARCH64_SWS_OP_READ_NIBBLE ] = "AARCH64_SWS_OP_READ_NIBBLE",
|
||||
[AARCH64_SWS_OP_READ_PACKED ] = "AARCH64_SWS_OP_READ_PACKED",
|
||||
@@ -114,7 +113,6 @@ static const char *aarch64_op_type(SwsAArch64OpType op)
|
||||
static const char op_type_names[AARCH64_SWS_OP_TYPE_NB][16] = {
|
||||
[AARCH64_SWS_OP_NONE ] = "none",
|
||||
[AARCH64_SWS_OP_PROCESS ] = "process",
|
||||
[AARCH64_SWS_OP_PROCESS_RETURN] = "process_return",
|
||||
[AARCH64_SWS_OP_READ_BIT ] = "read_bit",
|
||||
[AARCH64_SWS_OP_READ_NIBBLE ] = "read_nibble",
|
||||
[AARCH64_SWS_OP_READ_PACKED ] = "read_packed",
|
||||
@@ -326,7 +324,6 @@ static const ParamField field_dither_size_log2 = { PARAM_FIELD(dither.size_log2)
|
||||
#define MAX_LEVELS 8
|
||||
static const ParamField *op_fields[AARCH64_SWS_OP_TYPE_NB][MAX_LEVELS] = {
|
||||
[AARCH64_SWS_OP_PROCESS ] = { &field_op, &field_mask },
|
||||
[AARCH64_SWS_OP_PROCESS_RETURN] = { &field_op, &field_mask },
|
||||
[AARCH64_SWS_OP_READ_BIT ] = { &field_op, &field_block_size, &field_type, &field_mask },
|
||||
[AARCH64_SWS_OP_READ_NIBBLE ] = { &field_op, &field_block_size, &field_type, &field_mask },
|
||||
[AARCH64_SWS_OP_READ_PACKED ] = { &field_op, &field_block_size, &field_type, &field_mask },
|
||||
|
||||
@@ -38,7 +38,6 @@ typedef enum SwsAArch64PixelType {
|
||||
typedef enum SwsAArch64OpType {
|
||||
AARCH64_SWS_OP_NONE = 0,
|
||||
AARCH64_SWS_OP_PROCESS,
|
||||
AARCH64_SWS_OP_PROCESS_RETURN,
|
||||
AARCH64_SWS_OP_READ_BIT,
|
||||
AARCH64_SWS_OP_READ_NIBBLE,
|
||||
AARCH64_SWS_OP_READ_PACKED,
|
||||
|
||||
@@ -72,7 +72,7 @@ error:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Collect the parameters for the process/process_return functions. */
|
||||
/* Collect the parameters for the process function. */
|
||||
static int aarch64_collect_process(const SwsOpList *ops, struct AVTreeNode **root)
|
||||
{
|
||||
const SwsOp *read = ff_sws_op_list_input(ops);
|
||||
@@ -89,11 +89,6 @@ static int aarch64_collect_process(const SwsOpList *ops, struct AVTreeNode **roo
|
||||
.mask = mask,
|
||||
};
|
||||
|
||||
ret = aarch64_collect_op(&params, root);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
params.op = AARCH64_SWS_OP_PROCESS_RETURN;
|
||||
ret = aarch64_collect_op(&params, root);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
Reference in New Issue
Block a user