Files
Niklas Haas and Niklas Haas 625ab011f4 swscale/uops: add default fallback for translate_op()
Makes it a bit easier to add ops and uops in separate commits.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
2026-06-11 16:27:47 +00:00

1070 lines
34 KiB
C

/**
* Copyright (C) 2026 Niklas Haas
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <limits.h>
#include <stdbool.h>
#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/refstruct.h"
#include "libavutil/tree.h"
#include "ops.h"
#include "ops_internal.h"
#include "uops.h"
/**
 * Ordering function over uops.
 *
 * Keys are compared in this order: pixel type, uop kind, component mask and
 * finally the raw bytes of the parameter union. The `data` member does not
 * take part in the comparison.
 *
 * @return zero if both uops compare equal, otherwise a non-zero value whose
 *         sign reflects the first differing key
 */
int ff_sws_uop_cmp(const SwsUOp *a, const SwsUOp *b)
{
    /* All scalar keys agree: the parameter bytes decide */
    if (a->type == b->type && a->uop == b->uop && a->mask == b->mask)
        return memcmp(&a->par, &b->par, sizeof(a->par));

    if (a->type != b->type)
        return (int) a->type - b->type;
    if (a->uop != b->uop)
        return (int) a->uop - b->uop;
    return (int) a->mask - b->mask;
}
/**
 * Name table indexed by uop kind (SWS_UOP_*).
 *
 * - full:  the enum identifier as a string ("SWS_UOP_" followed by the name),
 *          printed into the generated macro entries
 * - abbr:  the lowercase short name, used by ff_sws_uop_name()
 * - macro: the bare identifier suffix, used to build the SWS_FOR_* macro names
 */
static const struct {
char full[32];
char abbr[32];
char macro[32];
} uop_names[SWS_UOP_TYPE_NB] = {
/* Fills one designated entry; OP is stringified for `full` and `macro` */
#define UOP_NAME(OP, ABBR) [SWS_UOP_##OP] = { "SWS_UOP_" #OP, ABBR, #OP }
UOP_NAME(INVALID, "invalid"),
UOP_NAME(READ_PLANAR, "read_planar"),
UOP_NAME(READ_PLANAR_FH, "read_planar_fh"),
UOP_NAME(READ_PLANAR_FV, "read_planar_fv"),
UOP_NAME(READ_PLANAR_FV_FMA,"read_planar_fv_fma"),
UOP_NAME(READ_PACKED, "read_packed"),
UOP_NAME(READ_NIBBLE, "read_nibble"),
UOP_NAME(READ_BIT, "read_bit"),
UOP_NAME(WRITE_PLANAR, "write_planar"),
UOP_NAME(WRITE_PACKED, "write_packed"),
UOP_NAME(WRITE_NIBBLE, "write_nibble"),
UOP_NAME(WRITE_BIT, "write_bit"),
UOP_NAME(PERMUTE, "permute"),
UOP_NAME(COPY, "copy"),
UOP_NAME(MOVE, "move"),
UOP_NAME(SWAP_BYTES, "swap_bytes"),
UOP_NAME(EXPAND_BIT, "expand_bit"),
UOP_NAME(EXPAND_PAIR, "expand_pair"),
UOP_NAME(EXPAND_QUAD, "expand_quad"),
UOP_NAME(TO_U8, "to_u8"),
UOP_NAME(TO_U16, "to_u16"),
UOP_NAME(TO_U32, "to_u32"),
UOP_NAME(TO_F32, "to_f32"),
UOP_NAME(SCALE, "scale"),
UOP_NAME(LINEAR, "linear"),
UOP_NAME(LINEAR_FMA, "linear_fma"),
UOP_NAME(ADD, "add"),
UOP_NAME(MIN, "min"),
UOP_NAME(MAX, "max"),
UOP_NAME(UNPACK, "unpack"),
UOP_NAME(PACK, "pack"),
UOP_NAME(LSHIFT, "lshift"),
UOP_NAME(RSHIFT, "rshift"),
UOP_NAME(CLEAR, "clear"),
UOP_NAME(DITHER, "dither"),
#undef UOP_NAME
};
/**
 * Name table indexed by pixel type (SWS_PIXEL_*).
 *
 * - full:   the enum identifier as a string, printed into generated entries
 * - prefix: the type prefix (including trailing underscore) used when
 *           building the SWS_FOR_* macro names; empty for SWS_PIXEL_NONE
 */
static const struct {
char full[16];
char prefix[8];
} pixel_types[SWS_PIXEL_TYPE_NB] = {
[SWS_PIXEL_NONE] = { "SWS_PIXEL_NONE", "" },
[SWS_PIXEL_U8] = { "SWS_PIXEL_U8", "U8_" },
[SWS_PIXEL_U16] = { "SWS_PIXEL_U16", "U16_" },
[SWS_PIXEL_U32] = { "SWS_PIXEL_U32", "U32_" },
[SWS_PIXEL_F32] = { "SWS_PIXEL_F32", "F32_" },
};
/**
 * Convert a rational constant into a pixel value of the given type.
 *
 * Integer types receive the truncated quotient num / den, SWS_PIXEL_F32
 * receives the quotient computed in single precision.
 *
 * @param type target pixel type; must be one of the U8/U16/U32/F32 types
 * @param val  rational value, the denominator must be non-zero
 */
static SwsPixel pixel_from_q(SwsPixelType type, AVRational val)
{
    av_assert1(val.den != 0);

    if (type == SWS_PIXEL_U8)
        return (SwsPixel) { .u8 = val.num / val.den };
    if (type == SWS_PIXEL_U16)
        return (SwsPixel) { .u16 = val.num / val.den };
    if (type == SWS_PIXEL_U32)
        return (SwsPixel) { .u32 = val.num / val.den };
    if (type == SWS_PIXEL_F32)
        return (SwsPixel) { .f32 = (float) val.num / val.den };

    /* SWS_PIXEL_NONE, SWS_PIXEL_TYPE_NB and anything else */
    av_unreachable("Invalid pixel type!");
    return (SwsPixel) {0};
}

/* Shorthand converting to the type of the `op` variable in the caller's scope */
#define Q2PIXEL(val) pixel_from_q(op->type, val)
/**
 * Check whether a pixel value has all bits set, judged by the byte size of
 * its type (1, 2 or 4 bytes).
 */
static bool pixel_is_1s(SwsPixelType type, SwsPixel val)
{
    const int bytes = ff_sws_pixel_type_size(type);

    if (bytes == 1)
        return val.u8 == UINT8_MAX;
    if (bytes == 2)
        return val.u16 == UINT16_MAX;
    if (bytes == 4)
        return val.u32 == UINT32_MAX;

    av_unreachable("Invalid pixel type!");
    return false;
}
/**
 * Write a name for `op` into `buf`.
 *
 * The name is assembled from: the pixel type name (omitted for
 * SWS_PIXEL_NONE), the abbreviated uop name, the component mask spelled with
 * the letters "xyzw" (omitted when the mask is empty), and a suffix derived
 * from the uop's parameters for the uop kinds handled in the switch below.
 *
 * Asserts that the result fits into SWS_UOP_NAME_MAX bytes.
 */
void ff_sws_uop_name(const SwsUOp *op, char buf[SWS_UOP_NAME_MAX])
{
AVBPrint bp;
av_bprint_init_for_buffer(&bp, buf, SWS_UOP_NAME_MAX);
if (op->type != SWS_PIXEL_NONE)
av_bprintf(&bp, "%s_", ff_sws_pixel_type_name(op->type));
av_bprintf(&bp, "%s", uop_names[op->uop].abbr);
/* One letter per component present in the mask */
if (op->mask) {
av_bprint_chars(&bp, '_', 1);
for (int i = 0; i < 4; i++) {
if (SWS_COMP_TEST(op->mask, i))
av_bprint_chars(&bp, "xyzw"[i], 1);
}
}
const SwsUOpParams *par = &op->par;
switch (op->uop) {
case SWS_UOP_READ_PLANAR_FH:
case SWS_UOP_READ_PLANAR_FV:
case SWS_UOP_READ_PLANAR_FV_FMA:
/* Filtered reads: append the filter's pixel type */
av_bprintf(&bp, "_%s", ff_sws_pixel_type_name(par->filter.type));
break;
case SWS_UOP_LSHIFT:
case SWS_UOP_RSHIFT:
av_bprintf(&bp, "_%u", par->shift.amount);
break;
case SWS_UOP_PERMUTE:
case SWS_UOP_COPY:
/* For each masked component, the letter of its source component */
av_bprint_chars(&bp, '_', 1);
for (int i = 0; i < 4; i++) {
if (SWS_COMP_TEST(op->mask, i))
av_bprint_chars(&bp, "xyzw"[par->swizzle.in[i]], 1);
}
break;
case SWS_UOP_MOVE:
/* Destination list, then source list; index -1 is printed as 't' */
av_bprint_chars(&bp, '_', 1);
for (int i = 0; i < par->move.num_moves; i++)
av_bprint_chars(&bp, "txyzw"[par->move.dst[i] + 1], 1);
av_bprint_chars(&bp, '_', 1);
for (int i = 0; i < par->move.num_moves; i++)
av_bprint_chars(&bp, "txyzw"[par->move.src[i] + 1], 1);
break;
case SWS_UOP_PACK:
case SWS_UOP_UNPACK:
/* Pattern entries as hex digits, stopping at the first zero entry */
av_bprint_chars(&bp, '_', 1);
for (int i = 0; i < 4 && par->pack.pattern[i]; i++)
av_bprintf(&bp, "%x", par->pack.pattern[i]);
break;
case SWS_UOP_CLEAR:
/* Per masked component: '1' if in clear.one, '0' if in clear.zero,
 * otherwise 'x' */
av_bprint_chars(&bp, '_', 1);
for (int i = 0; i < 4; i++) {
if (!SWS_COMP_TEST(op->mask, i))
continue;
else if (SWS_COMP_TEST(par->clear.one, i))
av_bprint_chars(&bp, '1', 1);
else if (SWS_COMP_TEST(par->clear.zero, i))
av_bprint_chars(&bp, '0', 1);
else
av_bprint_chars(&bp, 'x', 1);
}
break;
case SWS_UOP_LINEAR:
case SWS_UOP_LINEAR_FMA:
/* One group of five characters per masked row; each coefficient is
 * classified as '1', '0', 'X' (flagged in lin.exact) or 'x' */
for (int i = 0; i < 4; i++) {
if (!SWS_COMP_TEST(op->mask, i))
continue;
av_bprint_chars(&bp, '_', 1);
for (int j = 0; j < 5; j++) {
if (par->lin.one & SWS_MASK(i, j))
av_bprint_chars(&bp, '1', 1);
else if (par->lin.zero & SWS_MASK(i, j))
av_bprint_chars(&bp, '0', 1);
else if (par->lin.exact & SWS_MASK(i, j))
av_bprint_chars(&bp, 'X', 1);
else
av_bprint_chars(&bp, 'x', 1);
}
}
break;
case SWS_UOP_DITHER:
/* Row offset of every masked component, then the matrix dimensions */
for (int i = 0; i < 4; i++) {
if (SWS_COMP_TEST(op->mask, i))
av_bprintf(&bp, "_%d", par->dither.y_offset[i]);
}
const unsigned size = 1u << par->dither.size_log2;
av_bprintf(&bp, "_%ux%u", size, size);
break;
}
/* Truncated names are treated as a programming error */
av_assert0(av_bprint_is_complete(&bp));
}
/**
 * av_tree_enumerate() callback emitting one MACRO() invocation for the
 * SWS_FOR_STRUCT_* macros.
 *
 * The entry starts with a line continuation, followed by the uop name and its
 * fields written in designated-initializer syntax (.type, .uop, .mask and the
 * relevant .par members for the uop kinds handled below).
 *
 * @param opaque search key; its data.opaque member carries the AVBPrint
 *               that receives the output
 * @param key    the tree element (a SwsUOp) to print
 * @return always 0
 */
static int generate_entry_struct(void *opaque, void *key)
{
const SwsUOp *ref = opaque;
const SwsUOp *uop = key;
AVBPrint *bp = ref->data.opaque;
char name[SWS_UOP_NAME_MAX];
ff_sws_uop_name(uop, name);
av_bprintf(bp, " \\\n MACRO(__VA_ARGS__, %-40s", name);
av_bprintf(bp, ", .type = %-13s, .uop = %-24s, .mask = 0x%x",
pixel_types[uop->type].full, uop_names[uop->uop].full, uop->mask);
const SwsUOpParams *par = &uop->par;
/* Append the parameters relevant for this uop kind, if any */
switch (uop->uop) {
case SWS_UOP_READ_PLANAR_FH:
case SWS_UOP_READ_PLANAR_FV:
case SWS_UOP_READ_PLANAR_FV_FMA:
av_bprintf(bp, ", .par.filter.type = %s", pixel_types[par->filter.type].full);
break;
case SWS_UOP_LSHIFT:
case SWS_UOP_RSHIFT:
av_bprintf(bp, ", .par.shift.amount = %u", par->shift.amount);
break;
case SWS_UOP_PERMUTE:
case SWS_UOP_COPY:
av_bprintf(bp, ", .par.swizzle.in = {%d, %d, %d, %d}",
par->swizzle.in[0], par->swizzle.in[1],
par->swizzle.in[2], par->swizzle.in[3]);
break;
case SWS_UOP_MOVE:
/* All six dst/src slots are printed regardless of num_moves */
av_bprintf(bp, ", .par.move.num_moves = %d", par->move.num_moves);
av_bprintf(bp, ", .par.move.dst = {%d, %d, %d, %d, %d, %d}",
par->move.dst[0], par->move.dst[1], par->move.dst[2],
par->move.dst[3], par->move.dst[4], par->move.dst[5]);
av_bprintf(bp, ", .par.move.src = {%d, %d, %d, %d, %d, %d}",
par->move.src[0], par->move.src[1], par->move.src[2],
par->move.src[3], par->move.src[4], par->move.src[5]);
break;
case SWS_UOP_PACK:
case SWS_UOP_UNPACK:
av_bprintf(bp, ", .par.pack.pattern = {%d, %d, %d, %d}",
par->pack.pattern[0], par->pack.pattern[1],
par->pack.pattern[2], par->pack.pattern[3]);
break;
case SWS_UOP_CLEAR:
av_bprintf(bp, ", .par.clear.one = 0x%x, .par.clear.zero = 0x%x",
par->clear.one, par->clear.zero);
break;
case SWS_UOP_LINEAR:
case SWS_UOP_LINEAR_FMA:
av_bprintf(bp, ", .par.lin.one = 0x%x, .par.lin.zero = 0x%x",
par->lin.one, par->lin.zero);
/* The exact mask is only emitted for the FMA variant */
if (uop->uop == SWS_UOP_LINEAR_FMA)
av_bprintf(bp, ", .par.lin.exact = 0x%x", par->lin.exact);
break;
case SWS_UOP_DITHER:
av_bprintf(bp, ", .par.dither = { .y_offset = {%u, %u, %u, %u}, .size_log2 = %u }",
par->dither.y_offset[0], par->dither.y_offset[1],
par->dither.y_offset[2], par->dither.y_offset[3],
par->dither.size_log2);
break;
}
av_bprintf(bp, ")");
return 0;
}
/**
 * av_tree_enumerate() callback emitting one MACRO() invocation for the plain
 * SWS_FOR_* macros.
 *
 * Prints the same information as generate_entry_struct(), but as a flat list
 * of comma-separated values: name, pixel type, uop kind, mask, followed by
 * the parameters of the uop kinds handled below.
 *
 * @param opaque search key; its data.opaque member carries the AVBPrint
 *               that receives the output
 * @param key    the tree element (a SwsUOp) to print
 * @return always 0
 */
static int generate_entry_args(void *opaque, void *key)
{
const SwsUOp *ref = opaque;
const SwsUOp *uop = key;
AVBPrint *bp = ref->data.opaque;
char name[SWS_UOP_NAME_MAX];
ff_sws_uop_name(uop, name);
av_bprintf(bp, " \\\n MACRO(__VA_ARGS__, %-40s, %-13s, %-24s, 0x%x",
name, pixel_types[uop->type].full, uop_names[uop->uop].full, uop->mask);
const SwsUOpParams *par = &uop->par;
/* Append the parameters relevant for this uop kind, if any */
switch (uop->uop) {
case SWS_UOP_READ_PLANAR_FH:
case SWS_UOP_READ_PLANAR_FV:
case SWS_UOP_READ_PLANAR_FV_FMA:
av_bprintf(bp, ", %s", pixel_types[par->filter.type].full);
break;
case SWS_UOP_LSHIFT:
case SWS_UOP_RSHIFT:
av_bprintf(bp, ", %u", par->shift.amount);
break;
case SWS_UOP_PERMUTE:
case SWS_UOP_COPY:
av_bprintf(bp, ", %d, %d, %d, %d",
par->swizzle.in[0], par->swizzle.in[1],
par->swizzle.in[2], par->swizzle.in[3]);
break;
case SWS_UOP_MOVE:
/* Move count, then all six dst slots, then all six src slots */
av_bprintf(bp, ", %d", par->move.num_moves);
av_bprintf(bp, ", %d, %d, %d, %d, %d, %d",
par->move.dst[0], par->move.dst[1], par->move.dst[2],
par->move.dst[3], par->move.dst[4], par->move.dst[5]);
av_bprintf(bp, ", %d, %d, %d, %d, %d, %d",
par->move.src[0], par->move.src[1], par->move.src[2],
par->move.src[3], par->move.src[4], par->move.src[5]);
break;
case SWS_UOP_PACK:
case SWS_UOP_UNPACK:
av_bprintf(bp, ", %d, %d, %d, %d",
par->pack.pattern[0], par->pack.pattern[1],
par->pack.pattern[2], par->pack.pattern[3]);
break;
case SWS_UOP_CLEAR:
av_bprintf(bp, ", 0x%05x, 0x%05x", par->clear.one, par->clear.zero);
break;
case SWS_UOP_LINEAR:
case SWS_UOP_LINEAR_FMA:
av_bprintf(bp, ", 0x%05x, 0x%05x", par->lin.one, par->lin.zero);
/* The exact mask is only emitted for the FMA variant */
if (uop->uop == SWS_UOP_LINEAR_FMA)
av_bprintf(bp, ", 0x%05x", par->lin.exact);
break;
case SWS_UOP_DITHER:
av_bprintf(bp, ", %u, %u, %u, %u, %u",
par->dither.y_offset[0], par->dither.y_offset[1],
par->dither.y_offset[2], par->dither.y_offset[3],
par->dither.size_log2);
break;
}
av_bprintf(bp, ")");
return 0;
}
/**
 * Drop the reference held by a uop (dither matrix or filter kernel, depending
 * on the uop kind) and reset the whole uop to zero.
 */
static void uop_uninit(SwsUOp *uop)
{
    if (uop->uop == SWS_UOP_DITHER) {
        av_refstruct_unref(&uop->data.ptr);
    } else if (uop->uop == SWS_UOP_READ_PLANAR_FH ||
               uop->uop == SWS_UOP_READ_PLANAR_FV ||
               uop->uop == SWS_UOP_READ_PLANAR_FV_FMA) {
        av_refstruct_unref(&uop->data.kernel);
    }

    *uop = (SwsUOp) {0};
}
/**
 * Free a uop list together with every uop it contains, and set the caller's
 * pointer to NULL. Does nothing if `*p_ops` is already NULL.
 */
void ff_sws_uop_list_free(SwsUOpList **p_ops)
{
    SwsUOpList *list = *p_ops;

    if (list) {
        int idx = 0;
        while (idx < list->num_ops)
            uop_uninit(&list->ops[idx++]);

        av_freep(&list->ops);
        av_free(list);
        *p_ops = NULL;
    }
}
/**
 * Allocate an empty, zero-initialized uop list.
 *
 * @return the new list, or NULL if the allocation failed
 */
SwsUOpList *ff_sws_uop_list_alloc(void)
{
    SwsUOpList *list = av_mallocz(sizeof(*list));
    return list;
}
/**
 * Append a uop to the end of a list, taking over ownership of its contents.
 *
 * On success the uop is copied into the list and `*uop` is reset to zero, so
 * the references it carried now belong to the list. On failure `*uop` is
 * uninitialized (its reference is released) and the list is left untouched,
 * so that a later ff_sws_uop_list_free() still releases every uop that was
 * appended before.
 *
 * The storage is grown by hand instead of via av_dynarray2_add(), because the
 * latter frees the whole array and resets the element count on failure, which
 * would drop the existing entries without releasing the references they hold.
 *
 * @param uops list to append to
 * @param uop  uop to move into the list; always reset on return
 * @return 0 on success, AVERROR(ENOMEM) if the list could not be grown
 */
int ff_sws_uop_list_append(SwsUOpList *uops, SwsUOp *uop)
{
    const int num = uops->num_ops;

    /* Capacity is implicit: reallocate whenever the count is zero or a power
     * of two, doubling the storage (same growth scheme as av_dynarray2_add) */
    if (!(num & (num - 1))) {
        SwsUOp *grown = NULL;
        if (num <= INT_MAX / 2) {
            const size_t capacity = num ? 2 * (size_t) num : 1;
            grown = av_realloc_array(uops->ops, capacity, sizeof(*uops->ops));
        }
        if (!grown) {
            uop_uninit(uop);
            return AVERROR(ENOMEM);
        }
        uops->ops = grown;
    }

    uops->ops[num] = *uop;
    uops->num_ops  = num + 1;

    /* Ownership has moved into the list */
    *uop = (SwsUOp) {0};
    return 0;
}
/**
 * Number of matrix rows needed for a dither uop: the matrix size
 * (1 << size_log2) plus the largest of the four per-component row offsets.
 */
int ff_sws_dither_height(const SwsDitherUOp *dither)
{
    int tallest = 0;

    for (int comp = 0; comp < 4; comp++) {
        if (!(tallest > dither->y_offset[comp]))
            tallest = dither->y_offset[comp];
    }

    return (1 << dither->size_log2) + tallest;
}
/**
 * Map a pixel type onto the unsigned integer pixel type of the same byte
 * size (1 -> U8, 2 -> U16, 4 -> U32).
 */
static SwsPixelType pixel_type_to_int(const SwsPixelType type)
{
    const int bytes = ff_sws_pixel_type_size(type);

    if (bytes == 1)
        return SWS_PIXEL_U8;
    if (bytes == 2)
        return SWS_PIXEL_U16;
    if (bytes == 4)
        return SWS_PIXEL_U32;

    av_unreachable("Invalid pixel type!");
    return SWS_PIXEL_NONE;
}
/**
 * Test whether the single-precision product a * b can be undone exactly,
 * i.e. whether dividing the rounded product by b gives back a.
 *
 * A zero factor b always counts as exact. The intermediates are volatile so
 * that each step is stored as an actual float.
 */
static bool exact_product_f32(float a, float b)
{
    if (b == 0.0f)
        return true;

    volatile float product  = a * b;
    volatile float quotient = product / b;
    return quotient == a;
}
/**
 * Decide whether multiplying component `idx` by `coef` is exact over the
 * component's known value range.
 *
 * Integer pixel types are always reported as exact. For SWS_PIXEL_F32 the
 * product is checked at both range bounds; unknown bounds (zero denominator)
 * are reported as inexact.
 */
static bool exact_prod(SwsPixelType type, SwsPixel coef,
                       const SwsComps *comps, int idx)
{
    if (ff_sws_pixel_type_is_int(type))
        return true;

    const AVRational lower = comps->min[idx];
    const AVRational upper = comps->max[idx];
    if (lower.den == 0 || upper.den == 0)
        return false; /* unknown bounds */

    const SwsPixel lo = pixel_from_q(type, lower);
    const SwsPixel hi = pixel_from_q(type, upper);
    if (type == SWS_PIXEL_F32) {
        if (!exact_product_f32(coef.f32, lo.f32))
            return false;
        return exact_product_f32(coef.f32, hi.f32);
    }

    av_unreachable("Invalid pixel type!");
    return false;
}
/**
 * Decide whether a vertically filtered read may use the FMA variant.
 *
 * FMA must be enabled in `flags`. Without SWS_BITEXACT it is then always
 * allowed; with SWS_BITEXACT only integer pixel types whose maximum value
 * scaled by SWS_FILTER_SCALE stays within the limit below qualify.
 */
static bool check_filter_fma(SwsContext *ctx, SwsUOpFlags flags, const SwsOp *op)
{
    if (flags & SWS_UOP_FLAG_FMA) {
        if (ctx->flags & SWS_BITEXACT) {
            if (ff_sws_pixel_type_is_int(op->type)) {
                const int bits = ff_sws_pixel_type_size(op->type) * 8;
                const uint64_t max_val = UINT64_MAX >> (64 - bits);

                /* Maximum value representable losslessly as float. Note that
                 * this is currently true only for U8, but that may change if
                 * we ever update the value of SWS_FILTER_SCALE. */
                return max_val * SWS_FILTER_SCALE <= (1 << 22);
            }
            return false;
        }
        return true;
    }
    return false;
}
/**
 * Translate a read or write op into the matching uop and append it to `ops`.
 *
 * The component mask covers the first `rw.elems` components. Filtered
 * accesses are only supported for non-fractional planar reads and keep the
 * op's pixel type; all other accesses use the integer type of the same size
 * and are mapped to packed, bit, nibble or planar uops.
 *
 * @return 0 on success, AVERROR(ENOTSUP) for unsupported combinations, or
 *         the error returned by ff_sws_uop_list_append()
 */
static int translate_rw_op(SwsContext *ctx, SwsUOpList *ops, SwsUOpFlags flags,
                           const SwsOp *op)
{
    const bool reading = op->op == SWS_OP_READ;
    SwsUOp uop = {
        .type = op->type,
        .mask = SWS_COMP_MASK(op->rw.elems > 0, op->rw.elems > 1,
                              op->rw.elems > 2, op->rw.elems > 3),
    };

    if (op->rw.filter.op) {
        if (op->op == SWS_OP_WRITE || op->rw.frac || op->rw.mode != SWS_RW_PLANAR)
            return AVERROR(ENOTSUP);

        uop.par.filter.type = op->rw.filter.type;
        uop.data.kernel = av_refstruct_ref(op->rw.filter.kernel);
        if (op->rw.filter.op == SWS_OP_FILTER_H)
            uop.uop = SWS_UOP_READ_PLANAR_FH;
        else
            uop.uop = check_filter_fma(ctx, flags, op) ? SWS_UOP_READ_PLANAR_FV_FMA
                                                       : SWS_UOP_READ_PLANAR_FV;
        return ff_sws_uop_list_append(ops, &uop);
    }

    /* Non-filtered reads don't care about the exact pixel contents */
    uop.type = pixel_type_to_int(op->type);

    if (op->rw.mode == SWS_RW_PACKED && op->rw.elems > 1) {
        if (op->rw.frac)
            return AVERROR(ENOTSUP);
        uop.uop = reading ? SWS_UOP_READ_PACKED : SWS_UOP_WRITE_PACKED;
    } else {
        switch (op->rw.frac) {
        case 3:
            uop.uop = reading ? SWS_UOP_READ_BIT : SWS_UOP_WRITE_BIT;
            break;
        case 1:
            uop.uop = reading ? SWS_UOP_READ_NIBBLE : SWS_UOP_WRITE_NIBBLE;
            break;
        default:
            av_assert0(!op->rw.frac);
            uop.uop = reading ? SWS_UOP_READ_PLANAR : SWS_UOP_WRITE_PLANAR;
            break;
        }
    }

    return ff_sws_uop_list_append(ops, &uop);
}
/**
 * Count how many of the first `size` elements of `arr` are equal to `val`.
 */
static int count_idx(const int *arr, size_t size, int val)
{
    int hits = 0;

    /* Order of traversal does not matter for counting */
    while (size-- > 0)
        hits += arr[size] == val;

    return hits;
}
/**
 * Translate a swizzle op into a single SWS_UOP_MOVE uop, i.e. an ordered list
 * of register-to-register moves, and append it to `ops`.
 *
 * Registers 0..3 are the four components; index 4 of the local `idx` array is
 * a scratch register, which is stored as -1 in the uop's dst/src arrays.
 * Moves are chosen so that no still-needed value is overwritten while it only
 * lives in one register; when no such move exists, one value is first saved
 * to the scratch register.
 *
 * @return the result of ff_sws_uop_list_append()
 */
static int translate_move(SwsUOpList *ops, const SwsOp *op)
{
SwsUOp uop = {
.uop = SWS_UOP_MOVE,
.type = pixel_type_to_int(op->type),
};
SwsMoveUOp *par = &uop.par.move;
/* Mask of components that are not yet satisfied */
SwsCompMask todo = ff_sws_comp_mask_needed(op);
/* Components that already map to themselves need no move */
for (int i = 0; i < 4; i++) {
if (op->swizzle.in[i] == i)
todo &= ~SWS_COMP(i);
}
/* Mask of components whose value is required for the final output */
SwsCompMask needed = 0;
for (int i = 0; i < 4; i++) {
if (SWS_OP_NEEDED(op, i))
needed |= SWS_COMP(op->swizzle.in[i]);
}
/* Current mapping of registers to components */
int idx[4 + 1] = { 0, 1, 2, 3, -1 }; /* +1 for tmp */
/* Decompose the swizzle mask into a series of register-register moves */
while (todo) {
int dst = -1, src = -1;
/* Find next unsatisfied dst <- src move that doesn't clobber a value */
for (dst = 0; dst < 4; dst++) {
if (!SWS_COMP_TEST(todo, dst))
continue; /* already satisfied */
const int cur = idx[dst];
if (count_idx(idx, FF_ARRAY_ELEMS(idx), cur) == 1 && SWS_COMP_TEST(needed, cur))
continue; /* clobbers last remaining, still-needed value */
/* Locate the first register currently holding the wanted component */
for (src = 0; src < FF_ARRAY_ELEMS(idx); src++) {
if (idx[src] == op->swizzle.in[dst]) {
/* Prevent read-after-write dependency. */
/* NOTE(review): par->src[] stores -1 for the scratch register, so if
 * the previous move read from scratch, `src` becomes -1 here and is
 * then used as idx[src] below - confirm this cannot happen. */
if (par->num_moves > 0 && src == par->dst[par->num_moves - 1])
src = par->src[par->num_moves - 1];
break;
}
}
av_assert1(src < FF_ARRAY_ELEMS(idx));
todo &= ~SWS_COMP(dst);
break;
}
if (dst == 4) {
/* Stuck in a cycle, break it by saving to the scratch register */
dst = 4;
/* Save the first pending register; its value then lives in the scratch
 * register, so the original no longer counts as needed */
for (src = 0; src < 4; src++) {
if (SWS_COMP_TEST(todo, src)) {
needed &= ~SWS_COMP(idx[src]);
break;
}
}
av_assert1(src < 4);
}
av_assert0(par->num_moves < SWS_UOP_MOVE_MAX);
/* Record the move; the scratch register (index 4) is encoded as -1 */
par->dst[par->num_moves] = dst > 3 ? -1 : dst;
par->src[par->num_moves] = src > 3 ? -1 : src;
par->num_moves++;
idx[dst] = idx[src];
}
return ff_sws_uop_list_append(ops, &uop);
}
/**
 * Translate a swizzle op and append the result to `ops`.
 *
 * With SWS_UOP_FLAG_MOVE set, the swizzle is expressed as a move list via
 * translate_move(). Otherwise it becomes SWS_UOP_PERMUTE, or SWS_UOP_COPY if
 * any source component is used by more than one needed output. Components
 * that end up mapping to themselves are removed from the uop's mask.
 *
 * @return the result of ff_sws_uop_list_append() / translate_move()
 */
static int translate_swizzle(SwsUOpList *ops, SwsUOpFlags flags, const SwsOp *op)
{
if (flags & SWS_UOP_FLAG_MOVE)
return translate_move(ops, op);
SwsUOp uop = {
.type = pixel_type_to_int(op->type),
.uop = SWS_UOP_PERMUTE,
.mask = ff_sws_comp_mask_needed(op),
.par.swizzle.in = {0, 1, 2, 3},
};
/* Sources used so far; a repeated source turns the uop into a copy */
SwsCompMask seen = 0;
for (int i = 0; i < 4; i++) {
if (!SWS_COMP_TEST(uop.mask, i))
continue;
const int src = op->swizzle.in[i];
if (SWS_COMP_TEST(seen, src))
uop.uop = SWS_UOP_COPY; /* Swizzle mask contains duplicates */
seen |= SWS_COMP(src);
uop.par.swizzle.in[i] = src;
}
if (uop.uop == SWS_UOP_PERMUTE) {
/* Prevent overlap by moving unused components to unseen indices */
for (int i = 0; i < 4; i++) {
if (SWS_COMP_TEST(uop.mask, i))
continue;
/* Prefer identity mapping if possible */
int unused = i;
if (SWS_COMP_TEST(seen, i)) {
/* Own index already taken: pick the lowest source not yet used */
for (int j = 0; j < 4; j++) {
if (!SWS_COMP_TEST(seen, j)) {
unused = j;
break;
}
}
}
uop.par.swizzle.in[i] = unused;
seen |= SWS_COMP(unused);
}
}
/* Remove remaining trivial / identity components from the mask */
for (int i = 0; i < 4; i++) {
if (uop.par.swizzle.in[i] == i)
uop.mask &= ~SWS_COMP(i);
}
return ff_sws_uop_list_append(ops, &uop);
}
/**
 * Translate a dither op and append the result to `ops`.
 *
 * A 1x1 matrix (size_log2 == 0) is emitted as SWS_UOP_ADD with the single
 * matrix value as a per-component constant. Larger matrices are emitted as
 * SWS_UOP_DITHER with a newly allocated, converted copy of the matrix in
 * data.ptr. Components that are not needed or have a negative y_offset are
 * left out of the mask in both cases.
 *
 * @return 0 on success, AVERROR(ENOMEM) if the matrix cannot be allocated,
 *         or the error returned by ff_sws_uop_list_append()
 */
static int translate_dither_op(SwsUOpList *ops, const SwsOp *op)
{
SwsUOp uop = {
.type = op->type,
.uop = SWS_UOP_DITHER,
.par.dither.size_log2 = op->dither.size_log2,
};
if (op->dither.size_log2 == 0) {
/* Constant offset */
const SwsPixel val = Q2PIXEL(op->dither.matrix[0]);
uop.uop = SWS_UOP_ADD;
for (int i = 0; i < 4; i++) {
if (!SWS_OP_NEEDED(op, i) || op->dither.y_offset[i] < 0)
continue;
uop.mask |= SWS_COMP(i);
uop.data.vec4[i] = val;
}
return ff_sws_uop_list_append(ops, &uop);
}
const int size = 1 << op->dither.size_log2;
for (int i = 0; i < 4; i++) {
if (!SWS_OP_NEEDED(op, i) || op->dither.y_offset[i] < 0)
continue;
/* Row offsets are reduced modulo the matrix size */
const uint8_t off = op->dither.y_offset[i] & (size - 1);
uop.mask |= SWS_COMP(i);
uop.par.dither.y_offset[i] = off;
}
/* Allocate extra rows to allow over-reading for row offsets. Note that
* y_offset is currently never larger than 5, so the extra space needed
* for this over-allocation is bounded by 5 * size * sizeof(float),
* typically 320 bytes for a 16x16 dither matrix. */
const int stride = size * sizeof(SwsPixel);
const int num_rows = ff_sws_dither_height(&uop.par.dither);
SwsPixel *matrix = uop.data.ptr = av_refstruct_allocz(num_rows * stride);
if (!matrix)
return AVERROR(ENOMEM);
for (int i = 0; i < size * size; i++)
matrix[i] = Q2PIXEL(op->dither.matrix[i]);
/* Fill the extra rows with a copy of the first rows of the matrix */
memcpy(&matrix[size * size], matrix, (num_rows - size) * stride);
return ff_sws_uop_list_append(ops, &uop);
}
/**
 * Translate a linear op (4 rows of 5 coefficients) and append it to `ops`.
 *
 * Every coefficient is converted to the op's pixel type and stored in
 * data.mat4. Coefficients with a zero numerator are flagged in lin.zero, and
 * those with num == den in lin.one. With SWS_UOP_FLAG_FMA the uop becomes
 * SWS_UOP_LINEAR_FMA and lin.exact additionally records which products are
 * considered exact.
 *
 * @param input component information of the data entering this op, used for
 *              the exactness check
 * @return the result of ff_sws_uop_list_append()
 */
static int translate_linear_op(SwsContext *ctx, SwsUOpList *ops,
SwsUOpFlags flags, const SwsOp *op,
const SwsComps *input)
{
SwsUOp uop = {
.type = op->type,
.uop = SWS_UOP_LINEAR,
};
const bool bitexact = ctx->flags & SWS_BITEXACT;
uint32_t exact = 0;
for (int i = 0; i < 4; i++) {
/* Only needed rows that the op's mask marks as used enter the uop mask */
if (SWS_OP_NEEDED(op, i) && (op->lin.mask & SWS_MASK_ROW(i)))
uop.mask |= SWS_COMP(i);
for (int j = 0; j < 5; j++) {
const AVRational k = op->lin.m[i][j];
const SwsPixel px = Q2PIXEL(k);
uop.data.mat4[i][j] = px;
if (k.num == 0)
uop.par.lin.zero |= SWS_MASK(i, j);
else if (k.num == k.den)
uop.par.lin.one |= SWS_MASK(i, j);
/* Only the first four columns are checked for exactness; without
 * SWS_BITEXACT they are all treated as exact */
else if (j < 4 && (!bitexact || exact_prod(uop.type, px, input, j)))
exact |= SWS_MASK(i, j);
}
}
if (flags & SWS_UOP_FLAG_FMA) {
/* multiplication by 1 and 0 are always exact by definition */
uop.uop = SWS_UOP_LINEAR_FMA;
uop.par.lin.exact = exact | uop.par.lin.zero | uop.par.lin.one;
}
return ff_sws_uop_list_append(ops, &uop);
}
/**
 * Check whether a scale factor equals the maximum value of an integer pixel
 * type (with a denominator of 1). Never true for SWS_PIXEL_F32.
 */
static bool is_expand_bit(SwsPixelType type, AVRational factor)
{
    bool matches;

    if (factor.den != 1)
        return false;

    switch (type) {
    case SWS_PIXEL_U8:
        matches = factor.num == UINT8_MAX;
        break;
    case SWS_PIXEL_U16:
        matches = factor.num == UINT16_MAX;
        break;
    case SWS_PIXEL_U32:
        matches = factor.num == UINT32_MAX;
        break;
    case SWS_PIXEL_F32:
        matches = false;
        break;
    default: /* SWS_PIXEL_NONE, SWS_PIXEL_TYPE_NB */
        av_unreachable("Invalid pixel type!");
        return false;
    }

    return matches;
}
/**
 * Translate a single op into a uop and append it to `uops`.
 *
 * Reads/writes, swizzles, dithers and linear ops are forwarded to their
 * dedicated translators. The remaining "simple" ops are handled inline: they
 * start from a uop with the op's pixel type and the mask of needed
 * components, which each case then refines.
 *
 * @param input component information of the data entering this op; only
 *              forwarded to translate_linear_op()
 * @return 0 on success, AVERROR(ENOTSUP) for filter ops and for ops without
 *         a translation, or the error of the underlying translator / append
 */
static int translate_op(SwsContext *ctx, SwsUOpList *uops, SwsUOpFlags flags,
const SwsOp *op, const SwsComps *input)
{
switch (op->op) {
case SWS_OP_FILTER_H:
case SWS_OP_FILTER_V:
return AVERROR(ENOTSUP); /* always handled by subpass splitting */
case SWS_OP_READ:
case SWS_OP_WRITE:
return translate_rw_op(ctx, uops, flags, op);
case SWS_OP_SWIZZLE:
return translate_swizzle(uops, flags, op);
case SWS_OP_DITHER:
return translate_dither_op(uops, op);
case SWS_OP_LINEAR:
return translate_linear_op(ctx, uops, flags, op, input);
default:
break;
}
/* Default handling for "simple" ops */
SwsUOp uop = {
.type = op->type,
.uop = SWS_UOP_INVALID,
.mask = ff_sws_comp_mask_needed(op),
};
switch (op->op) {
case SWS_OP_CONVERT:
/* Expanding conversions require a U8 source; a target type without a
 * matching case leaves the uop invalid and trips the assert below */
if (op->convert.expand) {
av_assert0(op->type == SWS_PIXEL_U8);
switch (op->convert.to) {
case SWS_PIXEL_U16: uop.uop = SWS_UOP_EXPAND_PAIR; break;
case SWS_PIXEL_U32: uop.uop = SWS_UOP_EXPAND_QUAD; break;
}
} else {
switch (op->convert.to) {
case SWS_PIXEL_U8: uop.uop = SWS_UOP_TO_U8; break;
case SWS_PIXEL_U16: uop.uop = SWS_UOP_TO_U16; break;
case SWS_PIXEL_U32: uop.uop = SWS_UOP_TO_U32; break;
case SWS_PIXEL_F32: uop.uop = SWS_UOP_TO_F32; break;
}
}
break;
case SWS_OP_UNPACK:
case SWS_OP_PACK:
uop.uop = op->op == SWS_OP_PACK ? SWS_UOP_PACK : SWS_UOP_UNPACK;
/* The mask is rebuilt from the pattern: one component per non-zero
 * entry, stopping at the first zero */
uop.mask = 0;
for (int i = 0; i < 4 && op->pack.pattern[i]; i++) {
uop.par.pack.pattern[i] = op->pack.pattern[i];
uop.mask |= SWS_COMP(i);
}
break;
case SWS_OP_LSHIFT:
case SWS_OP_RSHIFT:
uop.uop = op->op == SWS_OP_LSHIFT ? SWS_UOP_LSHIFT : SWS_UOP_RSHIFT;
uop.par.shift.amount = op->shift.amount;
break;
case SWS_OP_CLEAR:
uop.uop = SWS_UOP_CLEAR;
uop.type = pixel_type_to_int(op->type);
uop.mask &= op->clear.mask;
/* Store every cleared component's value and flag the all-zero and
 * all-ones cases */
for (int i = 0; i < 4; i++) {
if (!SWS_COMP_TEST(op->clear.mask, i))
continue;
const AVRational v = op->clear.value[i];
const SwsPixel px = Q2PIXEL(op->clear.value[i]);
uop.data.vec4[i] = px;
if (v.num == 0)
uop.par.clear.zero |= SWS_COMP(i);
else if (pixel_is_1s(op->type, px))
uop.par.clear.one |= SWS_COMP(i);
}
break;
case SWS_OP_SCALE:
/* Scaling by the type's maximum value is emitted as EXPAND_BIT,
 * which carries no scalar */
if (is_expand_bit(op->type, op->scale.factor)) {
uop.uop = SWS_UOP_EXPAND_BIT;
} else {
uop.uop = SWS_UOP_SCALE;
uop.data.scalar = Q2PIXEL(op->scale.factor);
}
break;
case SWS_OP_MIN:
case SWS_OP_MAX:
uop.uop = op->op == SWS_OP_MIN ? SWS_UOP_MIN : SWS_UOP_MAX;
/* Restrict the mask using the per-component limits */
uop.mask &= ff_sws_comp_mask_q4(op->clamp.limit);
for (int i = 0; i < 4; i++) {
if (SWS_COMP_TEST(uop.mask, i))
uop.data.vec4[i] = Q2PIXEL(op->clamp.limit[i]);
}
break;
case SWS_OP_SWAP_BYTES:
uop.uop = SWS_UOP_SWAP_BYTES;
uop.type = pixel_type_to_int(op->type);
break;
default:
/* Op without a uop translation */
return AVERROR(ENOTSUP);
}
av_assert0(uop.uop != SWS_UOP_INVALID);
return ff_sws_uop_list_append(uops, &uop);
}
/**
 * Translate every op of an op list, in order, appending the resulting uops
 * to `uops`.
 *
 * Each op is translated with the component information of its input: the
 * list's source components for the first op, and the previous op's output
 * components afterwards.
 *
 * @return 0 on success, or the first negative error code returned while
 *         translating (uops appended before the failure remain in `uops`)
 */
int ff_sws_ops_translate(SwsContext *ctx, const SwsOpList *ops,
                         SwsUOpFlags flags, SwsUOpList *uops)
{
    SwsComps input = ops->comps_src;
    int pos = 0;

    while (pos < ops->num_ops) {
        const SwsOp *cur = &ops->ops[pos++];
        const int err = translate_op(ctx, uops, flags, cur, &input);
        if (err < 0)
            return err;
        input = cur->comps;
    }

    return 0;
}
/**
 * Insert a copy of `uop` into the tree of known uops, unless an equal entry
 * is already present.
 *
 * The stored copy has its `data` member zeroed, so only type, uop kind, mask
 * and parameters are kept.
 *
 * @return 0 on success (including duplicates), AVERROR(ENOMEM) on
 *         allocation failure
 */
static int register_uop(struct AVTreeNode **root, const SwsUOp *uop)
{
    SwsUOp *key = av_memdup(uop, sizeof(*uop));
    struct AVTreeNode *node = key ? av_tree_node_alloc() : NULL;

    if (!key || !node) {
        av_free(key);
        return AVERROR(ENOMEM);
    }
    memset(&key->data, 0, sizeof(key->data));

    av_tree_insert(root, key, ff_sws_uop_cmp_v, &node);

    /* The node is handed back when the key was not inserted */
    if (node) {
        av_free(node);
        av_free(key);
    }
    return 0;
}
/**
 * Translate an op list with the given uop flags and register every resulting
 * uop in the tree that `ctx->opaque` points to.
 *
 * @return 0 on success, a negative error code if allocation, translation or
 *         registration fails
 */
static int register_flags(SwsContext *ctx, const SwsOpList *ops, SwsUOpFlags flags)
{
    SwsUOpList *uops = ff_sws_uop_list_alloc();
    if (!uops)
        return AVERROR(ENOMEM);

    int ret = ff_sws_ops_translate(ctx, ops, flags, uops);
    if (ret >= 0) {
        struct AVTreeNode **root = ctx->opaque;
        for (int i = 0; ret >= 0 && i < uops->num_ops; i++)
            ret = register_uop(root, &uops->ops[i]);
    }

    /* The temporary list is released on every path */
    ff_sws_uop_list_free(&uops);
    return ret;
}
/* Uop flag combinations for which every op list is translated and registered
 * (see register_uops()) */
static const SwsUOpFlags uop_flags[] = {
0,
SWS_UOP_FLAG_FMA | SWS_UOP_FLAG_MOVE, /* x86 backend */
};
/**
 * Compile callback of the uop-registering backend: registers the uops of
 * `ops` once for every entry of uop_flags[] and produces an empty compiled
 * op.
 *
 * @return 0 on success, or the first negative error from register_flags()
 */
static int register_uops(SwsContext *ctx, const SwsOpList *ops,
                         SwsCompiledOp *out)
{
    int n = 0;

    while (n < FF_ARRAY_ELEMS(uop_flags)) {
        const int err = register_flags(ctx, ops, uop_flags[n++]);
        if (err < 0)
            return err;
    }

    /* dummy value, will be immediately freed */
    *out = (SwsCompiledOp) {0};
    return 0;
}
/* Dummy backend that just registers all seen uops: its compile callback
 * records the uops of each op list instead of producing runnable code */
static const SwsOpBackend backend_uops = {
.name = "uops_gen",
.compile = register_uops,
};
/**
 * Callback passed to ff_sws_enum_op_lists(): runs a copy of `ops` through the
 * uop-registering backend.
 *
 * @return the result of ff_sws_compile_pass(), or AVERROR(ENOMEM) if the list
 *         could not be duplicated
 */
static int register_all_uops(SwsContext *ctx, void *graph, SwsOpList *ops)
{
    /* ff_sws_compile_pass() takes over ownership of `ops` */
    SwsOpList *dup = ff_sws_op_list_duplicate(ops);

    return dup ? ff_sws_compile_pass(graph, &backend_uops, &dup, 0, NULL, NULL)
               : AVERROR(ENOMEM);
}
/* Context flag combinations under which op lists are enumerated when
 * collecting uops (see ff_sws_uops_macros_gen()) */
static const SwsFlags flags[] = {
0,
SWS_ACCURATE_RND, /* may insert extra 1x1 dither ops (for accurate rounding) */
SWS_BITEXACT, /* prevents some FMA optimizations */
SWS_ACCURATE_RND | SWS_BITEXACT,
};
/**
 * Range comparison for av_tree_enumerate(): restricts the enumeration to
 * elements whose pixel type and uop kind both match the search key.
 *
 * @param opaque the search key (a SwsUOp)
 * @param elem   the tree element being tested
 * @return 0 for a match, otherwise the non-zero difference between the
 *         element's and the key's first differing field
 */
static int enum_type(void *opaque, void *elem)
{
    const SwsUOp *want = opaque;
    const SwsUOp *have = elem;

    if (want->type == have->type) {
        if (want->uop == have->uop)
            return 0;
        return (int) have->uop - want->uop;
    }
    return (int) have->type - want->type;
}
/**
 * av_tree_enumerate() callback that frees a tree key.
 *
 * @return always 0
 */
static int free_uop_key(void *opaque, void *key)
{
    (void) opaque; /* unused */
    av_free(key);
    return 0;
}
/**
 * Generate the text of the uops macro header.
 *
 * All op lists are enumerated under every combination in flags[] and run
 * through the uop-registering backend, which collects the unique uops in a
 * tree. Planar read/write uops for all integer pixel types and 1 to 4
 * elements are registered in addition. The tree is then printed as one pair
 * of SWS_FOR_* / SWS_FOR_STRUCT_* macros per (pixel type, uop kind).
 *
 * @param out_str receives the newly allocated header text on success; it is
 *                not written when an error is detected before finalizing
 * @return 0 on success, a negative error code on failure; in particular
 *         AVERROR(ENOMEM) if the output was truncated because the print
 *         buffer could not grow
 */
int ff_sws_uops_macros_gen(char **out_str)
{
    int ret;
    struct AVTreeNode *root = NULL;
    AVBPrint bprint, *const bp = &bprint;
    av_bprint_init(bp, 0, AV_BPRINT_SIZE_UNLIMITED);

    /* Allocate dummy graph and context for ff_sws_compile_pass() */
    SwsGraph *graph = ff_sws_graph_alloc();
    if (!graph)
        return AVERROR(ENOMEM);

    SwsContext *ctx = graph->ctx = sws_alloc_context();
    if (!ctx) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }

    /* Use this to plumb the tree state through all the layers of abstraction */
    ctx->opaque = &root;
    ctx->scaler = SWS_SCALE_BILINEAR; /* cheaper to generate filter kernels */

    /* Register all unique uops over every relevant combination of flags */
    for (int i = 0; i < FF_ARRAY_ELEMS(flags); i++) {
        ctx->flags = flags[i];
        ret = ff_sws_enum_op_lists(ctx, graph, AV_PIX_FMT_NONE, AV_PIX_FMT_NONE,
                                   register_all_uops);
        if (ret < 0)
            goto fail;
    }

    /**
     * Additionally make sure planar reads/writes are always available for all
     * formats, because checkasm depends on them to be able to verify the
     * input/output of any other operations.
     */
    for (enum SwsPixelType type = SWS_PIXEL_NONE+1; type < SWS_PIXEL_TYPE_NB; type++) {
        if (!ff_sws_pixel_type_is_int(type))
            continue;
        for (int elems = 1; elems <= 4; elems++) {
            for (int rw = 0; rw < 2; rw++) {
                SwsUOp uop = {
                    .type = type,
                    .uop = rw ? SWS_UOP_WRITE_PLANAR : SWS_UOP_READ_PLANAR,
                    .mask = SWS_COMP_ELEMS(elems),
                };
                ret = register_uop(&root, &uop);
                if (ret < 0)
                    goto fail;
            }
        }
    }

#define BPRINT_STR(str) av_bprint_append_data(bp, str, strlen(str))
    BPRINT_STR(
        "/**\n"
        " * This file is automatically generated. Do not edit manually.\n"
        " * To regenerate, run: make fate-sws-uops-macros GEN=1\n"
        " */\n"
        "\n"
        "#ifndef SWSCALE_UOPS_MACROS_H\n"
        "#define SWSCALE_UOPS_MACROS_H\n"
        "\n"
        "/**\n"
        " * Boilerplate helper macros, for template-based backends. These will be\n"
        " * instantiated like this, with parameters in struct order:\n"
        " * MACRO(__VA_ARGS__, NAME, UOP, TYPE, MASK, [PARAMS,])\n"
        " * The _STRUCT variants pass all arguments in C struct syntax, while the\n"
        " * plain variants give them as separate C values (e.g. for use in calls)\n"
        " */\n"
        "#define SWS_GLUE3(x, y, z) x ## _ ## y ## _ ## z\n"
        "#define SWS_FOR(TYPE, UOP, MACRO, ...) \\\n"
        " SWS_GLUE3(SWS_FOR, TYPE, UOP)(MACRO, __VA_ARGS__)\n"
        "#define SWS_FOR_STRUCT(TYPE, UOP, MACRO, ...) \\\n"
        " SWS_GLUE3(SWS_FOR_STRUCT, TYPE, UOP)(MACRO, __VA_ARGS__)\n"
        "\n");

    /* The key selects the (type, uop) range to enumerate and also carries
     * the output buffer to the entry generators */
    SwsUOp key = { .data.opaque = bp };
    for (key.type = SWS_PIXEL_NONE + 1; key.type < SWS_PIXEL_TYPE_NB; key.type++) {
        for (key.uop = SWS_UOP_INVALID + 1; key.uop < SWS_UOP_TYPE_NB; key.uop++) {
            const char *macro = uop_names[key.uop].macro;
            const char *prefix = pixel_types[key.type].prefix;
            av_bprintf(bp, "#define SWS_FOR_%s%s(MACRO, ...)", prefix, macro);
            av_tree_enumerate(root, &key, enum_type, generate_entry_args);
            av_bprintf(bp, "\n");
            av_bprintf(bp, "#define SWS_FOR_STRUCT_%s%s(MACRO, ...)", prefix, macro);
            av_tree_enumerate(root, &key, enum_type, generate_entry_struct);
            av_bprintf(bp, "\n");
        }
    }
    BPRINT_STR("\n#endif /* SWSCALE_UOPS_MACROS_H */");

    /* An AVBPrint that failed to grow keeps going with truncated contents;
     * report that as an error instead of returning a partial header */
    if (!av_bprint_is_complete(bp)) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }
    ret = av_bprint_finalize(bp, out_str);

fail:
    av_bprint_finalize(bp, NULL);
    av_tree_enumerate(root, NULL, NULL, free_uop_key);
    av_tree_destroy(root);
    ff_sws_graph_free(&graph);
    sws_free_context(&ctx);
    return ret;
}