ffmpeg/libswscale/uops.c

/**
 * Copyright (C) 2026 Niklas Haas
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdbool.h>

#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/refstruct.h"
#include "libavutil/tree.h"

#include "ops.h"
#include "ops_internal.h"
#include "uops.h"

int ff_sws_uop_cmp(const SwsUOp *a, const SwsUOp *b)
{
    if (a->type != b->type)
        return (int) a->type - b->type;
    if (a->uop != b->uop)
        return (int) a->uop - b->uop;
    if (a->mask != b->mask)
        return (int) a->mask - b->mask;
    return memcmp(&a->par, &b->par, sizeof(a->par));
}

static const struct {
    char full[32];
    char abbr[32];
    char macro[32];
} uop_names[SWS_UOP_TYPE_NB] = {
#define UOP_NAME(OP, ABBR) [SWS_UOP_##OP] = { "SWS_UOP_" #OP, ABBR, #OP }
    UOP_NAME(INVALID,           "invalid"),
    UOP_NAME(READ_PLANAR,       "read_planar"),
    UOP_NAME(READ_PLANAR_FH,    "read_planar_fh"),
    UOP_NAME(READ_PLANAR_FV,    "read_planar_fv"),
    UOP_NAME(READ_PLANAR_FV_FMA,"read_planar_fv_fma"),
    UOP_NAME(READ_PACKED,       "read_packed"),
    UOP_NAME(READ_NIBBLE,       "read_nibble"),
    UOP_NAME(READ_BIT,          "read_bit"),
    UOP_NAME(WRITE_PLANAR,      "write_planar"),
    UOP_NAME(WRITE_PACKED,      "write_packed"),
    UOP_NAME(WRITE_NIBBLE,      "write_nibble"),
    UOP_NAME(WRITE_BIT,         "write_bit"),
    UOP_NAME(PERMUTE,           "permute"),
    UOP_NAME(COPY,              "copy"),
    UOP_NAME(MOVE,              "move"),
    UOP_NAME(SWAP_BYTES,        "swap_bytes"),
    UOP_NAME(EXPAND_BIT,        "expand_bit"),
    UOP_NAME(EXPAND_PAIR,       "expand_pair"),
    UOP_NAME(EXPAND_QUAD,       "expand_quad"),
    UOP_NAME(TO_U8,             "to_u8"),
    UOP_NAME(TO_U16,            "to_u16"),
    UOP_NAME(TO_U32,            "to_u32"),
    UOP_NAME(TO_F32,            "to_f32"),
    UOP_NAME(SCALE,             "scale"),
    UOP_NAME(LINEAR,            "linear"),
    UOP_NAME(LINEAR_FMA,        "linear_fma"),
    UOP_NAME(ADD,               "add"),
    UOP_NAME(MIN,               "min"),
    UOP_NAME(MAX,               "max"),
    UOP_NAME(UNPACK,            "unpack"),
    UOP_NAME(PACK,              "pack"),
    UOP_NAME(LSHIFT,            "lshift"),
    UOP_NAME(RSHIFT,            "rshift"),
    UOP_NAME(CLEAR,             "clear"),
    UOP_NAME(DITHER,            "dither"),
#undef UOP_NAME
};

static const struct {
    char full[16];
    char prefix[8];
} pixel_types[SWS_PIXEL_TYPE_NB] = {
    [SWS_PIXEL_NONE] = { "SWS_PIXEL_NONE", ""     },
    [SWS_PIXEL_U8]   = { "SWS_PIXEL_U8",   "U8_"  },
    [SWS_PIXEL_U16]  = { "SWS_PIXEL_U16",  "U16_" },
    [SWS_PIXEL_U32]  = { "SWS_PIXEL_U32",  "U32_" },
    [SWS_PIXEL_F32]  = { "SWS_PIXEL_F32",  "F32_" },
};

static SwsPixel pixel_from_q(SwsPixelType type, AVRational val)
{
    av_assert1(val.den != 0);
    switch (type) {
    case SWS_PIXEL_U8:  return (SwsPixel) { .u8  = val.num / val.den };
    case SWS_PIXEL_U16: return (SwsPixel) { .u16 = val.num / val.den };
    case SWS_PIXEL_U32: return (SwsPixel) { .u32 = val.num / val.den };
    case SWS_PIXEL_F32: return (SwsPixel) { .f32 = (float) val.num / val.den };
    case SWS_PIXEL_NONE:
    case SWS_PIXEL_TYPE_NB: break;
    }

    av_unreachable("Invalid pixel type!");
    return (SwsPixel) {0};
}

#define Q2PIXEL(val) pixel_from_q(op->type, val)

static bool pixel_is_1s(SwsPixelType type, SwsPixel val)
{
    switch (ff_sws_pixel_type_size(type)) {
    case 1: return val.u8  == UINT8_MAX;
    case 2: return val.u16 == UINT16_MAX;
    case 4: return val.u32 == UINT32_MAX;
    default: break;
    }

    av_unreachable("Invalid pixel type!");
    return false;
}

void ff_sws_uop_name(const SwsUOp *op, char buf[SWS_UOP_NAME_MAX])
{
    AVBPrint bp;
    av_bprint_init_for_buffer(&bp, buf, SWS_UOP_NAME_MAX);

    if (op->type != SWS_PIXEL_NONE)
        av_bprintf(&bp, "%s_", ff_sws_pixel_type_name(op->type));
    av_bprintf(&bp, "%s", uop_names[op->uop].abbr);

    if (op->mask) {
        av_bprint_chars(&bp, '_', 1);
        for (int i = 0; i < 4; i++) {
            if (SWS_COMP_TEST(op->mask, i))
                av_bprint_chars(&bp, "xyzw"[i], 1);
        }
    }

    const SwsUOpParams *par = &op->par;
    switch (op->uop) {
    case SWS_UOP_READ_PLANAR_FH:
    case SWS_UOP_READ_PLANAR_FV:
    case SWS_UOP_READ_PLANAR_FV_FMA:
        av_bprintf(&bp, "_%s", ff_sws_pixel_type_name(par->filter.type));
        break;
    case SWS_UOP_LSHIFT:
    case SWS_UOP_RSHIFT:
        av_bprintf(&bp, "_%u", par->shift.amount);
        break;
    case SWS_UOP_PERMUTE:
    case SWS_UOP_COPY:
        av_bprint_chars(&bp, '_', 1);
        for (int i = 0; i < 4; i++) {
            if (SWS_COMP_TEST(op->mask, i))
                av_bprint_chars(&bp, "xyzw"[par->swizzle.in[i]], 1);
        }
        break;
    case SWS_UOP_MOVE:
        av_bprint_chars(&bp, '_', 1);
        for (int i = 0; i < par->move.num_moves; i++)
            av_bprint_chars(&bp, "txyzw"[par->move.dst[i] + 1], 1);
        av_bprint_chars(&bp, '_', 1);
        for (int i = 0; i < par->move.num_moves; i++)
            av_bprint_chars(&bp, "txyzw"[par->move.src[i] + 1], 1);
        break;
    case SWS_UOP_PACK:
    case SWS_UOP_UNPACK:
        av_bprint_chars(&bp, '_', 1);
        for (int i = 0; i < 4 && par->pack.pattern[i]; i++)
            av_bprintf(&bp, "%x", par->pack.pattern[i]);
        break;
    case SWS_UOP_CLEAR:
        av_bprint_chars(&bp, '_', 1);
        for (int i = 0; i < 4; i++) {
            if (!SWS_COMP_TEST(op->mask, i))
                continue;
            else if (SWS_COMP_TEST(par->clear.one, i))
                av_bprint_chars(&bp, '1', 1);
            else if (SWS_COMP_TEST(par->clear.zero, i))
                av_bprint_chars(&bp, '0', 1);
            else
                av_bprint_chars(&bp, 'x', 1);
        }
        break;
    case SWS_UOP_LINEAR:
    case SWS_UOP_LINEAR_FMA:
        for (int i = 0; i < 4; i++) {
            if (!SWS_COMP_TEST(op->mask, i))
                continue;
            av_bprint_chars(&bp, '_', 1);
            for (int j = 0; j < 5; j++) {
                if (par->lin.one & SWS_MASK(i, j))
                    av_bprint_chars(&bp, '1', 1);
                else if (par->lin.zero & SWS_MASK(i, j))
                    av_bprint_chars(&bp, '0', 1);
                else if (par->lin.exact & SWS_MASK(i, j))
                    av_bprint_chars(&bp, 'X', 1);
                else
                    av_bprint_chars(&bp, 'x', 1);
            }
        }
        break;
    case SWS_UOP_DITHER:
        for (int i = 0; i < 4; i++) {
            if (SWS_COMP_TEST(op->mask, i))
                av_bprintf(&bp, "_%d", par->dither.y_offset[i]);
        }
        const unsigned size = 1u << par->dither.size_log2;
        av_bprintf(&bp, "_%ux%u", size, size);
        break;
    }

    av_assert0(av_bprint_is_complete(&bp));
}

static int generate_entry_struct(void *opaque, void *key)
{
    const SwsUOp *ref = opaque;
    const SwsUOp *uop = key;
    AVBPrint *bp = ref->data.opaque;
    char name[SWS_UOP_NAME_MAX];
    ff_sws_uop_name(uop, name);
    av_bprintf(bp, " \\\n    MACRO(__VA_ARGS__, %-40s", name);
    av_bprintf(bp, ", .type = %-13s, .uop = %-24s, .mask = 0x%x",
               pixel_types[uop->type].full, uop_names[uop->uop].full, uop->mask);

    const SwsUOpParams *par = &uop->par;
    switch (uop->uop) {
    case SWS_UOP_READ_PLANAR_FH:
    case SWS_UOP_READ_PLANAR_FV:
    case SWS_UOP_READ_PLANAR_FV_FMA:
        av_bprintf(bp, ", .par.filter.type = %s", pixel_types[par->filter.type].full);
        break;
    case SWS_UOP_LSHIFT:
    case SWS_UOP_RSHIFT:
        av_bprintf(bp, ", .par.shift.amount = %u", par->shift.amount);
        break;
    case SWS_UOP_PERMUTE:
    case SWS_UOP_COPY:
        av_bprintf(bp, ", .par.swizzle.in = {%d, %d, %d, %d}",
                   par->swizzle.in[0], par->swizzle.in[1],
                   par->swizzle.in[2], par->swizzle.in[3]);
        break;
    case SWS_UOP_MOVE:
        av_bprintf(bp, ", .par.move.num_moves = %d", par->move.num_moves);
        av_bprintf(bp, ", .par.move.dst = {%d, %d, %d, %d, %d, %d}",
                   par->move.dst[0], par->move.dst[1], par->move.dst[2],
                   par->move.dst[3], par->move.dst[4], par->move.dst[5]);
        av_bprintf(bp, ", .par.move.src = {%d, %d, %d, %d, %d, %d}",
                   par->move.src[0], par->move.src[1], par->move.src[2],
                   par->move.src[3], par->move.src[4], par->move.src[5]);
        break;
    case SWS_UOP_PACK:
    case SWS_UOP_UNPACK:
        av_bprintf(bp, ", .par.pack.pattern = {%d, %d, %d, %d}",
                   par->pack.pattern[0], par->pack.pattern[1],
                   par->pack.pattern[2], par->pack.pattern[3]);
        break;
    case SWS_UOP_CLEAR:
        av_bprintf(bp, ", .par.clear.one = 0x%x, .par.clear.zero = 0x%x",
                   par->clear.one, par->clear.zero);
        break;
    case SWS_UOP_LINEAR:
    case SWS_UOP_LINEAR_FMA:
        av_bprintf(bp, ", .par.lin.one = 0x%x, .par.lin.zero = 0x%x",
                   par->lin.one, par->lin.zero);
        if (uop->uop == SWS_UOP_LINEAR_FMA)
            av_bprintf(bp, ", .par.lin.exact = 0x%x", par->lin.exact);
        break;
    case SWS_UOP_DITHER:
        av_bprintf(bp, ", .par.dither = { .y_offset = {%u, %u, %u, %u}, .size_log2 = %u }",
                   par->dither.y_offset[0], par->dither.y_offset[1],
                   par->dither.y_offset[2], par->dither.y_offset[3],
                   par->dither.size_log2);
        break;
    }

    av_bprintf(bp, ")");
    return 0;
}

static int generate_entry_args(void *opaque, void *key)
{
    const SwsUOp *ref = opaque;
    const SwsUOp *uop = key;
    AVBPrint *bp = ref->data.opaque;
    char name[SWS_UOP_NAME_MAX];
    ff_sws_uop_name(uop, name);
    av_bprintf(bp, " \\\n    MACRO(__VA_ARGS__, %-40s, %-13s, %-24s, 0x%x",
               name, pixel_types[uop->type].full, uop_names[uop->uop].full, uop->mask);

    const SwsUOpParams *par = &uop->par;
    switch (uop->uop) {
    case SWS_UOP_READ_PLANAR_FH:
    case SWS_UOP_READ_PLANAR_FV:
    case SWS_UOP_READ_PLANAR_FV_FMA:
        av_bprintf(bp, ", %s", pixel_types[par->filter.type].full);
        break;
    case SWS_UOP_LSHIFT:
    case SWS_UOP_RSHIFT:
        av_bprintf(bp, ", %u", par->shift.amount);
        break;
    case SWS_UOP_PERMUTE:
    case SWS_UOP_COPY:
        av_bprintf(bp, ", %d, %d, %d, %d",
                   par->swizzle.in[0], par->swizzle.in[1],
                   par->swizzle.in[2], par->swizzle.in[3]);
        break;
    case SWS_UOP_MOVE:
        av_bprintf(bp, ", %d", par->move.num_moves);
        av_bprintf(bp, ", %d, %d, %d, %d, %d, %d",
                   par->move.dst[0], par->move.dst[1], par->move.dst[2],
                   par->move.dst[3], par->move.dst[4], par->move.dst[5]);
        av_bprintf(bp, ", %d, %d, %d, %d, %d, %d",
                   par->move.src[0], par->move.src[1], par->move.src[2],
                   par->move.src[3], par->move.src[4], par->move.src[5]);
        break;
    case SWS_UOP_PACK:
    case SWS_UOP_UNPACK:
        av_bprintf(bp, ", %d, %d, %d, %d",
                   par->pack.pattern[0], par->pack.pattern[1],
                   par->pack.pattern[2], par->pack.pattern[3]);
        break;
    case SWS_UOP_CLEAR:
        av_bprintf(bp, ", 0x%05x, 0x%05x", par->clear.one, par->clear.zero);
        break;
    case SWS_UOP_LINEAR:
    case SWS_UOP_LINEAR_FMA:
        av_bprintf(bp, ", 0x%05x, 0x%05x", par->lin.one, par->lin.zero);
        if (uop->uop == SWS_UOP_LINEAR_FMA)
            av_bprintf(bp, ", 0x%05x", par->lin.exact);
        break;
    case SWS_UOP_DITHER:
        av_bprintf(bp, ", %u, %u, %u, %u, %u",
                   par->dither.y_offset[0], par->dither.y_offset[1],
                   par->dither.y_offset[2], par->dither.y_offset[3],
                   par->dither.size_log2);
        break;
    }

    av_bprintf(bp, ")");
    return 0;
}

static void uop_uninit(SwsUOp *uop)
{
    switch (uop->uop) {
    case SWS_UOP_DITHER:
        av_refstruct_unref(&uop->data.ptr);
        break;
    case SWS_UOP_READ_PLANAR_FH:
    case SWS_UOP_READ_PLANAR_FV:
    case SWS_UOP_READ_PLANAR_FV_FMA:
        av_refstruct_unref(&uop->data.kernel);
        break;
    }

    *uop = (SwsUOp) {0};
}

void ff_sws_uop_list_free(SwsUOpList **p_ops)
{
    SwsUOpList *ops = *p_ops;
    if (!ops)
        return;

    for (int i = 0; i < ops->num_ops; i++)
        uop_uninit(&ops->ops[i]);

    av_freep(&ops->ops);
    av_free(ops);
    *p_ops = NULL;
}

SwsUOpList *ff_sws_uop_list_alloc(void)
{
    return av_mallocz(sizeof(SwsUOpList));
}

int ff_sws_uop_list_append(SwsUOpList *uops, SwsUOp *uop)
{
    if (!av_dynarray2_add((void **) &uops->ops, &uops->num_ops,
                          sizeof(*uop), (uint8_t *) uop))
    {
        uop_uninit(uop);
        return AVERROR(ENOMEM);
    }

    *uop = (SwsUOp) {0};
    return 0;
}

int ff_sws_dither_height(const SwsDitherUOp *dither)
{
    int max_offset = 0;
    for (int i = 0; i < 4; i++)
        max_offset = FFMAX(max_offset, dither->y_offset[i]);
    return (1 << dither->size_log2) + max_offset;
}

static SwsPixelType pixel_type_to_int(const SwsPixelType type)
{
    switch (ff_sws_pixel_type_size(type)) {
    case 1: return SWS_PIXEL_U8;
    case 2: return SWS_PIXEL_U16;
    case 4: return SWS_PIXEL_U32;
    default: break;
    }

    av_unreachable("Invalid pixel type!");
    return SWS_PIXEL_NONE;
}

static bool exact_product_f32(float a, float b)
{
    volatile float prod   = a * b;
    volatile float result = b ? prod / b : 0.0f;
    return !b || result == a;
}

static bool exact_prod(SwsPixelType type, SwsPixel coef,
                       const SwsComps *comps, int idx)
{
    const AVRational minq = comps->min[idx];
    const AVRational maxq = comps->max[idx];
    if (ff_sws_pixel_type_is_int(type))
        return true;
    else if (!minq.den || !maxq.den)
        return false; /* unknown bounds */

    const SwsPixel min = pixel_from_q(type, minq);
    const SwsPixel max = pixel_from_q(type, maxq);
    switch (type) {
    case SWS_PIXEL_F32:
        return exact_product_f32(coef.f32, min.f32) &&
               exact_product_f32(coef.f32, max.f32);
    }

    av_unreachable("Invalid pixel type!");
    return false;
}

static bool check_filter_fma(SwsContext *ctx, SwsUOpFlags flags, const SwsOp *op)
{
    if (!(flags & SWS_UOP_FLAG_FMA))
        return false;
    if (!(ctx->flags & SWS_BITEXACT))
        return true;
    if (!ff_sws_pixel_type_is_int(op->type))
        return false;

    const int bits = ff_sws_pixel_type_size(op->type) * 8;
    const uint64_t max_val = UINT64_MAX >> (64 - bits);

    /* Maximum value representable losslessly as float. Note that this is
     * currently true only for U8, but that may change if we ever update the
     * value of SWS_FILTER_SCALE. */
    return max_val * SWS_FILTER_SCALE <= (1 << 22);
}

static int translate_rw_op(SwsContext *ctx, SwsUOpList *ops, SwsUOpFlags flags,
                           const SwsOp *op)
{
    SwsUOp uop = {
        .type = op->type,
        .mask = SWS_COMP_MASK(op->rw.elems > 0, op->rw.elems > 1,
                              op->rw.elems > 2, op->rw.elems > 3),
    };

    /* Non-filtered reads don't care about the exact pixel contents */
    if (!op->rw.filter.op)
        uop.type = pixel_type_to_int(op->type);

    const bool is_read = op->op == SWS_OP_READ;
    if (op->rw.filter.op) {
        if (op->op == SWS_OP_WRITE || op->rw.frac || op->rw.mode != SWS_RW_PLANAR)
            return AVERROR(ENOTSUP);
        uop.par.filter.type = op->rw.filter.type;
        uop.data.kernel = av_refstruct_ref(op->rw.filter.kernel);
        if (op->rw.filter.op == SWS_OP_FILTER_H) {
            uop.uop = SWS_UOP_READ_PLANAR_FH;
        } else if (check_filter_fma(ctx, flags, op)) {
            uop.uop = SWS_UOP_READ_PLANAR_FV_FMA;
        } else {
            uop.uop = SWS_UOP_READ_PLANAR_FV;
        }
    } else if (op->rw.mode == SWS_RW_PACKED && op->rw.elems > 1) {
        if (op->rw.frac)
            return AVERROR(ENOTSUP);
        uop.uop = is_read ? SWS_UOP_READ_PACKED : SWS_UOP_WRITE_PACKED;
    } else if (op->rw.frac == 3) {
        uop.uop = is_read ? SWS_UOP_READ_BIT : SWS_UOP_WRITE_BIT;
    } else if (op->rw.frac == 1) {
        uop.uop = is_read ? SWS_UOP_READ_NIBBLE : SWS_UOP_WRITE_NIBBLE;
    } else {
        av_assert0(!op->rw.frac);
        uop.uop = is_read ? SWS_UOP_READ_PLANAR : SWS_UOP_WRITE_PLANAR;
    }

    return ff_sws_uop_list_append(ops, &uop);
}

static int count_idx(const int *arr, size_t size, int val)
{
    int num = 0;
    for (size_t i = 0; i < size; i++) {
        if (arr[i] == val)
            num++;
    }

    return num;
}

static int translate_move(SwsUOpList *ops, const SwsOp *op)
{
    SwsUOp uop = {
        .uop  = SWS_UOP_MOVE,
        .type = pixel_type_to_int(op->type),
    };
    SwsMoveUOp *par = &uop.par.move;

    /* Mask of components that are not yet satisfied */
    SwsCompMask todo = ff_sws_comp_mask_needed(op);
    for (int i = 0; i < 4; i++) {
        if (op->swizzle.in[i] == i)
            todo &= ~SWS_COMP(i);
    }

    /* Mask of components whose value is required for the final output */
    SwsCompMask needed = 0;
    for (int i = 0; i < 4; i++) {
        if (SWS_OP_NEEDED(op, i))
            needed |= SWS_COMP(op->swizzle.in[i]);
    }

    /* Current mapping of registers to components */
    int idx[4 + 1] = { 0, 1, 2, 3, -1 }; /* +1 for tmp */

    /* Decompose the swizzle mask into a series of register-register moves */
    while (todo) {
        int dst = -1, src = -1;

        /* Find next unsatisfied dst <- src move that doesn't clobber a value */
        for (dst = 0; dst < 4; dst++) {
            if (!SWS_COMP_TEST(todo, dst))
                continue; /* already satisfied */
            const int cur = idx[dst];
            if (count_idx(idx, FF_ARRAY_ELEMS(idx), cur) == 1 && SWS_COMP_TEST(needed, cur))
                continue; /* clobbers last remaining, still-needed value */
            for (src = 0; src < FF_ARRAY_ELEMS(idx); src++) {
                if (idx[src] == op->swizzle.in[dst]) {
                    /* Prevent read-after-write dependency. */
                    if (par->num_moves > 0 && src == par->dst[par->num_moves - 1])
                        src = par->src[par->num_moves - 1];
                    break;
                }
            }
            av_assert1(src < FF_ARRAY_ELEMS(idx));
            todo &= ~SWS_COMP(dst);
            break;
        }

        if (dst == 4) {
            /* Stuck in a cycle, break it by saving to the scratch register */
            dst = 4;
            for (src = 0; src < 4; src++) {
                if (SWS_COMP_TEST(todo, src)) {
                    needed &= ~SWS_COMP(idx[src]);
                    break;
                }
            }
            av_assert1(src < 4);
        }

        av_assert0(par->num_moves < SWS_UOP_MOVE_MAX);
        par->dst[par->num_moves] = dst > 3 ? -1 : dst;
        par->src[par->num_moves] = src > 3 ? -1 : src;
        par->num_moves++;
        idx[dst] = idx[src];
    }

    return ff_sws_uop_list_append(ops, &uop);
}

static int translate_swizzle(SwsUOpList *ops, SwsUOpFlags flags, const SwsOp *op)
{
    if (flags & SWS_UOP_FLAG_MOVE)
        return translate_move(ops, op);

    SwsUOp uop = {
        .type = pixel_type_to_int(op->type),
        .uop  = SWS_UOP_PERMUTE,
        .mask = ff_sws_comp_mask_needed(op),
        .par.swizzle.in = {0, 1, 2, 3},
    };

    SwsCompMask seen = 0;
    for (int i = 0; i < 4; i++) {
        if (!SWS_COMP_TEST(uop.mask, i))
            continue;
        const int src = op->swizzle.in[i];
        if (SWS_COMP_TEST(seen, src))
            uop.uop = SWS_UOP_COPY; /* Swizzle mask contains duplicates */
        seen |= SWS_COMP(src);
        uop.par.swizzle.in[i] = src;
    }

    if (uop.uop == SWS_UOP_PERMUTE) {
        /* Prevent overlap by moving unused components to unseen indices */
        for (int i = 0; i < 4; i++) {
            if (SWS_COMP_TEST(uop.mask, i))
                continue;

            /* Prefer identity mapping if possible */
            int unused = i;
            if (SWS_COMP_TEST(seen, i)) {
                for (int j = 0; j < 4; j++) {
                    if (!SWS_COMP_TEST(seen, j)) {
                        unused = j;
                        break;
                    }
                }
            }

            uop.par.swizzle.in[i] = unused;
            seen |= SWS_COMP(unused);
        }
    }

    /* Remove remaining trivial / identity components from the mask */
    for (int i = 0; i < 4; i++) {
        if (uop.par.swizzle.in[i] == i)
            uop.mask &= ~SWS_COMP(i);
    }

    return ff_sws_uop_list_append(ops, &uop);
}

static int translate_dither_op(SwsUOpList *ops, const SwsOp *op)
{
    SwsUOp uop = {
        .type = op->type,
        .uop  = SWS_UOP_DITHER,
        .par.dither.size_log2 = op->dither.size_log2,
    };

    if (op->dither.size_log2 == 0) {
        /* Constant offset */
        const SwsPixel val = Q2PIXEL(op->dither.matrix[0]);
        uop.uop = SWS_UOP_ADD;
        for (int i = 0; i < 4; i++) {
            if (!SWS_OP_NEEDED(op, i) || op->dither.y_offset[i] < 0)
                continue;
            uop.mask |= SWS_COMP(i);
            uop.data.vec4[i] = val;
        }

        return ff_sws_uop_list_append(ops, &uop);
    }

    const int size = 1 << op->dither.size_log2;
    for (int i = 0; i < 4; i++) {
        if (!SWS_OP_NEEDED(op, i) || op->dither.y_offset[i] < 0)
            continue;
        const uint8_t off = op->dither.y_offset[i] & (size - 1);
        uop.mask |= SWS_COMP(i);
        uop.par.dither.y_offset[i] = off;
    }

    /* Allocate extra rows to allow over-reading for row offsets. Note that
     * y_offset is currently never larger than 5, so the extra space needed
     * for this over-allocation is bounded by 5 * size * sizeof(float),
     * typically 320 bytes for a 16x16 dither matrix. */
    const int stride   = size * sizeof(SwsPixel);
    const int num_rows = ff_sws_dither_height(&uop.par.dither);
    SwsPixel *matrix = uop.data.ptr = av_refstruct_allocz(num_rows * stride);
    if (!matrix)
        return AVERROR(ENOMEM);

    for (int i = 0; i < size * size; i++)
        matrix[i] = Q2PIXEL(op->dither.matrix[i]);
    memcpy(&matrix[size * size], matrix, (num_rows - size) * stride);

    return ff_sws_uop_list_append(ops, &uop);
}

static int translate_linear_op(SwsContext *ctx, SwsUOpList *ops,
                               SwsUOpFlags flags, const SwsOp *op,
                               const SwsComps *input)
{
    SwsUOp uop = {
        .type = op->type,
        .uop  = SWS_UOP_LINEAR,
    };

    const bool bitexact = ctx->flags & SWS_BITEXACT;
    uint32_t exact = 0;

    for (int i = 0; i < 4; i++) {
        if (SWS_OP_NEEDED(op, i) && (op->lin.mask & SWS_MASK_ROW(i)))
            uop.mask |= SWS_COMP(i);
        for (int j = 0; j < 5; j++) {
            const AVRational k = op->lin.m[i][j];
            const SwsPixel px = Q2PIXEL(k);
            uop.data.mat4[i][j] = px;
            if (k.num == 0)
                uop.par.lin.zero |= SWS_MASK(i, j);
            else if (k.num == k.den)
                uop.par.lin.one |= SWS_MASK(i, j);
            else if (j < 4 && (!bitexact || exact_prod(uop.type, px, input, j)))
                exact |= SWS_MASK(i, j);
        }
    }

    if (flags & SWS_UOP_FLAG_FMA) {
        /* multiplication by 1 and 0 are always exact by definition */
        uop.uop = SWS_UOP_LINEAR_FMA;
        uop.par.lin.exact = exact | uop.par.lin.zero | uop.par.lin.one;
    }

    return ff_sws_uop_list_append(ops, &uop);
}

static bool is_expand_bit(SwsPixelType type, AVRational factor)
{
    if (factor.den != 1)
        return false;

    switch (type) {
    case SWS_PIXEL_U8:  return factor.num == UINT8_MAX;
    case SWS_PIXEL_U16: return factor.num == UINT16_MAX;
    case SWS_PIXEL_U32: return factor.num == UINT32_MAX;
    case SWS_PIXEL_F32: return false;
    case SWS_PIXEL_NONE:
    case SWS_PIXEL_TYPE_NB: break;
    }

    av_unreachable("Invalid pixel type!");
    return false;
}

static int translate_op(SwsContext *ctx, SwsUOpList *uops, SwsUOpFlags flags,
                        const SwsOp *op, const SwsComps *input)
{
    switch (op->op) {
    case SWS_OP_FILTER_H:
    case SWS_OP_FILTER_V:
        return AVERROR(ENOTSUP); /* always handled by subpass splitting */
    case SWS_OP_READ:
    case SWS_OP_WRITE:
        return translate_rw_op(ctx, uops, flags, op);
    case SWS_OP_SWIZZLE:
        return translate_swizzle(uops, flags, op);
    case SWS_OP_DITHER:
        return translate_dither_op(uops, op);
    case SWS_OP_LINEAR:
        return translate_linear_op(ctx, uops, flags, op, input);
    default:
        break;
    }

    /* Default handling for "simple" ops */
    SwsUOp uop = {
        .type = op->type,
        .uop  = SWS_UOP_INVALID,
        .mask = ff_sws_comp_mask_needed(op),
    };

    switch (op->op) {
    case SWS_OP_CONVERT:
        if (op->convert.expand) {
            av_assert0(op->type == SWS_PIXEL_U8);
            switch (op->convert.to) {
            case SWS_PIXEL_U16: uop.uop = SWS_UOP_EXPAND_PAIR; break;
            case SWS_PIXEL_U32: uop.uop = SWS_UOP_EXPAND_QUAD; break;
            }
        } else {
            switch (op->convert.to) {
            case SWS_PIXEL_U8:  uop.uop = SWS_UOP_TO_U8;  break;
            case SWS_PIXEL_U16: uop.uop = SWS_UOP_TO_U16; break;
            case SWS_PIXEL_U32: uop.uop = SWS_UOP_TO_U32; break;
            case SWS_PIXEL_F32: uop.uop = SWS_UOP_TO_F32; break;
            }
        }
        break;
    case SWS_OP_UNPACK:
    case SWS_OP_PACK:
        uop.uop = op->op == SWS_OP_PACK ? SWS_UOP_PACK : SWS_UOP_UNPACK;
        uop.mask = 0;
        for (int i = 0; i < 4 && op->pack.pattern[i]; i++) {
            uop.par.pack.pattern[i] = op->pack.pattern[i];
            uop.mask |= SWS_COMP(i);
        }
        break;
    case SWS_OP_LSHIFT:
    case SWS_OP_RSHIFT:
        uop.uop = op->op == SWS_OP_LSHIFT ? SWS_UOP_LSHIFT : SWS_UOP_RSHIFT;
        uop.par.shift.amount = op->shift.amount;
        break;
    case SWS_OP_CLEAR:
        uop.uop = SWS_UOP_CLEAR;
        uop.type = pixel_type_to_int(op->type);
        uop.mask &= op->clear.mask;
        for (int i = 0; i < 4; i++) {
            if (!SWS_COMP_TEST(op->clear.mask, i))
                continue;
            const AVRational v = op->clear.value[i];
            const SwsPixel px = Q2PIXEL(op->clear.value[i]);
            uop.data.vec4[i] = px;
            if (v.num == 0)
                uop.par.clear.zero |= SWS_COMP(i);
            else if (pixel_is_1s(op->type, px))
                uop.par.clear.one |= SWS_COMP(i);
        }
        break;
    case SWS_OP_SCALE:
        if (is_expand_bit(op->type, op->scale.factor)) {
            uop.uop = SWS_UOP_EXPAND_BIT;
        } else {
            uop.uop = SWS_UOP_SCALE;
            uop.data.scalar = Q2PIXEL(op->scale.factor);
        }
        break;
    case SWS_OP_MIN:
    case SWS_OP_MAX:
        uop.uop = op->op == SWS_OP_MIN ? SWS_UOP_MIN : SWS_UOP_MAX;
        uop.mask &= ff_sws_comp_mask_q4(op->clamp.limit);
        for (int i = 0; i < 4; i++) {
            if (SWS_COMP_TEST(uop.mask, i))
                uop.data.vec4[i] = Q2PIXEL(op->clamp.limit[i]);
        }
        break;
    case SWS_OP_SWAP_BYTES:
        uop.uop = SWS_UOP_SWAP_BYTES;
        uop.type = pixel_type_to_int(op->type);
        break;
    default:
        return AVERROR(ENOTSUP);
    }

    av_assert0(uop.uop != SWS_UOP_INVALID);
    return ff_sws_uop_list_append(uops, &uop);
}

int ff_sws_ops_translate(SwsContext *ctx, const SwsOpList *ops,
                         SwsUOpFlags flags, SwsUOpList *uops)
{
    SwsComps input = ops->comps_src;
    for (int i = 0; i < ops->num_ops; i++) {
        int ret = translate_op(ctx, uops, flags, &ops->ops[i], &input);
        if (ret < 0)
            return ret;
        input = ops->ops[i].comps;
    }
    return 0;
}

static int register_uop(struct AVTreeNode **root, const SwsUOp *uop)
{
    SwsUOp *key = av_memdup(uop, sizeof(*uop));
    if (!key)
        return AVERROR(ENOMEM);
    memset(&key->data, 0, sizeof(key->data));

    struct AVTreeNode *node = av_tree_node_alloc();
    if (!node) {
        av_free(key);
        return AVERROR(ENOMEM);
    }

    av_tree_insert(root, key, ff_sws_uop_cmp_v, &node);
    if (node) {
        av_free(node);
        av_free(key);
    }
    return 0;
}

static int register_flags(SwsContext *ctx, const SwsOpList *ops, SwsUOpFlags flags)
{
    SwsUOpList *uops = ff_sws_uop_list_alloc();
    if (!uops)
        return AVERROR(ENOMEM);

    int ret = ff_sws_ops_translate(ctx, ops, flags, uops);
    if (ret < 0)
        goto fail;

    struct AVTreeNode **root = ctx->opaque;
    for (int i = 0; i < uops->num_ops; i++) {
        ret = register_uop(root, &uops->ops[i]);
        if (ret < 0)
            goto fail;
    }

fail:
    ff_sws_uop_list_free(&uops);
    return ret;
}

static const SwsUOpFlags uop_flags[] = {
    0,
    SWS_UOP_FLAG_FMA | SWS_UOP_FLAG_MOVE, /* x86 backend */
};

static int register_uops(SwsContext *ctx, const SwsOpList *ops,
                         SwsCompiledOp *out)
{
    for (int i = 0; i < FF_ARRAY_ELEMS(uop_flags); i++) {
        int ret = register_flags(ctx, ops, uop_flags[i]);
        if (ret < 0)
            return ret;
    }

    *out = (SwsCompiledOp) {0}; /* dummy value, will be immediately freed */
    return 0;
}

/* Dummy backend that just registers all seen uops */
static const SwsOpBackend backend_uops = {
    .name    = "uops_gen",
    .compile = register_uops,
};

static int register_all_uops(SwsContext *ctx, void *graph, SwsOpList *ops)
{
    /* ff_sws_compile_pass() takes over ownership of `ops` */
    SwsOpList *copy = ff_sws_op_list_duplicate(ops);
    if (!copy)
        return AVERROR(ENOMEM);

    return ff_sws_compile_pass(graph, &backend_uops, &copy, 0, NULL, NULL);
}

static const SwsFlags flags[] = {
    0,
    SWS_ACCURATE_RND,   /* may insert extra 1x1 dither ops (for accurate rounding) */
    SWS_BITEXACT,       /* prevents some FMA optimizations */
    SWS_ACCURATE_RND | SWS_BITEXACT,
};

/* Limit the range of av_tree_enumerate() to only matching uop and type */
static int enum_type(void *opaque, void *elem)
{
    const SwsUOp *a = opaque, *b = elem;
    if (a->type != b->type)
        return (int) b->type - a->type;
    if (a->uop != b->uop)
        return (int) b->uop - a->uop;
    return 0;
}

static int free_uop_key(void *opaque, void *key)
{
    av_free(key);
    return 0;
}

int ff_sws_uops_macros_gen(char **out_str)
{
    int ret;
    struct AVTreeNode *root = NULL;

    AVBPrint bprint, *const bp = &bprint;
    av_bprint_init(bp, 0, AV_BPRINT_SIZE_UNLIMITED);

    /* Allocate dummy graph and context for ff_sws_compile_pass() */
    SwsGraph *graph = ff_sws_graph_alloc();
    if (!graph)
        return AVERROR(ENOMEM);

    SwsContext *ctx = graph->ctx = sws_alloc_context();
    if (!ctx) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }

    /* Use this to plumb the tree state through all the layers of abstraction */
    ctx->opaque = &root;
    ctx->scaler = SWS_SCALE_BILINEAR; /* cheaper to generate filter kernels */

    /* Register all unique uops over every relevant combination of flags */
    for (int i = 0; i < FF_ARRAY_ELEMS(flags); i++) {
        ctx->flags = flags[i];
        ret = ff_sws_enum_op_lists(ctx, graph, AV_PIX_FMT_NONE, AV_PIX_FMT_NONE,
                                   register_all_uops);
        if (ret < 0)
            goto fail;
    }

    /**
     * Additionally make sure planar reads/writes are always available for all
     * formats, because checkasm depends on them to be able to verify the
     * input/output of any other operations.
     */
    for (enum SwsPixelType type = SWS_PIXEL_NONE+1; type < SWS_PIXEL_TYPE_NB; type++) {
        if (!ff_sws_pixel_type_is_int(type))
            continue;
        for (int elems = 1; elems <= 4; elems++) {
            for (int rw = 0; rw < 2; rw++) {
                SwsUOp uop = {
                    .type = type,
                    .uop  = rw ? SWS_UOP_WRITE_PLANAR : SWS_UOP_READ_PLANAR,
                    .mask = SWS_COMP_ELEMS(elems),
                };

                ret = register_uop(&root, &uop);
                if (ret < 0)
                    goto fail;
            }
        }
    }

    #define BPRINT_STR(str) av_bprint_append_data(bp, str, strlen(str))
    BPRINT_STR(
"/**\n"
" * This file is automatically generated. Do not edit manually.\n"
" * To regenerate, run: make fate-sws-uops-macros GEN=1\n"
" */\n"
"\n"
"#ifndef SWSCALE_UOPS_MACROS_H\n"
"#define SWSCALE_UOPS_MACROS_H\n"
"\n"
"/**\n"
" * Boilerplate helper macros, for template-based backends. These will be\n"
" * instantiated like this, with parameters in struct order:\n"
" *   MACRO(__VA_ARGS__, NAME, UOP, TYPE, MASK, [PARAMS,])\n"
" * The _STRUCT variants pass all arguments in C struct syntax, while the\n"
" * plain variants give them as separate C values (e.g. for use in calls)\n"
" */\n"
"#define SWS_GLUE3(x, y, z) x ## _ ## y ## _ ## z\n"
"#define SWS_FOR(TYPE, UOP, MACRO, ...) \\\n"
"    SWS_GLUE3(SWS_FOR, TYPE, UOP)(MACRO, __VA_ARGS__)\n"
"#define SWS_FOR_STRUCT(TYPE, UOP, MACRO, ...) \\\n"
"    SWS_GLUE3(SWS_FOR_STRUCT, TYPE, UOP)(MACRO, __VA_ARGS__)\n"
"\n");

    SwsUOp key = { .data.opaque = bp };
    for (key.type = SWS_PIXEL_NONE + 1; key.type < SWS_PIXEL_TYPE_NB; key.type++) {
        for (key.uop = SWS_UOP_INVALID + 1; key.uop < SWS_UOP_TYPE_NB; key.uop++) {
            const char *macro  = uop_names[key.uop].macro;
            const char *prefix = pixel_types[key.type].prefix;
            av_bprintf(bp, "#define SWS_FOR_%s%s(MACRO, ...)", prefix, macro);
            av_tree_enumerate(root, &key, enum_type, generate_entry_args);
            av_bprintf(bp, "\n");
            av_bprintf(bp, "#define SWS_FOR_STRUCT_%s%s(MACRO, ...)", prefix, macro);
            av_tree_enumerate(root, &key, enum_type, generate_entry_struct);
            av_bprintf(bp, "\n");
        }
    }

    BPRINT_STR("\n#endif /* SWSCALE_UOPS_MACROS_H */");
    ret = av_bprint_finalize(bp, out_str);

fail:
    av_bprint_finalize(bp, NULL);
    av_tree_enumerate(root, NULL, NULL, free_uop_key);
    av_tree_destroy(root);
    ff_sws_graph_free(&graph);
    sws_free_context(&ctx);
    return ret;
}