ffmpeg/libswscale/uops.h

/**
 * Copyright (C) 2025 Niklas Haas
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef SWSCALE_UOPS_H
#define SWSCALE_UOPS_H

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/***************************************************************************
 * Note: This header must be usable at build time, to generate asm sources *
 ***************************************************************************/

#include "libavutil/attributes.h"

/*
 * Forward declarations of types that are not defined in this header. They are
 * only referred to through pointers here, so their full definitions are not
 * needed.
 */
typedef struct SwsContext       SwsContext;
typedef struct SwsFilterWeights SwsFilterWeights;
typedef struct SwsOpList        SwsOpList;

/**
 * Enumeration of pixel value types. The U8 / U16 / U32 / F32 values each have
 * an equally named member (u8 / u16 / u32 / f32) in the SwsPixel union.
 */
typedef enum SwsPixelType {
    SWS_PIXEL_NONE = 0,
    SWS_PIXEL_U8,
    SWS_PIXEL_U16,
    SWS_PIXEL_U32,
    SWS_PIXEL_F32,
    SWS_PIXEL_TYPE_NB /* number of values above, including SWS_PIXEL_NONE */
} SwsPixelType;

/*
 * Queries on a SwsPixelType; the implementations live outside this header.
 * NOTE(review): judging by the names, these return a printable name, the
 * size of the type (presumably in bytes) and whether the type is an integer
 * type -- confirm against the implementation.
 */
const char *ff_sws_pixel_type_name(SwsPixelType type);
int ff_sws_pixel_type_size(SwsPixelType type) av_const;
bool ff_sws_pixel_type_is_int(SwsPixelType type) av_const;

/**
 * A single value that can be accessed either as raw bytes (`data`) or through
 * one of the typed members.
 */
typedef union SwsPixel {
    /* Keep this as the first member: `(SwsPixel) {0}` initializes the first
     * member of the union, and this one covers all of its bytes (see the
     * static_assert below). */
    char data[4];

    uint8_t  u8;
    uint16_t u16;
    uint32_t u32;
    float    f32;
} SwsPixel;

/* Ensures (SwsPixel) {0} is properly initialized to all zeros: the union must
 * be exactly as large as its first member `data`, so that zero-initializing
 * `data` zeroes every byte of the union. */
static_assert(sizeof(SwsPixel) == sizeof(char[4]), "SwsPixel size mismatch");

/**
 * Bit-mask of components. Exact meaning depends on the usage context.
 *
 * Bit X of the mask stands for the component with index X (0..3).
 */
typedef uint8_t SwsCompMask;

enum {
    SWS_COMP_NONE = 0,   /* no component selected */
    SWS_COMP_ALL  = 0xF, /* all four components selected */
};

/* Mask containing only the component with index X */
#define SWS_COMP(X) (1 << (X))

/* Evaluates to 1 if component X is set in `mask`, otherwise to 0 */
#define SWS_COMP_TEST(mask, X) (((mask) & SWS_COMP(X)) != 0)

/* Flips the four component bits of `mask` */
#define SWS_COMP_INV(mask) (SWS_COMP_ALL ^ (mask))

/* Mask of the first N components (indices 0 .. N-1) */
#define SWS_COMP_ELEMS(N) ((1 << (N)) - 1)

/* Builds a mask from four truth values, one per component index 0..3 */
#define SWS_COMP_MASK(X, Y, Z, W)   \
    ((!!(X) << 0) |                 \
     (!!(Y) << 1) |                 \
     (!!(Z) << 2) |                 \
     (!!(W) << 3))

/* Integer type for SwsUOpFlagBits values. Each flag below occupies its own
 * bit, so several flags can presumably be OR-ed together -- confirm usage. */
typedef uint32_t SwsUOpFlags;
typedef enum SwsUOpFlagBits {
    SWS_UOP_FLAG_NONE = 0,
    SWS_UOP_FLAG_FMA  = (1 << 0), /* platform supports FMA ops */
    SWS_UOP_FLAG_MOVE = (1 << 1), /* platform supports SWS_UOP_MOVE */
} SwsUOpFlagBits;

/**
 * Kind of micro-op (uop). The per-group comments below describe how the
 * component `mask` is to be interpreted for that group of uops.
 * NOTE(review): `mask` presumably refers to the SwsCompMask field of SwsUOp
 * -- confirm.
 */
typedef enum SwsUOpType {
    SWS_UOP_INVALID = 0,

    /* Read/write uops; mask = components to read/write */
    SWS_UOP_READ_PLANAR,     /* simple planar byte-aligned read */
    SWS_UOP_READ_PLANAR_FH,  /* planar read with horizontal filter */
    SWS_UOP_READ_PLANAR_FV,  /* planar read with vertical filter */
    SWS_UOP_READ_PLANAR_FV_FMA, /* presumably the FMA variant of the above
                                 * (cf. SWS_UOP_FLAG_FMA) -- confirm */
    SWS_UOP_READ_PACKED,     /* simple packed byte-aligned read */
    SWS_UOP_READ_NIBBLE,     /* fractional read (4 bits) from single plane */
    SWS_UOP_READ_BIT,        /* fractional read (1 bit) from single plane */

    SWS_UOP_WRITE_PLANAR,    /* simple planar byte-aligned write */
    SWS_UOP_WRITE_PACKED,    /* simple packed byte-aligned write */
    SWS_UOP_WRITE_NIBBLE,    /* fractional write (4 bits) to single plane */
    SWS_UOP_WRITE_BIT,       /* fractional write (1 bit) to single plane */

    /* Data rearrangement uops; mask = non-trivial and needed components */
    SWS_UOP_PERMUTE,         /* rearrange components (no duplicates) */
    SWS_UOP_COPY,            /* copy/duplicate components */
    SWS_UOP_MOVE,            /* series of register-register assignments */

    /* Data conversion / manipulation uops; mask = affected components */
    SWS_UOP_SWAP_BYTES,      /* swap byte order in components */
    SWS_UOP_EXPAND_BIT,      /* expand low-order bit to all bits in type */
    SWS_UOP_EXPAND_PAIR,     /* expand bytes in pairs (16 bit) */
    SWS_UOP_EXPAND_QUAD,     /* expand bytes in quads (32 bit) */
    SWS_UOP_TO_U8,           /* cast pixel values to SWS_PIXEL_U8  */
    SWS_UOP_TO_U16,          /* cast pixel values to SWS_PIXEL_U16 */
    SWS_UOP_TO_U32,          /* cast pixel values to SWS_PIXEL_U32 */
    SWS_UOP_TO_F32,          /* cast pixel values to SWS_PIXEL_F32 */

    /* Arithmetic uops */
    SWS_UOP_SCALE,           /* multiply masked components by scalar */
    SWS_UOP_ADD,             /* add vec4 to masked components */
    SWS_UOP_MIN,             /* min(x, vec4) on masked components */
    SWS_UOP_MAX,             /* max(x, vec4) on masked components */

    /* Identical to corresponding SwsOpType */
    SWS_UOP_UNPACK,          /* mask = nonzero components in pack pattern */
    SWS_UOP_PACK,            /* mask = nonzero components in pack pattern */
    SWS_UOP_LSHIFT,          /* mask = components to shift */
    SWS_UOP_RSHIFT,          /* mask = components to shift */
    SWS_UOP_CLEAR,           /* mask = components to clear */
    SWS_UOP_LINEAR,          /* mask = non-trivial output rows */
    SWS_UOP_LINEAR_FMA,      /* with SWS_UOP_FLAG_FMA */
    SWS_UOP_DITHER,          /* mask = components to dither */

    /* Platform-specific uops would go here */
    SWS_UOP_TYPE_NB,         /* number of values above, including SWS_UOP_INVALID */
} SwsUOpType;

/* Parameters for the filtering read uops (the `filter` member of
 * SwsUOpParams is marked as being for SWS_UOP_READ_*_FV/FH). */
typedef struct SwsFilterUOp {
    SwsPixelType type; /* pixel type to store result as */
} SwsFilterUOp;

/* Shift parameters.
 * NOTE(review): presumably used by SWS_UOP_LSHIFT / SWS_UOP_RSHIFT -- confirm. */
typedef struct SwsShiftUOp {
    uint8_t amount; /* shift amount; presumably a bit count -- confirm */
} SwsShiftUOp;

/* Component rearrangement parameters.
 * NOTE(review): presumably used by SWS_UOP_PERMUTE / SWS_UOP_COPY -- confirm. */
typedef struct SwsSwizzleUOp {
    uint8_t in[4]; /* input component for each output component */
} SwsSwizzleUOp;

/* A list of register-to-register moves, described by parallel dst[] / src[]
 * arrays of register indices. */
typedef struct SwsMoveUOp {
    /* The worst case number of moves (for two independent cycles) */
    #define SWS_UOP_MOVE_MAX 6
    /* NOTE(review): presumably the number of valid entries in dst[] / src[]
     * -- confirm */
    int num_moves;

    /* This may involve a temporary register (index -1) */
    int8_t dst[SWS_UOP_MOVE_MAX]; /* destination register index */
    int8_t src[SWS_UOP_MOVE_MAX]; /* source register index */
} SwsMoveUOp;

/* Pack pattern parameters.
 * NOTE(review): presumably used by SWS_UOP_PACK / SWS_UOP_UNPACK -- confirm. */
typedef struct SwsPackUOp {
    uint8_t pattern[4]; /* bit depth pattern, from MSB to LSB */
} SwsPackUOp;

/* Two component masks flagging all-ones and all-zeros values.
 * NOTE(review): presumably used by SWS_UOP_CLEAR; the field comments say
 * "coefficients" although the masks are component masks -- confirm wording. */
typedef struct SwsClearUOp {
    SwsCompMask one;  /* mask of coefficients equal to all 1s */
    SwsCompMask zero; /* mask of coefficients equal to all 0s */
} SwsClearUOp;

/* Bit-masks classifying the coefficients of a linear uop.
 * NOTE(review): the bit layout is not visible in this header; presumably one
 * bit per entry of the 4x5 matrix stored in SwsUOp.data.mat4 -- confirm. */
typedef struct SwsLinearUOp {
    uint32_t one;  /* mask of coefficients equal to one */
    uint32_t zero; /* mask of coefficients equal to zero */

    /* for SWS_UOP_LINEAR_FMA only */
    uint32_t exact; /* mask of coefficients whose product is exact */
} SwsLinearUOp;

/* Dither parameters; ff_sws_dither_height() is documented as computing
 * (1 << size_log2) + MAX(y_offset) from these fields. */
typedef struct SwsDitherUOp {
    uint8_t y_offset[4]; /* presumably one line offset per component -- confirm */
    uint8_t size_log2;   /* base-2 logarithm of the size term used above */
} SwsDitherUOp;

/**
 * Computes (1 << size_log2) + MAX(y_offset). The dither matrix attached to
 * the SwsUOp is always pre-padded to this number of lines.
 *
 * @param dither dither parameters to read size_log2 and y_offset from
 * @return the number of lines as described above
 */
int ff_sws_dither_height(const SwsDitherUOp *dither);

/* Union of the per-uop parameter structs. Being a union, the members share
 * storage.
 * NOTE(review): which member is meaningful presumably depends on the uop
 * type -- confirm. */
typedef union SwsUOpParams {
    SwsFilterUOp    filter; /* for SWS_UOP_READ_*_FV/FH */
    SwsShiftUOp     shift;
    SwsSwizzleUOp   swizzle;
    SwsMoveUOp      move;
    SwsPackUOp      pack;
    SwsClearUOp     clear;
    SwsLinearUOp    lin;
    SwsDitherUOp    dither;
} SwsUOpParams;

/**
 * A single micro-op: a set of identifying fields, plus constant data that is
 * not part of the identity.
 */
typedef struct SwsUOp {
    /* These fields uniquely identify the uop implementation */
    SwsPixelType type;
    SwsUOpType uop;
    SwsCompMask mask;
    SwsUOpParams par;

    /* Constant data for this uop; not part of the unique identifier.
     * NOTE(review): which union member is in use presumably depends on
     * `uop` -- confirm against the code that fills this in. */
    union {
        SwsFilterWeights *kernel;   /* refstruct */
        SwsPixel *ptr;              /* refstruct */
        SwsPixel scalar;
        SwsPixel vec4[4];
        SwsPixel mat4[4][5];        /* row major */
        void *opaque;               /* reserved for internal use */
    } data;
} SwsUOp;

/**
 * Compare two SwsUOps for equality (excluding constant data).
 *
 * NOTE(review): the return convention is not visible in this header;
 * presumably 0 means equal, and the sign may define an ordering like other
 * cmp-style functions -- confirm against the implementation.
 */
int ff_sws_uop_cmp(const SwsUOp *a, const SwsUOp *b);

/*
 * Same as ff_sws_uop_cmp(), but taking untyped pointers. Both arguments are
 * forwarded unchanged, so they must point to SwsUOp objects.
 */
static inline int ff_sws_uop_cmp_v(const void *a, const void *b)
{
    const int result = ff_sws_uop_cmp(a, b);

    return result;
}

/**
 * Generate a unique name for a SwsUOp.
 *
 * @param op  uop to generate the name for
 * @param buf buffer declared as holding SWS_UOP_NAME_MAX chars.
 *            NOTE(review): presumably receives the name as a NUL-terminated
 *            string that fits into the buffer -- confirm.
 */
#define SWS_UOP_NAME_MAX 64
void ff_sws_uop_name(const SwsUOp *op, char buf[SWS_UOP_NAME_MAX]);

/* A list of uops, stored as a pointer plus an element count. */
typedef struct SwsUOpList {
    SwsUOp *ops; /* presumably an array of num_ops entries -- confirm */
    int num_ops;
} SwsUOpList;

/*
 * Allocation and release of a SwsUOpList.
 * NOTE(review): presumably alloc returns NULL on failure, and free resets
 * *ops to NULL (it takes a pointer to the list pointer) -- confirm.
 */
SwsUOpList *ff_sws_uop_list_alloc(void);
void ff_sws_uop_list_free(SwsUOpList **ops);

/* Takes over ownership of `uop` and sets it to {0}, even on failure.
 * NOTE(review): presumably returns 0 or a negative error code, like the
 * other int-returning functions in this header -- confirm. */
int ff_sws_uop_list_append(SwsUOpList *uops, SwsUOp *uop);

/**
 * Translate a list of operations down to micro-ops, which can be further
 * optimized and then directly executed by backends.
 *
 * @param ctx   context pointer; how it is used is not visible in this header
 * @param ops   list of operations to translate (not modified through this
 *              pointer)
 * @param flags SwsUOpFlags value; presumably the set of platform
 *              capabilities (SwsUOpFlagBits) to translate for -- confirm
 * @param uops  list of micro-ops; presumably receives the translated
 *              micro-ops -- confirm
 *
 * Return 0 or a negative error code.
 */
int ff_sws_ops_translate(SwsContext *ctx, const SwsOpList *ops,
                         SwsUOpFlags flags, SwsUOpList *uops);

/**
 * Generate a set of boilerplate C preprocessor macros for describing and
 * programmatically iterating over all possible SwsUOps.
 *
 * This function can be quite slow as it iterates over every possible
 * combination of pixel formats and flags.
 *
 * @param out_str on success, receives the allocated string
 *
 * Returns 0 or a negative error code. On success, an allocated string is
 * returned via `out_str`, and must be av_free()'d by the caller.
 */
int ff_sws_uops_macros_gen(char **out_str);

#endif