mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2026-06-11 08:13:06 +00:00
This decomposes a swizzle mask into a series of optimal register-register moves, using at most two temporary scratch registers. This is a better match for ASM-style backends than the existing PERMUTE/COPY uops that are designed for the needs of the C backend (or other backends which either apply the swizzle mask directly or permute pointers). I originally had logic equivalent to this written in NASM macros, but it was just such a complicated mess that I think it's better to rewrite it in C and have the resulting metadata be an explicit part of the uop definition. This commit only adds the uop, I'll update the x86 implementation in the next step. Co-authored-by: Ramiro Polla <ramiro.polla@gmail.com> Signed-off-by: Niklas Haas <git@haasn.dev>
268 lines
9.0 KiB
C
268 lines
9.0 KiB
C
/**
 * Copyright (C) 2025 Niklas Haas
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
|
|
|
#ifndef SWSCALE_UOPS_H
|
|
#define SWSCALE_UOPS_H
|
|
|
|
#include <assert.h>
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
|
|
/***************************************************************************
 * Note: This header must be usable at build time, to generate asm sources *
 ***************************************************************************/
|
|
|
|
#include "libavutil/attributes.h"
|
|
|
|
/* Forward declarations; this header only refers to these types through pointers. */
typedef struct SwsContext SwsContext;
|
|
typedef struct SwsFilterWeights SwsFilterWeights;
|
|
typedef struct SwsOpList SwsOpList;
|
|
|
|
/**
 * Scalar type of pixel values. The U8/U16/U32/F32 entries are the targets
 * named by the SWS_UOP_TO_* cast uops.
 */
typedef enum SwsPixelType {
|
|
SWS_PIXEL_NONE = 0,
|
|
SWS_PIXEL_U8,
|
|
SWS_PIXEL_U16,
|
|
SWS_PIXEL_U32,
|
|
SWS_PIXEL_F32,
|
|
/* Last entry; its value equals the number of entries above it */
SWS_PIXEL_TYPE_NB
|
|
} SwsPixelType;
|
|
|
|
/*
 * Pixel type helpers, implemented outside this header.
 * NOTE(review): judging by the names, these return a printable name, the
 * storage size, and whether the type is an integer type. The unit of the
 * size (bytes vs. bits) is not visible here -- confirm in the implementation.
 */
const char *ff_sws_pixel_type_name(SwsPixelType type);
|
|
int ff_sws_pixel_type_size(SwsPixelType type) av_const;
|
|
bool ff_sws_pixel_type_is_int(SwsPixelType type) av_const;
|
|
|
|
/**
 * A single value, accessible either as raw bytes (`data`) or through one of
 * the typed members u8 / u16 / u32 / f32.
 */
typedef union SwsPixel {
|
|
/* Raw byte view; kept as the first member (see the static_assert below) */
char data[4];
|
|

|
uint8_t u8;
|
|
uint16_t u16;
|
|
uint32_t u32;
|
|
float f32;
|
|
} SwsPixel;
|
|
|
|
/*
 * Ensures (SwsPixel) {0} is properly initialized to all zeros: a braced
 * initializer sets the first union member (`data`), so that member must be
 * as large as the union itself.
 */
|
|
static_assert(sizeof(SwsPixel) == sizeof(char[4]), "SwsPixel size mismatch");
|
|
|
|
/**
 * Bit-mask of components. Exact meaning depends on the usage context.
 * Bit X stands for component index X (see SWS_COMP); SWS_COMP_ALL covers the
 * four low bits.
 */
|
|
typedef uint8_t SwsCompMask;
|
|
enum {
|
|
SWS_COMP_NONE = 0,
|
|
SWS_COMP_ALL = 0xF,
|
|
/* Mask bit for component index X */
#define SWS_COMP(X) (1 << (X))
|
|
/* Evaluates to 1 if the bit for component X is set in mask, 0 otherwise */
#define SWS_COMP_TEST(mask, X) (!!((mask) & SWS_COMP(X)))
|
|
/* Flips the four component bits of mask */
#define SWS_COMP_INV(mask) ((mask) ^ SWS_COMP_ALL)
|
|
/* Mask with the N lowest bits set */
#define SWS_COMP_ELEMS(N) ((1 << (N)) - 1)
|
|
/* Builds a mask from four truth values: X -> bit 0, Y -> bit 1, Z -> bit 2, W -> bit 3 */
#define SWS_COMP_MASK(X, Y, Z, W) \
|
|
(((X) ? SWS_COMP(0) : 0) | \
|
|
((Y) ? SWS_COMP(1) : 0) | \
|
|
((Z) ? SWS_COMP(2) : 0) | \
|
|
((W) ? SWS_COMP(3) : 0))
|
|
};
|
|
|
|
/*
 * Flags type accepted by ff_sws_ops_translate().
 * NOTE(review): presumably a bitwise OR of SwsUOpFlagBits values -- confirm.
 */
typedef uint32_t SwsUOpFlags;
|
|
typedef enum SwsUOpFlagBits {
|
|
SWS_UOP_FLAG_NONE = 0,
|
|
SWS_UOP_FLAG_FMA = (1 << 0), /* platform supports FMA ops */
|
|
SWS_UOP_FLAG_MOVE = (1 << 1), /* platform supports SWS_UOP_MOVE */
|
|
} SwsUOpFlagBits;
|
|
|
|
/**
 * Kind of micro-op. The group and per-entry comments state how the
 * accompanying component mask is to be interpreted for that uop.
 */
typedef enum SwsUOpType {
|
|
SWS_UOP_INVALID = 0,
|
|

|
/* Read/write uops; mask = components to read/write */
|
|
SWS_UOP_READ_PLANAR, /* simple planar byte-aligned read */
|
|
SWS_UOP_READ_PLANAR_FH, /* planar read with horizontal filter */
|
|
SWS_UOP_READ_PLANAR_FV, /* planar read with vertical filter */
|
|
SWS_UOP_READ_PLANAR_FV_FMA, /* NOTE(review): presumably the FMA variant of READ_PLANAR_FV (cf. SWS_UOP_FLAG_FMA) -- confirm */
|
|
SWS_UOP_READ_PACKED, /* simple packed byte-aligned read */
|
|
SWS_UOP_READ_NIBBLE, /* fractional read (4 bits) from single plane */
|
|
SWS_UOP_READ_BIT, /* fractional read (1 bit) from single plane */
|
|

|
SWS_UOP_WRITE_PLANAR, /* simple planar byte-aligned write */
|
|
SWS_UOP_WRITE_PACKED, /* simple packed byte-aligned write */
|
|
SWS_UOP_WRITE_NIBBLE, /* fractional write (4 bits) to single plane */
|
|
SWS_UOP_WRITE_BIT, /* fractional write (1 bit) to single plane */
|
|

|
/* Data rearrangement uops; mask = non-trivial and needed components */
|
|
SWS_UOP_PERMUTE, /* rearrange components (no duplicates) */
|
|
SWS_UOP_COPY, /* copy/duplicate components */
|
|
SWS_UOP_MOVE, /* series of register-register assignments */
|
|

|
/* Data conversion / manipulation uops; mask = affected components */
|
|
SWS_UOP_SWAP_BYTES, /* swap byte order in components */
|
|
SWS_UOP_EXPAND_BIT, /* expand low-order bit to all bits in type */
|
|
SWS_UOP_EXPAND_PAIR, /* expand bytes in pairs (16 bit) */
|
|
SWS_UOP_EXPAND_QUAD, /* expand bytes in quads (32 bit) */
|
|
SWS_UOP_TO_U8, /* cast pixel values to SWS_PIXEL_U8 */
|
|
SWS_UOP_TO_U16, /* cast pixel values to SWS_PIXEL_U16 */
|
|
SWS_UOP_TO_U32, /* cast pixel values to SWS_PIXEL_U32 */
|
|
SWS_UOP_TO_F32, /* cast pixel values to SWS_PIXEL_F32 */
|
|

|
/* Arithmetic uops */
|
|
SWS_UOP_SCALE, /* multiply masked components by scalar */
|
|
SWS_UOP_ADD, /* add vec4 to masked components */
|
|
SWS_UOP_MIN, /* min(x, vec4) on masked components */
|
|
SWS_UOP_MAX, /* max(x, vec4) on masked components */
|
|

|
/* Identical to corresponding SwsOpType */
|
|
SWS_UOP_UNPACK, /* mask = nonzero components in pack pattern */
|
|
SWS_UOP_PACK, /* mask = nonzero components in pack pattern */
|
|
SWS_UOP_LSHIFT, /* mask = components to shift */
|
|
SWS_UOP_RSHIFT, /* mask = components to shift */
|
|
SWS_UOP_CLEAR, /* mask = components to clear */
|
|
SWS_UOP_LINEAR, /* mask = non-trivial output rows */
|
|
SWS_UOP_LINEAR_FMA, /* with SWS_UOP_FLAG_FMA */
|
|
SWS_UOP_DITHER, /* mask = components to dither */
|
|

|
/* Platform-specific uops would go here */
|
|
SWS_UOP_TYPE_NB,
|
|
} SwsUOpType;
|
|
|
|
/* Parameters for the filtered read uops (SWS_UOP_READ_*_FV/FH) */
typedef struct SwsFilterUOp {
|
|
SwsPixelType type; /* pixel type to store result as */
|
|
} SwsFilterUOp;
|
|
|
|
/* Shift parameters. NOTE(review): presumably for SWS_UOP_LSHIFT / SWS_UOP_RSHIFT -- confirm. */
typedef struct SwsShiftUOp {
|
|
uint8_t amount; /* NOTE(review): presumably the shift distance in bits -- confirm */
|
|
} SwsShiftUOp;
|
|
|
|
/*
 * Swizzle mask: for each of the four output components, which input
 * component it is taken from.
 * NOTE(review): presumably used by SWS_UOP_PERMUTE and SWS_UOP_COPY -- confirm.
 */
typedef struct SwsSwizzleUOp {
|
|
uint8_t in[4]; /* input component for each output component */
|
|
} SwsSwizzleUOp;
|
|
|
|
/*
 * A list of register-register moves, stored as parallel dst/src arrays.
 * NOTE(review): presumably parameters of SWS_UOP_MOVE, with only the first
 * `num_moves` entries being meaningful and applied in array order -- confirm.
 */
typedef struct SwsMoveUOp {
|
|
/*
 * The worst case number of moves (for two independent cycles), e.g. two
 * separate swaps of two registers, each taking three moves when routed
 * through a temporary.
 */
|
|
#define SWS_UOP_MOVE_MAX 6
|
|
int num_moves;
|
|

|
/*
 * This may involve a temporary register (index -1).
 * NOTE(review): the change description for this uop mentions up to two
 * scratch registers; confirm whether further negative indices can occur.
 */
|
|
int8_t dst[SWS_UOP_MOVE_MAX]; /* destination register index */
|
|
int8_t src[SWS_UOP_MOVE_MAX]; /* source register index */
|
|
} SwsMoveUOp;
|
|
|
|
/* Pack pattern. NOTE(review): presumably for SWS_UOP_PACK / SWS_UOP_UNPACK -- confirm. */
typedef struct SwsPackUOp {
|
|
uint8_t pattern[4]; /* bit depth pattern, from MSB to LSB */
|
|
} SwsPackUOp;
|
|
|
|
/*
 * Component masks flagging special clear values.
 * NOTE(review): presumably parameters of SWS_UOP_CLEAR -- confirm.
 */
typedef struct SwsClearUOp {
|
|
SwsCompMask one; /* mask of coefficients equal to all 1s */
|
|
SwsCompMask zero; /* mask of coefficients equal to all 0s */
|
|
} SwsClearUOp;
|
|
|
|
/*
 * Per-coefficient bit masks describing special cases of a linear operation.
 * NOTE(review): presumably parameters of SWS_UOP_LINEAR / SWS_UOP_LINEAR_FMA;
 * the mapping from bit position to coefficient is not visible here -- confirm.
 */
typedef struct SwsLinearUOp {
|
|
uint32_t one; /* mask of coefficients equal to one */
|
|
uint32_t zero; /* mask of coefficients equal to zero */
|
|

|
/* for SWS_UOP_LINEAR_FMA only */
|
|
uint32_t exact; /* mask of coefficients whose product is exact */
|
|
} SwsLinearUOp;
|
|
|
|
/*
 * Dither parameters; see ff_sws_dither_height() for how the two fields
 * combine into the padded matrix height.
 */
typedef struct SwsDitherUOp {
|
|
uint8_t y_offset[4]; /* NOTE(review): presumably a per-component line offset into the dither matrix -- confirm */
|
|
uint8_t size_log2; /* log2 of the matrix size; (1 << size_log2) is used in the height computation */
|
|
} SwsDitherUOp;
|
|
|
|
/**
 * Computes (1 << size_log2) + MAX(y_offset). The dither matrix attached to
 * the SwsUOp is always pre-padded to this number of lines.
 *
 * @param dither dither parameters to compute the height for
 * @return the padded height, in lines
 */
|
|
int ff_sws_dither_height(const SwsDitherUOp *dither);
|
|
|
|
/*
 * Union of the per-uop parameter structs.
 * NOTE(review): presumably the active member is determined by SwsUOp.uop -- confirm.
 */
typedef union SwsUOpParams {
|
|
SwsFilterUOp filter; /* for SWS_UOP_READ_*_FV/FH */
|
|
SwsShiftUOp shift;
|
|
SwsSwizzleUOp swizzle;
|
|
SwsMoveUOp move;
|
|
SwsPackUOp pack;
|
|
SwsClearUOp clear;
|
|
SwsLinearUOp lin;
|
|
SwsDitherUOp dither;
|
|
} SwsUOpParams;
|
|
|
|
/**
 * A single micro-op: the fields identifying its implementation (type, uop,
 * mask, par) followed by constant data that is not part of that identity.
 */
typedef struct SwsUOp {
|
|
/* These fields uniquely identify the uop implementation */
|
|
SwsPixelType type;
|
|
SwsUOpType uop;
|
|
SwsCompMask mask;
|
|
SwsUOpParams par;
|
|

|
/*
 * Constant data for this uop; not part of the unique identifier.
 * NOTE(review): which member is active presumably depends on `uop` -- confirm.
 */
|
|
union {
|
|
SwsFilterWeights *kernel; /* refstruct */
|
|
SwsPixel *ptr; /* refstruct */
|
|
SwsPixel scalar;
|
|
SwsPixel vec4[4];
|
|
SwsPixel mat4[4][5]; /* row major */
|
|
void *opaque; /* reserved for internal use */
|
|
} data;
|
|
} SwsUOp;
|
|
|
|
/**
 * Compare two SwsUOps for equality (excluding constant data).
 *
 * NOTE(review): the int return type and the `const void *` wrapper
 * ff_sws_uop_cmp_v() suggest a comparator-style result (0 when equal);
 * whether nonzero results define an ordering is not visible here -- confirm.
 */
|
|
int ff_sws_uop_cmp(const SwsUOp *a, const SwsUOp *b);
|
|
|
|
/**
 * Type-erased variant of ff_sws_uop_cmp(): accepts `const void *` arguments
 * and forwards them unchanged, returning the result as-is.
 */
static inline int ff_sws_uop_cmp_v(const void *a, const void *b)
{
    const int result = ff_sws_uop_cmp(a, b);

    return result;
}
|
|
|
|
/* Size of the `buf` array in the ff_sws_uop_name() prototype */
|
|
#define SWS_UOP_NAME_MAX 64
|
|
/**
 * Generate a unique name for a SwsUOp.
 *
 * @param op  the uop to name
 * @param buf output buffer, declared with SWS_UOP_NAME_MAX elements
 *
 * NOTE(review): whether the written name is always NUL-terminated within
 * SWS_UOP_NAME_MAX chars is not visible here -- confirm.
 */
void ff_sws_uop_name(const SwsUOp *op, char buf[SWS_UOP_NAME_MAX]);
|
|
|
|
/* A list of uops: a pointer to the elements and a count. */
typedef struct SwsUOpList {
|
|
SwsUOp *ops; /* NOTE(review): presumably a heap-allocated array owned by the list -- confirm */
|
|
int num_ops; /* NOTE(review): presumably the number of valid entries in `ops` -- confirm */
|
|
} SwsUOpList;
|
|
|
|
/*
 * List allocation and release.
 * NOTE(review): presumably alloc returns NULL on failure, and free releases
 * the list and resets the caller's pointer (it takes a SwsUOpList **) -- confirm.
 */
SwsUOpList *ff_sws_uop_list_alloc(void);
|
|
void ff_sws_uop_list_free(SwsUOpList **ops);
|
|

|
/*
 * Takes over ownership of `uop` and sets it to {0}, even on failure.
 * NOTE(review): presumably returns 0 or a negative error code, like
 * ff_sws_ops_translate() -- confirm.
 */
|
|
int ff_sws_uop_list_append(SwsUOpList *uops, SwsUOp *uop);
|
|
|
|
/**
 * Translate a list of operations down to micro-ops, which can be further
 * optimized and then directly executed by backends.
 *
 * @param ctx   scaler context
 * @param ops   operation list to translate (not modified through this pointer)
 * @param flags SwsUOpFlags value; see SwsUOpFlagBits for the defined bits
 * @param uops  list receiving the result. NOTE(review): presumably the
 *              generated uops are appended to it -- confirm.
 *
 * Return 0 or a negative error code.
 */
|
|
int ff_sws_ops_translate(SwsContext *ctx, const SwsOpList *ops,
|
|
SwsUOpFlags flags, SwsUOpList *uops);
|
|
|
|
/**
 * Generate a set of boilerplate C preprocessor macros for describing and
 * programmatically iterating over all possible SwsUOps.
 *
 * This function can be quite slow as it iterates over every possible
 * combination of pixel formats and flags.
 *
 * @param out_str receives the allocated string on success
 *
 * Returns 0 or a negative error code. On success, an allocated string is
 * returned via `out_str`, and must be av_free()'d by the caller.
 */
|
|
int ff_sws_uops_macros_gen(char **out_str);
|
|
|
|
#endif /* SWSCALE_UOPS_H */
|