Files
ffmpeg/libswscale/aarch64/swscale_unscaled.c
T
DROOdotFOO and Ramiro Polla cc7c567920 swscale/aarch64/yuv2rgb_neon: add BE 16bpp output formats
BE counterparts to the LE paths in 2e142e52ae; pack adds rev16 before
store. nv12/nv21 paths are added but bench-only (no C ref, same as
2e142e52ae).

Test Name                              A55-gcc           M1-clang             A76-gcc
-------------------------------------------------------------------------------------
yuv420p_rgb565be_1920_neon    15086.1 ( 3.91x)    5507.0 ( 4.34x)    19229.1 ( 2.02x)
yuv420p_bgr565be_1920_neon    15291.7 ( 3.84x)    5476.9 ( 4.37x)    19229.4 ( 2.02x)
yuv420p_rgb555be_1920_neon    15091.5 ( 3.67x)    5569.0 ( 3.97x)    19229.3 ( 1.90x)
yuv420p_bgr555be_1920_neon    15298.6 ( 3.62x)    5600.6 ( 3.98x)    19228.8 ( 1.90x)
yuv422p_rgb565be_1920_neon    16862.3 ( 4.00x)    6378.8 ( 4.64x)    22110.3 ( 2.07x)
yuv422p_bgr565be_1920_neon    17139.3 ( 3.93x)    6448.1 ( 4.50x)    22104.1 ( 2.07x)
yuv422p_rgb555be_1920_neon    16853.3 ( 3.98x)    6468.8 ( 4.12x)    22106.4 ( 1.98x)
yuv422p_bgr555be_1920_neon    17202.2 ( 3.89x)    6467.0 ( 4.12x)    22110.2 ( 1.98x)
yuva420p_rgb565be_1920_neon   15050.2 ( 3.92x)    5452.5 ( 4.39x)    19229.5 ( 2.02x)
yuva420p_bgr565be_1920_neon   15346.6 ( 3.84x)    5462.4 ( 4.36x)    19228.9 ( 2.02x)
yuva420p_rgb555be_1920_neon   15050.8 ( 3.69x)    5463.3 ( 3.95x)    19228.6 ( 1.90x)
yuva420p_bgr555be_1920_neon   15352.8 ( 3.61x)    5543.6 ( 3.89x)    19228.6 ( 1.90x)

Co-authored-by: Ramiro Polla <ramiro.polla@gmail.com>
Signed-off-by: DROOdotFOO <drew@axol.io>
2026-06-10 17:54:20 +00:00

283 lines
18 KiB
C

/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/aarch64/cpu.h"
/*
 * Comma-separated initializer list with the four chroma coefficients
 * (v2r, u2g, v2g, u2b) read through a pointer named `c` that must be in
 * scope where the macro is expanded. It is meant to be expanded between
 * braces to initialize an int16_t array.
 *
 * The last line carries no line-continuation backslash on purpose: the
 * macro body ends with it, whatever line happens to follow.
 */
#define YUV_TO_RGB_TABLE                                                        \
        c->yuv2rgb_v2r_coeff,                                                   \
        c->yuv2rgb_u2g_coeff,                                                   \
        c->yuv2rgb_v2g_coeff,                                                   \
        c->yuv2rgb_u2b_coeff,
/*
 * For one (input format, packed output format) pair:
 *  - declare the routine ff_<ifmt>_to_<ofmt>_neon(), which is implemented
 *    outside this file and writes to a single destination plane;
 *  - define the static adapter <ifmt>_to_<ofmt>_neon_wrapper(), which takes
 *    the (context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride)
 *    argument list and forwards to that routine.
 *
 * The adapter passes the source width, the slice height, the luma offset
 * shifted right by 6, the luma coefficient, the chroma coefficient table
 * built from YUV_TO_RGB_TABLE, the source planes/strides unchanged, and the
 * destination pointer advanced to row srcSliceY. It returns the routine's
 * return value.
 *
 * The closing brace is the last line of the macro; it must not be followed
 * by a line-continuation backslash.
 */
#define DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(ifmt, ofmt)                                     \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                              \
                                 int y_offset,                                              \
                                 int y_coeff,                                               \
                                 const int16_t *table,                                      \
                                 const uint8_t *const src[], const int srcStride[],         \
                                 uint8_t *dst, int linesize);                               \
                                                                                            \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsInternal *c, const uint8_t *const src[],      \
                                           const int srcStride[], int srcSliceY,            \
                                           int srcSliceH, uint8_t *const dst[],             \
                                           const int dstStride[]) {                         \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                                   \
                                                                                            \
    return ff_##ifmt##_to_##ofmt##_neon(c->opts.src_w, srcSliceH,                           \
                                        c->yuv2rgb_y_offset >> 6,                           \
                                        c->yuv2rgb_y_coeff,                                 \
                                        yuv2rgb_table,                                      \
                                        src, srcStride,                                     \
                                        dst[0] + srcSliceY * dstStride[0],                  \
                                        dstStride[0]);                                      \
}
/*
 * Planar counterpart of the packed declaration macro. For one
 * (input format, planar output format) pair:
 *  - declare ff_<ifmt>_to_<ofmt>_neon(), implemented outside this file,
 *    which receives three destination pointers with their line sizes;
 *  - define the static adapter <ifmt>_to_<ofmt>_neon_wrapper() that
 *    forwards to it.
 *
 * The adapter passes the source width, the slice height, the luma offset
 * shifted right by 6, the luma coefficient, the chroma coefficient table
 * built from YUV_TO_RGB_TABLE, the source planes/strides unchanged, and
 * destination planes 0, 1 and 2, each advanced to row srcSliceY using its
 * own stride. It returns the routine's return value.
 *
 * The closing brace is the last line of the macro; it must not be followed
 * by a line-continuation backslash.
 */
#define DECLARE_FF_YUVX_TO_PLANAR_RGB_FUNCS(ifmt, ofmt)                                     \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                              \
                                 int y_offset,                                              \
                                 int y_coeff,                                               \
                                 const int16_t *table,                                      \
                                 const uint8_t *const src[], const int srcStride[],         \
                                 uint8_t *dst0, int linesize0,                              \
                                 uint8_t *dst1, int linesize1,                              \
                                 uint8_t *dst2, int linesize2);                             \
                                                                                            \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsInternal *c, const uint8_t *const src[],      \
                                           const int srcStride[], int srcSliceY,            \
                                           int srcSliceH, uint8_t *const dst[],             \
                                           const int dstStride[]) {                         \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                                   \
                                                                                            \
    return ff_##ifmt##_to_##ofmt##_neon(c->opts.src_w, srcSliceH,                           \
                                        c->yuv2rgb_y_offset >> 6,                           \
                                        c->yuv2rgb_y_coeff,                                 \
                                        yuv2rgb_table,                                      \
                                        src, srcStride,                                     \
                                        dst[0] + srcSliceY * dstStride[0], dstStride[0],    \
                                        dst[1] + srcSliceY * dstStride[1], dstStride[1],    \
                                        dst[2] + srcSliceY * dstStride[2], dstStride[2]);   \
}
/*
 * Emit prototype + wrapper for one input format and each of the outputs
 * argb, rgba, abgr, bgra, rgb24, bgr24 (packed) and gbrp (planar).
 *
 * The last entry has no line-continuation backslash, so the instantiations
 * that follow are separate top-level lines and not part of the macro body.
 */
#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx)                                 \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, argb)                                 \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgba)                                 \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, abgr)                                 \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgra)                                 \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgb24)                                \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgr24)                                \
DECLARE_FF_YUVX_TO_PLANAR_RGB_FUNCS(yuvx, gbrp)

/* Instantiate the full set for every supported input format. */
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(nv12)
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(nv21)
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)
/*
 * Emit prototype + wrapper for one input format and each 16 bpp packed
 * output: rgb565/bgr565/rgb555/bgr555, in both LE and BE byte order.
 *
 * The last entry has no line-continuation backslash, so the instantiations
 * that follow are separate top-level lines and not part of the macro body.
 */
#define DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(yuvx)                                \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgb565le)                             \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgr565le)                             \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgb555le)                             \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgr555le)                             \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgb565be)                             \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgr565be)                             \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgb555be)                             \
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgr555be)

/* Instantiate the 16 bpp set for every supported input format. */
DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(nv12)
DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(nv21)
DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(yuv420p)
DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(yuv422p)
/*
 * yuva420p gets dedicated prototypes/wrappers only for the four packed
 * outputs argb, rgba, abgr and bgra. Every other yuva420p destination
 * handled in this file is routed through the yuv420p wrappers instead.
 */
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuva420p, argb)
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuva420p, rgba)
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuva420p, abgr)
DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuva420p, bgra)
/*
 * Chroma conversion routine implemented outside this file.
 *
 * As called from nv24_to_yuv420p_neon_wrapper(): dst1/dst2 are the two
 * destination chroma planes with their strides, src is source plane 1 with
 * its stride, w is half the source width and h is the source slice height.
 * NOTE(review): presumably de-interleaves and 2x2-downsamples the chroma
 * samples -- confirm against the assembly implementation.
 */
void ff_nv24_to_yuv420p_chroma_neon(uint8_t *dst1, int dstStride1,
uint8_t *dst2, int dstStride2,
const uint8_t *src, int srcStride,
int w, int h);
/*
 * Unscaled converter used for NV24/NV42 input with YUV420P output.
 *
 * Plane 0 is copied as-is with ff_copyPlane(); source plane 1 is handed to
 * ff_nv24_to_yuv420p_chroma_neon() together with destination planes 1 and 2,
 * each advanced by (stride * srcSliceY / 2) bytes. For NV24 input plane 1 is
 * passed first; for any other input format the two destination planes are
 * swapped.
 *
 * Returns srcSliceH.
 */
static int nv24_to_yuv420p_neon_wrapper(SwsInternal *c, const uint8_t *const src[],
                                        const int srcStride[], int srcSliceY, int srcSliceH,
                                        uint8_t *const dst[], const int dstStride[])
{
    /* Which destination plane receives the first / second chroma output. */
    const int first    = c->opts.src_format == AV_PIX_FMT_NV24 ? 1 : 2;
    const int second   = 3 - first;
    const int chroma_w = c->opts.src_w / 2;

    uint8_t *const out_first  = dst[first]  + dstStride[first]  * srcSliceY / 2;
    uint8_t *const out_second = dst[second] + dstStride[second] * srcSliceY / 2;

    ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->opts.src_w,
                 dst[0], dstStride[0]);

    ff_nv24_to_yuv420p_chroma_neon(out_first,  dstStride[first],
                                   out_second, dstStride[second],
                                   src[1], srcStride[1], chroma_w,
                                   srcSliceH);

    return srcSliceH;
}
/*
 * Install <ifmt>_to_<ofmt>_neon_wrapper as c->convert_unscaled when the
 * conversion is IFMT -> OFMT, the source height is even, the source width is
 * a multiple of 16 and accurate_rnd evaluates to zero. Expects a pointer
 * named `c` in scope at the expansion site.
 *
 * We need a 16-pixel width alignment. This constraint could easily be removed
 * for input reading, but for the output, which is 4 bytes per pixel (RGBA),
 * the assembly might write as much as 4*15=60 extra bytes at the end of the
 * line, which won't fit the 32-byte buffer alignment.
 *
 * accurate_rnd is parenthesized before negation so that an expression
 * argument (e.g. `flags & MASK`) is negated as a whole rather than being
 * parsed as `(!flags) & MASK`.
 */
#define SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT, accurate_rnd) do {     \
    if (c->opts.src_format == AV_PIX_FMT_##IFMT                                 \
        && c->opts.dst_format == AV_PIX_FMT_##OFMT                              \
        && !(c->opts.src_h & 1)                                                 \
        && !(c->opts.src_w & 15)                                                \
        && !(accurate_rnd))                                                     \
        c->convert_unscaled = ifmt##_to_##ofmt##_neon_wrapper;                  \
} while (0)
/*
 * Run SET_FF_YUVX_TO_RGBX_FUNC for one source format against each of the
 * destinations rgb24, bgr24, argb, rgba, abgr, bgra and gbrp. Each entry
 * tests a different destination format, so at most one of them assigns.
 */
#define SET_FF_YUVX_TO_ALL_RGBX_FUNC(ifmt, IFMT, rnd)                           \
    do {                                                                        \
        /* three bytes per pixel, packed */                                     \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, rgb24, RGB24, rnd);                \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, bgr24, BGR24, rnd);                \
        /* four bytes per pixel, packed */                                      \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, argb,  ARGB,  rnd);                \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, rgba,  RGBA,  rnd);                \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, abgr,  ABGR,  rnd);                \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, bgra,  BGRA,  rnd);                \
        /* planar */                                                            \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, gbrp,  GBRP,  rnd);                \
    } while (0)
/*
 * Run SET_FF_YUVX_TO_RGBX_FUNC for one source format against each 16 bpp
 * destination (rgb565/bgr565/rgb555/bgr555, LE and BE). Each entry tests a
 * different destination format, so at most one of them assigns.
 */
#define SET_FF_YUVX_TO_ALL_RGB16_FUNC(ifmt, IFMT, rnd)                          \
    do {                                                                        \
        /* 5-6-5 layouts */                                                     \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, rgb565le, RGB565LE, rnd);          \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, rgb565be, RGB565BE, rnd);          \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, bgr565le, BGR565LE, rnd);          \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, bgr565be, BGR565BE, rnd);          \
        /* 5-5-5 layouts */                                                     \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, rgb555le, RGB555LE, rnd);          \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, rgb555be, RGB555BE, rnd);          \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, bgr555le, BGR555LE, rnd);          \
        SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, bgr555be, BGR555BE, rnd);          \
    } while (0)
/*
 * Pick a NEON unscaled converter for the context's source/destination format
 * pair, if one exists, and store it in c->convert_unscaled. When no pair
 * matches, c->convert_unscaled is left untouched.
 */
static void get_unscaled_swscale_neon(SwsInternal *c) {
    const int accurate_rnd = c->opts.flags & SWS_ACCURATE_RND;

    /* Every selection below requires an even source height, a source width
     * that is a multiple of 16 and no accurate rounding, so none of them can
     * match once any of these fails. */
    if (accurate_rnd || (c->opts.src_h & 1) || (c->opts.src_w & 15))
        return;

    /* nv12: all RGB destinations */
    SET_FF_YUVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd);
    SET_FF_YUVX_TO_ALL_RGB16_FUNC(nv12, NV12, accurate_rnd);

    /* nv21: all RGB destinations */
    SET_FF_YUVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
    SET_FF_YUVX_TO_ALL_RGB16_FUNC(nv21, NV21, accurate_rnd);

    /* yuv420p: all RGB destinations */
    SET_FF_YUVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
    SET_FF_YUVX_TO_ALL_RGB16_FUNC(yuv420p, YUV420P, accurate_rnd);

    /* yuv422p: all RGB destinations */
    SET_FF_YUVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
    SET_FF_YUVX_TO_ALL_RGB16_FUNC(yuv422p, YUV422P, accurate_rnd);

    /* yuva420p -> packed formats with an alpha channel: dedicated wrappers */
    SET_FF_YUVX_TO_RGBX_FUNC(yuva420p, YUVA420P, argb, ARGB, accurate_rnd);
    SET_FF_YUVX_TO_RGBX_FUNC(yuva420p, YUVA420P, rgba, RGBA, accurate_rnd);
    SET_FF_YUVX_TO_RGBX_FUNC(yuva420p, YUVA420P, abgr, ABGR, accurate_rnd);
    SET_FF_YUVX_TO_RGBX_FUNC(yuva420p, YUVA420P, bgra, BGRA, accurate_rnd);

    /* yuva420p -> rgb24/bgr24/gbrp: handled by the yuv420p wrappers */
    SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb24, RGB24, accurate_rnd);
    SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr24, BGR24, accurate_rnd);
    SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, gbrp,  GBRP,  accurate_rnd);

    /* yuva420p -> 16bpp: alpha is dropped, route through yuv420p NEON path */
    SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb565le, RGB565LE, accurate_rnd);
    SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb565be, RGB565BE, accurate_rnd);
    SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr565le, BGR565LE, accurate_rnd);
    SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr565be, BGR565BE, accurate_rnd);
    SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb555le, RGB555LE, accurate_rnd);
    SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb555be, RGB555BE, accurate_rnd);
    SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr555le, BGR555LE, accurate_rnd);
    SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr555be, BGR555BE, accurate_rnd);

    /* nv24 / nv42 -> yuv420p */
    if (c->opts.dst_format == AV_PIX_FMT_YUV420P) {
        const int nv24_like = c->opts.src_format == AV_PIX_FMT_NV24 ||
                              c->opts.src_format == AV_PIX_FMT_NV42;

        if (nv24_like)
            c->convert_unscaled = nv24_to_yuv420p_neon_wrapper;
    }
}
/*
 * AArch64 hook for unscaled conversions: queries the CPU flags and, when
 * have_neon() accepts them, lets get_unscaled_swscale_neon() choose a
 * converter for the context. Does nothing otherwise.
 */
void ff_get_unscaled_swscale_aarch64(SwsInternal *c)
{
    const int flags = av_get_cpu_flags();

    if (!have_neon(flags))
        return;

    get_unscaled_swscale_neon(c);
}
/* Case lists shared by the per-source switches in ff_yuv2rgb_init_aarch64();
 * `ifmt` selects which family of wrappers the cases return. */
#define YUV2RGB_NEON_CASES_32(ifmt)                                             \
    case AV_PIX_FMT_ARGB:     return ifmt##_to_argb_neon_wrapper;               \
    case AV_PIX_FMT_RGBA:     return ifmt##_to_rgba_neon_wrapper;               \
    case AV_PIX_FMT_ABGR:     return ifmt##_to_abgr_neon_wrapper;               \
    case AV_PIX_FMT_BGRA:     return ifmt##_to_bgra_neon_wrapper;

#define YUV2RGB_NEON_CASES_24(ifmt)                                             \
    case AV_PIX_FMT_RGB24:    return ifmt##_to_rgb24_neon_wrapper;              \
    case AV_PIX_FMT_BGR24:    return ifmt##_to_bgr24_neon_wrapper;              \
    case AV_PIX_FMT_GBRP:     return ifmt##_to_gbrp_neon_wrapper;

#define YUV2RGB_NEON_CASES_16(ifmt)                                             \
    case AV_PIX_FMT_RGB565LE: return ifmt##_to_rgb565le_neon_wrapper;           \
    case AV_PIX_FMT_BGR565LE: return ifmt##_to_bgr565le_neon_wrapper;           \
    case AV_PIX_FMT_RGB555LE: return ifmt##_to_rgb555le_neon_wrapper;           \
    case AV_PIX_FMT_BGR555LE: return ifmt##_to_bgr555le_neon_wrapper;           \
    case AV_PIX_FMT_RGB565BE: return ifmt##_to_rgb565be_neon_wrapper;           \
    case AV_PIX_FMT_BGR565BE: return ifmt##_to_bgr565be_neon_wrapper;           \
    case AV_PIX_FMT_RGB555BE: return ifmt##_to_rgb555be_neon_wrapper;           \
    case AV_PIX_FMT_BGR555BE: return ifmt##_to_bgr555be_neon_wrapper;

/*
 * Return the NEON yuv2rgb wrapper matching the context's source/destination
 * formats, or NULL when there is none.
 *
 * NULL is also returned when have_neon() rejects the CPU flags, when the
 * source height is odd, when the source width is not a multiple of 16, or
 * when SWS_ACCURATE_RND is set.
 *
 * Supported sources are YUV420P, YUVA420P and YUV422P. For YUVA420P the
 * argb/rgba/abgr/bgra destinations use the yuva420p wrappers and are only
 * compiled in when CONFIG_SWSCALE_ALPHA is enabled; all its other
 * destinations reuse the yuv420p wrappers.
 */
av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c)
{
    const int cpu_flags = av_get_cpu_flags();

    if (!have_neon(cpu_flags))
        return NULL;
    if (c->opts.flags & SWS_ACCURATE_RND)
        return NULL;
    if ((c->opts.src_h & 1) || (c->opts.src_w & 15))
        return NULL;

    switch (c->opts.src_format) {
    case AV_PIX_FMT_YUV420P:
        switch (c->opts.dst_format) {
        YUV2RGB_NEON_CASES_32(yuv420p)
        YUV2RGB_NEON_CASES_24(yuv420p)
        YUV2RGB_NEON_CASES_16(yuv420p)
        default:
            break;
        }
        break;

    case AV_PIX_FMT_YUVA420P:
        switch (c->opts.dst_format) {
#if CONFIG_SWSCALE_ALPHA
        YUV2RGB_NEON_CASES_32(yuva420p)
#endif
        YUV2RGB_NEON_CASES_24(yuv420p)
        /* 16bpp targets drop alpha, share yuv420p path */
        YUV2RGB_NEON_CASES_16(yuv420p)
        default:
            break;
        }
        break;

    case AV_PIX_FMT_YUV422P:
        switch (c->opts.dst_format) {
        YUV2RGB_NEON_CASES_32(yuv422p)
        YUV2RGB_NEON_CASES_24(yuv422p)
        YUV2RGB_NEON_CASES_16(yuv422p)
        default:
            break;
        }
        break;

    default:
        break;
    }

    return NULL;
}

#undef YUV2RGB_NEON_CASES_32
#undef YUV2RGB_NEON_CASES_24
#undef YUV2RGB_NEON_CASES_16