mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2026-06-11 08:13:06 +00:00
swscale/aarch64: add NEON yuv->rgb16 fast paths
Add NEON unscaled converters for {yuv420p, yuv422p, yuva420p, nv12, nv21}
to {rgb565le, bgr565le, rgb555le, bgr555le}.
The 16bpp packing uses v8/v9 as the output accumulators. Since AAPCS-64
requires d8-d15 to be callee-saved, declare_func now wraps an
stp d8, d9 / ldp d8, d9 pair around the 16bpp paths only (gated by .ifc on
the output format). The pattern matches libswscale/aarch64/hscale.S.
yuva420p -> 16bpp drops alpha and routes through the yuv420p wrappers,
mirroring how yuva420p -> rgb24/bgr24 already work in tree.
Speedup vs C at width=1920 on Apple M1 (checkasm --bench):
| input | rgb565le | bgr565le | rgb555le | bgr555le |
|----------|----------|----------|----------|----------|
| yuv420p | 3.69x | 3.68x | 3.28x | 3.31x |
| yuv422p | 4.70x | 4.70x | 4.32x | 4.35x |
| yuva420p | 3.67x | 3.66x | 3.32x | 3.27x |
NEON cycles are ~48 for planar and ~50.5 for semi-planar across all
four outputs. yuv422p shows the biggest speedup because its C
reference is the most expensive. 555 ratios trail 565 because the C
reference is faster for 555 (one fewer mask bit); NEON cycles are the
same. nv12/nv21 are bench-only (see the preceding checkasm commit) and
run at the same ~50.5 cycles.
This only handles the little-endian forms of the 16-bit RGB formats.
Verified with checkasm --test=sw_yuv2rgb (110/110) and the full
checkasm regression (7657/7657) on Apple M1.
Signed-off-by: DROOdotFOO <drew@axol.io>
This commit is contained in:
committed by
Martin Storsjö
co-authored by
Martin Storsjö
parent
34501921fd
commit
2e142e52ae
@@ -95,6 +95,15 @@ DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgr24)
|
||||
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
|
||||
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)
|
||||
|
||||
/* Expand DECLARE_FF_YUVX_TO_RGBX_FUNCS once per little-endian 16bpp RGB
 * output (rgb565le, bgr565le, rgb555le, bgr555le) for the planar input
 * format given as yuvx.
 * NOTE(review): DECLARE_FF_YUVX_TO_RGBX_FUNCS is defined outside this hunk;
 * presumably it declares ff_<yuvx>_to_<ofmt>_neon and defines the matching
 * <yuvx>_to_<ofmt>_neon_wrapper -- confirm against the full file. */
#define DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(yuvx) \
|
||||
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgb565le) \
|
||||
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgr565le) \
|
||||
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgb555le) \
|
||||
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgr555le) \
|
||||
|
||||
DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(yuv420p)
|
||||
DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(yuv422p)
|
||||
|
||||
#define DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(ofmt) \
|
||||
int ff_yuva420p_to_##ofmt##_neon(int w, int h, \
|
||||
uint8_t *dst, int linesize, \
|
||||
@@ -217,6 +226,15 @@ DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgr24)
|
||||
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
|
||||
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
|
||||
|
||||
/* Expand DECLARE_FF_NVX_TO_RGBX_FUNCS once per little-endian 16bpp RGB
 * output (rgb565le, bgr565le, rgb555le, bgr555le) for the semi-planar input
 * format given as nvx (instantiated below for nv12 and nv21).
 * NOTE(review): DECLARE_FF_NVX_TO_RGBX_FUNCS is defined outside this hunk;
 * presumably it declares ff_<nvx>_to_<ofmt>_neon and defines its wrapper --
 * confirm against the full file. */
#define DECLARE_FF_NVX_TO_ALL_RGB16_FUNCS(nvx) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgb565le) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgr565le) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgb555le) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgr555le) \
|
||||
|
||||
DECLARE_FF_NVX_TO_ALL_RGB16_FUNCS(nv12)
|
||||
DECLARE_FF_NVX_TO_ALL_RGB16_FUNCS(nv21)
|
||||
|
||||
/* We need a 16 pixel width alignment. This constraint can easily be removed
|
||||
* for input reading but for the output which is 4-bytes per pixel (RGBA) the
|
||||
* assembly might be writing as much as 4*15=60 extra bytes at the end of the
|
||||
@@ -240,6 +258,13 @@ DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
|
||||
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgr24, BGR24, accurate_rnd); \
|
||||
} while (0)
|
||||
|
||||
/* Invoke SET_FF_NVX_TO_RGBX_FUNC for each little-endian 16bpp RGB output
 * (RGB565LE, BGR565LE, RGB555LE, BGR555LE) with the given input format.
 *   nvx          lower-case input format name used to build function names
 *   NVX          upper-case input format name used to build the pixel format
 *   accurate_rnd forwarded unchanged to every SET_FF_NVX_TO_RGBX_FUNC call
 * Wrapped in do { } while (0) so the expansion behaves as one statement.
 * NOTE(review): SET_FF_NVX_TO_RGBX_FUNC is defined outside this hunk;
 * how it uses accurate_rnd and which field it assigns should be confirmed
 * against the full file. */
#define SET_FF_NVX_TO_ALL_RGB16_FUNC(nvx, NVX, accurate_rnd) do { \
|
||||
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgb565le, RGB565LE, accurate_rnd); \
|
||||
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgr565le, BGR565LE, accurate_rnd); \
|
||||
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgb555le, RGB555LE, accurate_rnd); \
|
||||
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgr555le, BGR555LE, accurate_rnd); \
|
||||
} while (0)
|
||||
|
||||
static void get_unscaled_swscale_neon(SwsInternal *c) {
|
||||
int accurate_rnd = c->opts.flags & SWS_ACCURATE_RND;
|
||||
|
||||
@@ -247,6 +272,10 @@ static void get_unscaled_swscale_neon(SwsInternal *c) {
|
||||
SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
|
||||
SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
|
||||
SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
|
||||
SET_FF_NVX_TO_ALL_RGB16_FUNC(nv12, NV12, accurate_rnd);
|
||||
SET_FF_NVX_TO_ALL_RGB16_FUNC(nv21, NV21, accurate_rnd);
|
||||
SET_FF_NVX_TO_ALL_RGB16_FUNC(yuv420p, YUV420P, accurate_rnd);
|
||||
SET_FF_NVX_TO_ALL_RGB16_FUNC(yuv422p, YUV422P, accurate_rnd);
|
||||
SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, argb, ARGB, accurate_rnd);
|
||||
SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, rgba, RGBA, accurate_rnd);
|
||||
SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, abgr, ABGR, accurate_rnd);
|
||||
@@ -254,6 +283,11 @@ static void get_unscaled_swscale_neon(SwsInternal *c) {
|
||||
SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb24, RGB24, accurate_rnd);
|
||||
SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr24, BGR24, accurate_rnd);
|
||||
SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, gbrp, GBRP, accurate_rnd);
|
||||
/* yuva420p -> 16bpp: alpha is dropped, route through yuv420p NEON path */
|
||||
SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb565le, RGB565LE, accurate_rnd);
|
||||
SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr565le, BGR565LE, accurate_rnd);
|
||||
SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb555le, RGB555LE, accurate_rnd);
|
||||
SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr555le, BGR555LE, accurate_rnd);
|
||||
|
||||
if (c->opts.dst_format == AV_PIX_FMT_YUV420P &&
|
||||
(c->opts.src_format == AV_PIX_FMT_NV24 || c->opts.src_format == AV_PIX_FMT_NV42) &&
|
||||
@@ -285,6 +319,10 @@ av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c)
|
||||
case AV_PIX_FMT_RGB24: return yuv420p_to_rgb24_neon_wrapper;
|
||||
case AV_PIX_FMT_BGR24: return yuv420p_to_bgr24_neon_wrapper;
|
||||
case AV_PIX_FMT_GBRP: return yuv420p_to_gbrp_neon_wrapper;
|
||||
case AV_PIX_FMT_RGB565LE: return yuv420p_to_rgb565le_neon_wrapper;
|
||||
case AV_PIX_FMT_BGR565LE: return yuv420p_to_bgr565le_neon_wrapper;
|
||||
case AV_PIX_FMT_RGB555LE: return yuv420p_to_rgb555le_neon_wrapper;
|
||||
case AV_PIX_FMT_BGR555LE: return yuv420p_to_bgr555le_neon_wrapper;
|
||||
}
|
||||
} else if (c->opts.src_format == AV_PIX_FMT_YUVA420P) {
|
||||
switch (c->opts.dst_format) {
|
||||
@@ -297,6 +335,11 @@ av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c)
|
||||
case AV_PIX_FMT_RGB24: return yuv420p_to_rgb24_neon_wrapper;
|
||||
case AV_PIX_FMT_BGR24: return yuv420p_to_bgr24_neon_wrapper;
|
||||
case AV_PIX_FMT_GBRP: return yuv420p_to_gbrp_neon_wrapper;
|
||||
/* 16bpp targets drop alpha, share yuv420p path */
|
||||
case AV_PIX_FMT_RGB565LE: return yuv420p_to_rgb565le_neon_wrapper;
|
||||
case AV_PIX_FMT_BGR565LE: return yuv420p_to_bgr565le_neon_wrapper;
|
||||
case AV_PIX_FMT_RGB555LE: return yuv420p_to_rgb555le_neon_wrapper;
|
||||
case AV_PIX_FMT_BGR555LE: return yuv420p_to_bgr555le_neon_wrapper;
|
||||
}
|
||||
} else if (c->opts.src_format == AV_PIX_FMT_YUV422P) {
|
||||
switch (c->opts.dst_format) {
|
||||
@@ -307,6 +350,10 @@ av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c)
|
||||
case AV_PIX_FMT_RGB24: return yuv422p_to_rgb24_neon_wrapper;
|
||||
case AV_PIX_FMT_BGR24: return yuv422p_to_bgr24_neon_wrapper;
|
||||
case AV_PIX_FMT_GBRP: return yuv422p_to_gbrp_neon_wrapper;
|
||||
case AV_PIX_FMT_RGB565LE: return yuv422p_to_rgb565le_neon_wrapper;
|
||||
case AV_PIX_FMT_BGR565LE: return yuv422p_to_bgr565le_neon_wrapper;
|
||||
case AV_PIX_FMT_RGB555LE: return yuv422p_to_rgb555le_neon_wrapper;
|
||||
case AV_PIX_FMT_BGR555LE: return yuv422p_to_bgr555le_neon_wrapper;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
|
||||
@@ -63,7 +63,23 @@
|
||||
add w17, w0, w0, lsl #1
|
||||
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
|
||||
.else
|
||||
.ifc \ofmt,rgb565le
|
||||
sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding)
|
||||
.else
|
||||
.ifc \ofmt,bgr565le
|
||||
sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding)
|
||||
.else
|
||||
.ifc \ofmt,rgb555le
|
||||
sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding)
|
||||
.else
|
||||
.ifc \ofmt,bgr555le
|
||||
sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding)
|
||||
.else
|
||||
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
@@ -96,7 +112,23 @@
|
||||
add w17, w0, w0, lsl #1
|
||||
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
|
||||
.else
|
||||
.ifc \ofmt,rgb565le
|
||||
sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding)
|
||||
.else
|
||||
.ifc \ofmt,bgr565le
|
||||
sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding)
|
||||
.else
|
||||
.ifc \ofmt,rgb555le
|
||||
sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding)
|
||||
.else
|
||||
.ifc \ofmt,bgr555le
|
||||
sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding)
|
||||
.else
|
||||
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
@@ -139,7 +171,23 @@
|
||||
add w17, w0, w0, lsl #1
|
||||
sub w3, w3, w17 // w3 = linesize - width * 3 (padding)
|
||||
.else
|
||||
.ifc \ofmt,rgb565le
|
||||
sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding)
|
||||
.else
|
||||
.ifc \ofmt,bgr565le
|
||||
sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding)
|
||||
.else
|
||||
.ifc \ofmt,rgb555le
|
||||
sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding)
|
||||
.else
|
||||
.ifc \ofmt,bgr555le
|
||||
sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding)
|
||||
.else
|
||||
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
@@ -230,9 +278,63 @@
|
||||
mov \a2, v29.8b // real alpha (next 8 pixels)
|
||||
.endm
|
||||
|
||||
// The 16bpp output paths use v8/v9 to assemble packed pixels before the
|
||||
// final st1. v8/v9 are AAPCS callee-saved (low 64 bits must be preserved),
|
||||
// so each function spills d8/d9 to the stack on entry and reloads on exit.
|
||||
// Other output formats don't touch v8-v15, so the save/restore is gated.
|
||||
// Push d8/d9 when the output format argument is one of the four 16bpp
// formats; for any other format the macro expands to nothing.
// The four .ifc tests compare against distinct names, so at most one of
// them matches and at most one stp is emitted per expansion.
// Each stp is pre-indexed: sp is decremented by 16 and d8/d9 are stored at
// the new sp, so the stack pointer moves by a multiple of 16 bytes.
.macro save_d8_d9_if_16bpp ofmt
|
||||
.ifc \ofmt,rgb565le
|
||||
stp d8, d9, [sp, #-0x10]!
|
||||
.endif
|
||||
.ifc \ofmt,bgr565le
|
||||
stp d8, d9, [sp, #-0x10]!
|
||||
.endif
|
||||
.ifc \ofmt,rgb555le
|
||||
stp d8, d9, [sp, #-0x10]!
|
||||
.endif
|
||||
.ifc \ofmt,bgr555le
|
||||
stp d8, d9, [sp, #-0x10]!
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// Pop d8/d9 when the output format argument is one of the four 16bpp
// formats; for any other format the macro expands to nothing.
// The format list here must stay identical to the one tested by
// save_d8_d9_if_16bpp, otherwise the push and pop would no longer pair up.
// Each ldp is post-indexed: d8/d9 are loaded from sp, then sp is
// incremented by 16, undoing the matching pre-indexed stp.
.macro restore_d8_d9_if_16bpp ofmt
|
||||
.ifc \ofmt,rgb565le
|
||||
ldp d8, d9, [sp], #0x10
|
||||
.endif
|
||||
.ifc \ofmt,bgr565le
|
||||
ldp d8, d9, [sp], #0x10
|
||||
.endif
|
||||
.ifc \ofmt,rgb555le
|
||||
ldp d8, d9, [sp], #0x10
|
||||
.endif
|
||||
.ifc \ofmt,bgr555le
|
||||
ldp d8, d9, [sp], #0x10
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// Pack 8 pixels of 16bpp output. The three channels are extracted via ushr,
|
||||
// widened to u16, then merged via shift-left-insert:
|
||||
// dst = (high << high_shl) | (mid << 5) | low
|
||||
// For RGB565LE pass (B, G, R) as (low, mid, high), g_shr=2, high_shl=11.
|
||||
// For BGR565LE pass (R, G, B), g_shr=2, high_shl=11.
|
||||
// For RGB555LE pass (B, G, R), g_shr=3, high_shl=10.
|
||||
// For BGR555LE pass (R, G, B), g_shr=3, high_shl=10.
|
||||
// Clobbers v20-v23.
|
||||
// Arguments are bare vector register names (e.g. v8, v4); the macro appends
// the .8b / .8h arrangement itself.
//   dst       receives eight packed 16-bit pixels
//   low_ch    channel placed in bits [4:0]
//   mid_ch    channel placed starting at bit 5
//   high_ch   channel placed starting at bit high_shl
//   g_shr     right shift applied to mid_ch (2 keeps 6 bits, 3 keeps 5 bits)
//   high_shl  bit position of the high channel (11 for 565, 10 for 555)
// v20-v23 are written before all inputs have been read, so neither dst nor
// any of the channel arguments may be one of v20-v23.
.macro pack_rgb16 dst, low_ch, mid_ch, high_ch, g_shr, high_shl
|
||||
// Reduce each 8-bit channel to its most significant bits.
ushr v20.8b, \high_ch\().8b, #3
|
||||
ushr v21.8b, \mid_ch\().8b, #\g_shr
|
||||
ushr v22.8b, \low_ch\().8b, #3
|
||||
// Zero-extend the low channel to 16-bit lanes; this initialises dst.
uxtl \dst\().8h, v22.8b
|
||||
// Widen the middle channel and insert it at bit 5; sli leaves the five
// low bits already in dst untouched.
uxtl v23.8h, v21.8b
|
||||
sli \dst\().8h, v23.8h, #5
|
||||
// Widen the high channel (reusing v23) and insert it at bit high_shl,
// again keeping every bit below the insertion point.
uxtl v23.8h, v20.8b
|
||||
sli \dst\().8h, v23.8h, #\high_shl
|
||||
.endm
|
||||
|
||||
.macro declare_func ifmt ofmt
|
||||
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
|
||||
load_args_\ifmt \ofmt
|
||||
save_d8_d9_if_16bpp \ofmt
|
||||
|
||||
movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant)
|
||||
movi v30.8b, #255 // alpha = 255 (loop-invariant)
|
||||
@@ -313,8 +415,40 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
|
||||
st1 { v6.8b, v7.8b }, [x10], #16
|
||||
st1 { v18.8b, v19.8b }, [x15], #16
|
||||
.else
|
||||
.ifc \ofmt,rgb565le
|
||||
compute_rgb v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b
|
||||
// RGB565 LE: (R[7:3] << 11) | (G[7:2] << 5) | B[7:3]
|
||||
pack_rgb16 v8, v6, v5, v4, 2, 11
|
||||
pack_rgb16 v9, v18, v17, v16, 2, 11
|
||||
st1 { v8.8h, v9.8h}, [x2], #32
|
||||
.else
|
||||
.ifc \ofmt,bgr565le
|
||||
compute_rgb v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b
|
||||
// BGR565 LE: (B[7:3] << 11) | (G[7:2] << 5) | R[7:3]
|
||||
pack_rgb16 v8, v4, v5, v6, 2, 11
|
||||
pack_rgb16 v9, v16, v17, v18, 2, 11
|
||||
st1 { v8.8h, v9.8h}, [x2], #32
|
||||
.else
|
||||
.ifc \ofmt,rgb555le
|
||||
compute_rgb v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b
|
||||
// RGB555 LE: (R[7:3] << 10) | (G[7:3] << 5) | B[7:3]
|
||||
pack_rgb16 v8, v6, v5, v4, 3, 10
|
||||
pack_rgb16 v9, v18, v17, v16, 3, 10
|
||||
st1 { v8.8h, v9.8h}, [x2], #32
|
||||
.else
|
||||
.ifc \ofmt,bgr555le
|
||||
compute_rgb v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b
|
||||
// BGR555 LE: (B[7:3] << 10) | (G[7:3] << 5) | R[7:3]
|
||||
pack_rgb16 v8, v4, v5, v6, 3, 10
|
||||
pack_rgb16 v9, v16, v17, v18, 3, 10
|
||||
st1 { v8.8h, v9.8h}, [x2], #32
|
||||
.else
|
||||
st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
|
||||
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
@@ -330,6 +464,7 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
|
||||
subs w1, w1, #1 // height -= 1
|
||||
b.gt 1b
|
||||
mov w0, w9
|
||||
restore_d8_d9_if_16bpp \ofmt
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
@@ -349,6 +484,18 @@ declare_rgb_funcs nv21
|
||||
declare_rgb_funcs yuv420p
|
||||
declare_rgb_funcs yuv422p
|
||||
|
||||
// Instantiate declare_func for one input format with each of the four
// little-endian 16bpp outputs. declare_func names its function
// ff_<ifmt>_to_<ofmt>_neon, so one expansion yields the rgb565le, bgr565le,
// rgb555le and bgr555le converters for that input format.
.macro declare_rgb16_funcs ifmt
|
||||
declare_func \ifmt, rgb565le
|
||||
declare_func \ifmt, bgr565le
|
||||
declare_func \ifmt, rgb555le
|
||||
declare_func \ifmt, bgr555le
|
||||
.endm
|
||||
|
||||
declare_rgb16_funcs nv12
|
||||
declare_rgb16_funcs nv21
|
||||
declare_rgb16_funcs yuv420p
|
||||
declare_rgb16_funcs yuv422p
|
||||
|
||||
.macro declare_yuva_funcs ifmt
|
||||
declare_func \ifmt, argb
|
||||
declare_func \ifmt, rgba
|
||||
|
||||
Reference in New Issue
Block a user