Files
ffmpeg/libswscale/aarch64/yuv2rgb_neon.S
T
DROOdotFOO and Ramiro Polla cc7c567920 swscale/aarch64/yuv2rgb_neon: add BE 16bpp output formats
BE counterparts to the LE paths in 2e142e52ae; pack adds rev16 before
store. nv12/nv21 paths are added but bench-only (no C ref, same as
2e142e52ae).

Test Name                              A55-gcc           M1-clang             A76-gcc
-------------------------------------------------------------------------------------
yuv420p_rgb565be_1920_neon    15086.1 ( 3.91x)    5507.0 ( 4.34x)    19229.1 ( 2.02x)
yuv420p_bgr565be_1920_neon    15291.7 ( 3.84x)    5476.9 ( 4.37x)    19229.4 ( 2.02x)
yuv420p_rgb555be_1920_neon    15091.5 ( 3.67x)    5569.0 ( 3.97x)    19229.3 ( 1.90x)
yuv420p_bgr555be_1920_neon    15298.6 ( 3.62x)    5600.6 ( 3.98x)    19228.8 ( 1.90x)
yuv422p_rgb565be_1920_neon    16862.3 ( 4.00x)    6378.8 ( 4.64x)    22110.3 ( 2.07x)
yuv422p_bgr565be_1920_neon    17139.3 ( 3.93x)    6448.1 ( 4.50x)    22104.1 ( 2.07x)
yuv422p_rgb555be_1920_neon    16853.3 ( 3.98x)    6468.8 ( 4.12x)    22106.4 ( 1.98x)
yuv422p_bgr555be_1920_neon    17202.2 ( 3.89x)    6467.0 ( 4.12x)    22110.2 ( 1.98x)
yuva420p_rgb565be_1920_neon   15050.2 ( 3.92x)    5452.5 ( 4.39x)    19229.5 ( 2.02x)
yuva420p_bgr565be_1920_neon   15346.6 ( 3.84x)    5462.4 ( 4.36x)    19228.9 ( 2.02x)
yuva420p_rgb555be_1920_neon   15050.8 ( 3.69x)    5463.3 ( 3.95x)    19228.6 ( 1.90x)
yuva420p_bgr555be_1920_neon   15352.8 ( 3.61x)    5543.6 ( 3.89x)    19228.6 ( 1.90x)

Co-authored-by: Ramiro Polla <ramiro.polla@gmail.com>
Signed-off-by: DROOdotFOO <drew@axol.io>
2026-06-10 17:54:20 +00:00

1129 lines
43 KiB
ArmAsm

/*
* Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
* Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
* Copyright (c) 2026 Ramiro Polla
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
// Calling convention for ff_<ifmt>_to_<ofmt>_neon:
// w0 int w (width, multiple of 16)
// w1 int h
// w2 int y_offset
// w3 int y_coeff
// x4 const int16_t *yuv2rgb_table
// x5 const uint8_t *const src[] (Y, U/C, V, A as needed)
// x6 const int *const srcStride[]
// x7 uint8_t *dst0
// [sp + 0] int linesize0
// [sp + 8] uint8_t *dst1 (planar only)
// [sp + 16] int linesize1 (planar only)
// [sp + 24] uint8_t *dst2 (planar only)
// [sp + 32] int linesize2 (planar only)
// Passing src/srcStride as arrays keeps every scalar arg in a register and
// leaves only alternating int / pointer args on the stack. Every int is
// followed by an 8-byte-aligned pointer, so Apple's natural-alignment
// packing and AAPCS64's 8-byte slotting produce the same offsets and no
// per-ABI offset branching is needed.
// Incoming scalar arguments (register assignment per the calling
// convention documented above). widthx is the 64-bit view of width.
#define width w0
#define widthx x0
#define height w1
#define y_offset w2
#define y_coeff w3
#define table_ptr x4
// Source plane pointers (loaded from src[] in the prologue; x2/x3/x4 are
// reused for srcY/srcC/srcV once y_offset/y_coeff/table_ptr have been
// consumed by dup/ld1). srcC and srcU are two names for x3: srcC is the
// interleaved chroma plane of nv12/nv21, srcU the U plane of the planar
// inputs.
#define srcY x2
#define srcC x3
#define srcU x3
#define srcV x4
#define srcA x5
// Source plane padding, i.e. stride minus the bytes consumed per row
// (sign-extended in the prologue so the row-end increment is a single
// 64-bit add). The w-suffixed names are the 32-bit views used while
// loading the int strides.
#define srcPaddingY x10
#define srcPaddingC x11
#define srcPaddingU x11
#define srcPaddingV x12
#define srcPaddingA x6
#define srcPaddingYw w10
#define srcPaddingCw w11
#define srcPaddingUw w11
#define srcPaddingVw w12
#define srcPaddingAw w6
// Destination plane pointers. dst0 stays in its incoming argument
// register (x7). dst1/dst2 share x6/x5 with srcPaddingA/srcA, but those
// aliases never coexist in the same function (yuva is packed-only; gbrp
// is yuv420p/yuv422p/nv12/nv21).
#define dst0 x7
#define dst1 x6
#define dst2 x5
#define dstPadding0 x13
#define dstPadding1 x14
#define dstPadding2 x15
#define dstPadding0w w13
#define dstPadding1w w14
#define dstPadding2w w15
// Loop state. cur_width counts down the pixels left in the current row;
// orig_height keeps the incoming height for the return value.
// chroma_rewind/tmp are only used by the single-row increment_* macros.
#define cur_width w8
#define orig_height w9
#define chroma_rewind x16
#define tmp w17
#define tmpx x17
// Second-row scratch for the 2-lines-at-a-time paths. chroma_rewind and
// tmp are unused there (the chroma row is consumed by both output rows
// in the same iteration, so the rewind csel/add is gone), so x16/x17
// double as the line-2 luma and dst pointers.
#define l2_srcY x16
#define l2_dst0 x17
// Planar 2-line variant needs three line-2 dst pointers. x16/x17 are
// already taken by l2_srcY/l2_dst0, so l2_dst1/l2_dst2 land in the
// AAPCS callee-saved range and the 2-line gbrp prologue spills them.
#define l2_dst1 x19
#define l2_dst2 x20
// yuva420p 2-line carries a per-row alpha pointer (alpha is full
// resolution -- each output row reads its own 16 bytes). x14 (also
// named dstPadding1) is free for the yuva packed variants (no planar
// gbrp dst there).
#define l2_srcA x14
// --------------------------------------------------------------------
// Source-side argument unpacking.
// nv12 / nv21 (single-row path): fetch the Y and interleaved-chroma plane
// pointers and turn both strides into row-end paddings.
// In:  x5 = src[], x6 = srcStride[], widthx = zero-extended width.
// Out: srcY, srcC, srcPaddingY, srcPaddingC, chroma_rewind.
.macro src_load_args_nv12
        ldp             srcY, srcC, [x5]                        // src[0], src[1]
        ldpsw           srcPaddingY, srcPaddingC, [x6]          // (int64) srcStride[0], srcStride[1]
        neg             chroma_rewind, widthx                   // chroma_rewind = -width
        sub             srcPaddingY, srcPaddingY, widthx        // srcStride[0] - width
        sub             srcPaddingC, srcPaddingC, widthx        // srcStride[1] - width (one UV pair per 2 px)
.endm
// nv21 has the same plane layout; only the chroma byte order differs,
// which load_chroma_nv21 takes care of.
.macro src_load_args_nv21
        src_load_args_nv12
.endm
// yuv420p (single-row path): fetch the three plane pointers and turn the
// strides into row-end paddings.
// In:  x5 = src[], x6 = srcStride[], widthx = zero-extended width.
// Out: srcY, srcU, srcV, srcPaddingY/U/V, chroma_rewind.
.macro src_load_args_yuv420p
        ldp             srcY, srcU, [x5]                        // src[0], src[1]
        ldr             srcV, [x5, #16]                         // src[2]
        ldpsw           srcPaddingY, srcPaddingU, [x6]          // (int64) srcStride[0], srcStride[1]
        ldrsw           srcPaddingV, [x6, #8]                   // (int64) srcStride[2]
        neg             chroma_rewind, widthx                   // -width ...
        sub             srcPaddingY, srcPaddingY, widthx        // srcStride[0] - width
        sub             srcPaddingU, srcPaddingU, widthx, lsr #1 // srcStride[1] - width/2
        sub             srcPaddingV, srcPaddingV, widthx, lsr #1 // srcStride[2] - width/2
        asr             chroma_rewind, chroma_rewind, #1        // ... chroma_rewind = -width/2
.endm
// yuv422p: same plane layout as yuv420p, but chroma has full vertical
// resolution, so no chroma_rewind is set up here.
// In:  x5 = src[], x6 = srcStride[], widthx = zero-extended width.
// Out: srcY, srcU, srcV, srcPaddingY/U/V.
.macro src_load_args_yuv422p
        ldp             srcY, srcU, [x5]                        // src[0], src[1]
        ldr             srcV, [x5, #16]                         // src[2]
        ldpsw           srcPaddingY, srcPaddingU, [x6]          // (int64) srcStride[0], srcStride[1]
        ldrsw           srcPaddingV, [x6, #8]                   // (int64) srcStride[2]
        sub             srcPaddingY, srcPaddingY, widthx        // srcStride[0] - width
        sub             srcPaddingU, srcPaddingU, widthx, lsr #1 // srcStride[1] - width/2
        sub             srcPaddingV, srcPaddingV, widthx, lsr #1 // srcStride[2] - width/2
.endm
// yuva420p (single-row path): yuv420p plus a full-resolution alpha plane.
// srcPaddingA lives in x6 and srcA in x5, i.e. in the very registers that
// hold srcStride[] and src[]; each array must therefore be fully read
// before its last element overwrites the base register.
// In:  x5 = src[], x6 = srcStride[], widthx = zero-extended width.
// Out: srcY, srcU, srcV, srcA, srcPaddingY/U/V/A, chroma_rewind.
.macro src_load_args_yuva420p
        ldpsw           srcPaddingY, srcPaddingU, [x6]          // (int64) srcStride[0], srcStride[1]
        ldrsw           srcPaddingV, [x6, #8]                   // (int64) srcStride[2]
        ldrsw           srcPaddingA, [x6, #12]                  // (int64) srcStride[3]; x6 is gone after this
        ldp             srcY, srcU, [x5]                        // src[0], src[1]
        ldr             srcV, [x5, #16]                         // src[2]
        ldr             srcA, [x5, #24]                         // src[3]; x5 is gone after this
        neg             chroma_rewind, widthx                   // -width ...
        sub             srcPaddingY, srcPaddingY, widthx        // srcStride[0] - width
        sub             srcPaddingA, srcPaddingA, widthx        // srcStride[3] - width (alpha is full resolution)
        sub             srcPaddingU, srcPaddingU, widthx, lsr #1 // srcStride[1] - width/2
        sub             srcPaddingV, srcPaddingV, widthx, lsr #1 // srcStride[2] - width/2
        asr             chroma_rewind, chroma_rewind, #1        // ... chroma_rewind = -width/2
.endm
// Source loaders for the 2-lines-at-a-time functions. Each one derives
// l2_srcY = srcY + srcStride[0] before the stride is modified, then turns
// srcPaddingY into 2*srcStride[0] - width so that a single row-end add
// moves either luma pointer down by two rows. A chroma row is shared by
// the two output rows of an iteration, so chroma padding is the same as in
// the single-row loaders and chroma_rewind is not needed.
.macro src_load_args_nv12_2l
        ldp             srcY, srcC, [x5]                        // src[0], src[1]
        ldpsw           srcPaddingY, srcPaddingC, [x6]          // (int64) srcStride[0], srcStride[1]
        add             l2_srcY, srcY, srcPaddingY              // second luma row = srcY + srcStride[0]
        sub             srcPaddingC, srcPaddingC, widthx        // srcStride[1] - width
        lsl             srcPaddingY, srcPaddingY, #1            // 2*srcStride[0] ...
        sub             srcPaddingY, srcPaddingY, widthx        // ... - width
.endm
.macro src_load_args_nv21_2l
        src_load_args_nv12_2l
.endm
// yuv420p loader for the 2-lines-at-a-time functions.
// Out: srcY, l2_srcY (= srcY + srcStride[0]), srcU, srcV,
//      srcPaddingY = 2*srcStride[0] - width, srcPaddingU/V = stride - width/2.
.macro src_load_args_yuv420p_2l
        ldp             srcY, srcU, [x5]                        // src[0], src[1]
        ldr             srcV, [x5, #16]                         // src[2]
        ldpsw           srcPaddingY, srcPaddingU, [x6]          // (int64) srcStride[0], srcStride[1]
        ldrsw           srcPaddingV, [x6, #8]                   // (int64) srcStride[2]
        add             l2_srcY, srcY, srcPaddingY              // second luma row = srcY + srcStride[0]
        sub             srcPaddingU, srcPaddingU, widthx, lsr #1 // srcStride[1] - width/2
        sub             srcPaddingV, srcPaddingV, widthx, lsr #1 // srcStride[2] - width/2
        lsl             srcPaddingY, srcPaddingY, #1            // 2*srcStride[0] ...
        sub             srcPaddingY, srcPaddingY, widthx        // ... - width
.endm
// yuva420p loader for the 2-lines-at-a-time functions. Luma and alpha are
// both full resolution, so each gets a second-row pointer and a doubled
// stride. As in the single-row loader, the last element read from
// srcStride[] / src[] overwrites the array base register (x6 / x5).
.macro src_load_args_yuva420p_2l
        ldpsw           srcPaddingY, srcPaddingU, [x6]          // (int64) srcStride[0], srcStride[1]
        ldrsw           srcPaddingV, [x6, #8]                   // (int64) srcStride[2]
        ldrsw           srcPaddingA, [x6, #12]                  // (int64) srcStride[3]; x6 is gone after this
        ldp             srcY, srcU, [x5]                        // src[0], src[1]
        ldr             srcV, [x5, #16]                         // src[2]
        ldr             srcA, [x5, #24]                         // src[3]; x5 is gone after this
        add             l2_srcY, srcY, srcPaddingY              // second luma row  = srcY + srcStride[0]
        add             l2_srcA, srcA, srcPaddingA              // second alpha row = srcA + srcStride[3]
        sub             srcPaddingU, srcPaddingU, widthx, lsr #1 // srcStride[1] - width/2
        sub             srcPaddingV, srcPaddingV, widthx, lsr #1 // srcStride[2] - width/2
        lsl             srcPaddingY, srcPaddingY, #1            // 2*srcStride[0] ...
        lsl             srcPaddingA, srcPaddingA, #1            // 2*srcStride[3] ...
        sub             srcPaddingY, srcPaddingY, widthx        // ... - width
        sub             srcPaddingA, srcPaddingA, widthx        // ... - width
.endm
// --------------------------------------------------------------------
// Destination-side argument unpacking.
// Packed-output destination setup (single-row path).
//   bpp  bytes per output pixel; must be the literal 2, 3 or 4.
// In:  [sp] = int linesize0 (must be invoked before anything is pushed),
//      widthx = zero-extended width.
// Out: dstPadding0 = linesize0 - width*bpp, added to dst0 at each row end.
// Any other bpp is rejected at assembly time; without that check the
// macro would silently leave dstPadding0 equal to the raw linesize.
.macro dst_load_args_packed bpp
        ldrsw           dstPadding0, [sp]                       // (int64) linesize0
.ifc \bpp,2
        sub             dstPadding0, dstPadding0, widthx, lsl #1 // linesize0 - width*2
.else
.ifc \bpp,3
        sub             dstPadding0, dstPadding0, widthx, lsl #1
        sub             dstPadding0, dstPadding0, widthx        // linesize0 - width*3
.else
.ifc \bpp,4
        sub             dstPadding0, dstPadding0, widthx, lsl #2 // linesize0 - width*4
.else
        .error "dst_load_args_packed: bpp must be 2, 3 or 4"
.endif
.endif
.endif
.endm
// Planar (gbrp) destination setup for the single-row path. dst0 is already
// in x7; the remaining two plane pointers and all three linesizes come
// from the stack, so this must run before anything is pushed.
// Out: dst1, dst2, dstPadding0/1/2 = linesizeN - width (1 byte per pixel
//      and plane).
.macro dst_load_args_planar
        ldrsw           dstPadding0, [sp]                       // (int64) linesize0
        ldr             dst1, [sp, #8]                          // dst1
        ldrsw           dstPadding1, [sp, #16]                  // (int64) linesize1
        ldr             dst2, [sp, #24]                         // dst2
        ldrsw           dstPadding2, [sp, #32]                  // (int64) linesize2
        sub             dstPadding0, dstPadding0, widthx        // linesize0 - width
        sub             dstPadding1, dstPadding1, widthx        // linesize1 - width
        sub             dstPadding2, dstPadding2, widthx        // linesize2 - width
.endm
// Per-output-format aliases so the function templates can invoke
// dst_load_args_<ofmt> by name. They only select the bytes-per-pixel
// argument of dst_load_args_packed: 4 for argb/rgba/abgr/bgra, 3 for
// rgb24/bgr24, 2 for the eight 16bpp formats. gbrp maps to the planar
// loader instead.
.macro dst_load_args_argb
dst_load_args_packed 4
.endm
.macro dst_load_args_rgba
dst_load_args_packed 4
.endm
.macro dst_load_args_abgr
dst_load_args_packed 4
.endm
.macro dst_load_args_bgra
dst_load_args_packed 4
.endm
.macro dst_load_args_rgb24
dst_load_args_packed 3
.endm
.macro dst_load_args_bgr24
dst_load_args_packed 3
.endm
// 16bpp little-endian formats.
.macro dst_load_args_rgb565le
dst_load_args_packed 2
.endm
.macro dst_load_args_bgr565le
dst_load_args_packed 2
.endm
.macro dst_load_args_rgb555le
dst_load_args_packed 2
.endm
.macro dst_load_args_bgr555le
dst_load_args_packed 2
.endm
// 16bpp big-endian formats (same row size as their LE counterparts).
.macro dst_load_args_rgb565be
dst_load_args_packed 2
.endm
.macro dst_load_args_bgr565be
dst_load_args_packed 2
.endm
.macro dst_load_args_rgb555be
dst_load_args_packed 2
.endm
.macro dst_load_args_bgr555be
dst_load_args_packed 2
.endm
// Planar output: three 1-byte-per-pixel planes.
.macro dst_load_args_gbrp
dst_load_args_planar
.endm
// Packed-output destination setup for the 2-lines-at-a-time functions.
//   bpp  bytes per output pixel; must be the literal 2, 3 or 4.
// In:  [sp] = int linesize0 (must be invoked before anything is pushed),
//      dst0, widthx = zero-extended width.
// Out: l2_dst0 = dst0 + linesize0 (second output row),
//      dstPadding0 = 2*linesize0 - width*bpp, so one row-end add moves
//      either pointer down by two rows.
// Any other bpp is rejected at assembly time; without that check the
// macro would silently leave dstPadding0 equal to 2*linesize0.
.macro dst_load_args_packed_2l bpp
        ldrsw           dstPadding0, [sp]                       // (int64) linesize0
        add             l2_dst0, dst0, dstPadding0              // l2_dst0 = dst0 + linesize0
        lsl             dstPadding0, dstPadding0, #1            // 2*linesize0
.ifc \bpp,2
        sub             dstPadding0, dstPadding0, widthx, lsl #1 // 2*linesize0 - width*2
.else
.ifc \bpp,3
        sub             dstPadding0, dstPadding0, widthx, lsl #1
        sub             dstPadding0, dstPadding0, widthx        // 2*linesize0 - width*3
.else
.ifc \bpp,4
        sub             dstPadding0, dstPadding0, widthx, lsl #2 // 2*linesize0 - width*4
.else
        .error "dst_load_args_packed_2l: bpp must be 2, 3 or 4"
.endif
.endif
.endif
.endm
// Per-output-format aliases for the 2-lines-at-a-time templates, which
// invoke dst_load_args_<ofmt>_2l by name. They only select the
// bytes-per-pixel argument of dst_load_args_packed_2l: 4 for the 32bpp
// formats, 3 for rgb24/bgr24, 2 for the eight 16bpp formats. There is no
// gbrp alias here; declare_2l_gbrp calls dst_load_args_planar_2l itself.
.macro dst_load_args_argb_2l
dst_load_args_packed_2l 4
.endm
.macro dst_load_args_rgba_2l
dst_load_args_packed_2l 4
.endm
.macro dst_load_args_abgr_2l
dst_load_args_packed_2l 4
.endm
.macro dst_load_args_bgra_2l
dst_load_args_packed_2l 4
.endm
.macro dst_load_args_rgb24_2l
dst_load_args_packed_2l 3
.endm
.macro dst_load_args_bgr24_2l
dst_load_args_packed_2l 3
.endm
// 16bpp little-endian formats.
.macro dst_load_args_rgb565le_2l
dst_load_args_packed_2l 2
.endm
.macro dst_load_args_bgr565le_2l
dst_load_args_packed_2l 2
.endm
.macro dst_load_args_rgb555le_2l
dst_load_args_packed_2l 2
.endm
.macro dst_load_args_bgr555le_2l
dst_load_args_packed_2l 2
.endm
// 16bpp big-endian formats (same row size as their LE counterparts).
.macro dst_load_args_rgb565be_2l
dst_load_args_packed_2l 2
.endm
.macro dst_load_args_bgr565be_2l
dst_load_args_packed_2l 2
.endm
.macro dst_load_args_rgb555be_2l
dst_load_args_packed_2l 2
.endm
.macro dst_load_args_bgr555be_2l
dst_load_args_packed_2l 2
.endm
// Planar (gbrp) destination setup for the 2-lines-at-a-time function.
//   sp_off  number of bytes the function has pushed since entry, i.e. the
//           offset at which the caller's first stack argument now lives.
//           declare_2l_gbrp spills x19/x20 (16 bytes) and passes 16.
// Stack layout read here:
//   [sp + sp_off +  0]  int      linesize0
//   [sp + sp_off +  8]  uint8_t *dst1
//   [sp + sp_off + 16]  int      linesize1
//   [sp + sp_off + 24]  uint8_t *dst2
//   [sp + sp_off + 32]  int      linesize2
// Out: dst1, dst2, l2_dstN = dstN + linesizeN,
//      dstPaddingN = 2*linesizeN - width.
.macro dst_load_args_planar_2l sp_off
        ldrsw           dstPadding0, [sp, #(\sp_off + 0)]       // (int64) linesize0
        ldr             dst1, [sp, #(\sp_off + 8)]              // dst1
        ldrsw           dstPadding1, [sp, #(\sp_off + 16)]      // (int64) linesize1
        ldr             dst2, [sp, #(\sp_off + 24)]             // dst2
        ldrsw           dstPadding2, [sp, #(\sp_off + 32)]      // (int64) linesize2
        // plane 0
        add             l2_dst0, dst0, dstPadding0              // second row = dst0 + linesize0
        lsl             dstPadding0, dstPadding0, #1
        sub             dstPadding0, dstPadding0, widthx        // 2*linesize0 - width
        // plane 1
        add             l2_dst1, dst1, dstPadding1              // second row = dst1 + linesize1
        lsl             dstPadding1, dstPadding1, #1
        sub             dstPadding1, dstPadding1, widthx        // 2*linesize1 - width
        // plane 2
        add             l2_dst2, dst2, dstPadding2              // second row = dst2 + linesize2
        lsl             dstPadding2, dstPadding2, #1
        sub             dstPadding2, dstPadding2, widthx        // 2*linesize2 - width
.endm
// --------------------------------------------------------------------
// Per-input chroma load (run inside the inner loop).
// Every variant consumes the chroma for 16 output pixels (8 U and 8 V
// samples), post-increments its pointer(s), and leaves the samples widened
// to 16 bit and scaled by 1<<3 in v18 (U) and v19 (V). v16/v17 hold the
// raw bytes and are clobbered.
// nv12: interleaved U,V,U,V...; ld2 splits even bytes (U) into v16 and
// odd bytes (V) into v17.
.macro load_chroma_nv12
ld2 {v16.8b, v17.8b}, [srcC], #16
ushll v18.8h, v16.8b, #3
ushll v19.8h, v17.8b, #3
.endm
// nv21: interleaved V,U,V,U...; same load, but the widening targets are
// swapped so that v18 still ends up as U and v19 as V.
.macro load_chroma_nv21
ld2 {v16.8b, v17.8b}, [srcC], #16
ushll v19.8h, v16.8b, #3
ushll v18.8h, v17.8b, #3
.endm
// Planar chroma: 8 bytes from each of the U and V planes.
.macro load_chroma_yuv420p
ld1 {v16.8b}, [srcU], #8
ld1 {v17.8b}, [srcV], #8
ushll v18.8h, v16.8b, #3
ushll v19.8h, v17.8b, #3
.endm
// yuva420p and yuv422p use the same planar chroma layout per row.
.macro load_chroma_yuva420p
load_chroma_yuv420p
.endm
.macro load_chroma_yuv422p
load_chroma_yuv420p
.endm
// --------------------------------------------------------------------
// Row-end chroma pointer updates for the single-row functions. They run
// before height is decremented. For vertically-subsampled inputs one
// chroma row serves two output rows: while height is even the pointer is
// moved back to the start of the row just read (chroma_rewind), while
// height is odd it moves on to the next chroma row (padding).
// Clobbers tmpx and the condition flags.
.macro increment_nv12
        tst             height, #1                              // Z clear when height is odd
        csel            tmpx, srcPaddingC, chroma_rewind, ne    // step = (height & 1) ? srcPaddingC : -width
        add             srcC, srcC, tmpx
.endm
.macro increment_nv21
        increment_nv12
.endm
.macro increment_yuv420p
        tst             height, #1                              // Z clear when height is odd
        csel            tmpx, srcPaddingU, chroma_rewind, ne    // stepU = (height & 1) ? srcPaddingU : -width/2
        add             srcU, srcU, tmpx
        csel            tmpx, srcPaddingV, chroma_rewind, ne    // stepV = (height & 1) ? srcPaddingV : -width/2
        add             srcV, srcV, tmpx
.endm
// Alpha is full resolution: it moves to the next row every time.
.macro increment_yuva420p
        add             srcA, srcA, srcPaddingA
        increment_yuv420p
.endm
// yuv422p has one chroma row per output row, so there is nothing to
// rewind (and no flags or temporaries are touched).
.macro increment_yuv422p
        add             srcV, srcV, srcPaddingV
        add             srcU, srcU, srcPaddingU
.endm
// Row-end source pointer updates for the 2-lines-at-a-time functions.
// srcPaddingY (and srcPaddingA) were pre-doubled by the _2l loaders, so
// one add moves each full-resolution pointer down by two rows; chroma
// moves down by a single chroma row per row pair.
.macro src_increment_nv12_2l
        add             srcC, srcC, srcPaddingC                 // next chroma row
        add             srcY, srcY, srcPaddingY                 // first luma row  += 2 rows
        add             l2_srcY, l2_srcY, srcPaddingY           // second luma row += 2 rows
.endm
.macro src_increment_nv21_2l
        src_increment_nv12_2l
.endm
.macro src_increment_yuv420p_2l
        add             srcU, srcU, srcPaddingU                 // next U row
        add             srcV, srcV, srcPaddingV                 // next V row
        add             srcY, srcY, srcPaddingY                 // first luma row  += 2 rows
        add             l2_srcY, l2_srcY, srcPaddingY           // second luma row += 2 rows
.endm
.macro src_increment_yuva420p_2l
        src_increment_yuv420p_2l
        add             srcA, srcA, srcPaddingA                 // first alpha row  += 2 rows
        add             l2_srcA, l2_srcA, srcPaddingA           // second alpha row += 2 rows
.endm
// Row-end destination pointer updates for the 2-lines-at-a-time functions.
// The dstPaddingN values were pre-doubled by the _2l loaders, so both the
// first-row and the second-row pointer of a plane skip two rows.
.macro dst_increment_packed_2l
        add             l2_dst0, l2_dst0, dstPadding0
        add             dst0, dst0, dstPadding0
.endm
// Planar output: plane 0 is handled exactly like the packed case, then
// planes 1 and 2 follow.
.macro dst_increment_planar_2l
        dst_increment_packed_2l
        add             l2_dst1, l2_dst1, dstPadding1
        add             dst1, dst1, dstPadding1
        add             l2_dst2, l2_dst2, dstPadding2
        add             dst2, dst2, dstPadding2
.endm
// --------------------------------------------------------------------
// Shared compute / pack helpers.
// compute_rgb: combine scaled luma with the per-channel chroma offsets and
// narrow to 8 bit for 16 pixels.
//   In:  v26/v27 = scaled luma (pixels 0-7 / 8-15),
//        v20/v21 = R offsets, v22/v23 = G offsets, v24/v25 = B offsets.
//   Out: \r1,\g1,\b1 (pixels 0-7) and \r2,\g2,\b2 (pixels 8-15) as .8b.
// The arguments are bare register names (v4, v5, ...); the caller picks
// them so that the store instruction sees the channels in output order.
// The inputs v20-v27 are only read.
.macro compute_rgb r1 g1 b1 r2 g2 b2
add \r1\().8h, v26.8h, v20.8h // Y1 + R1
add \r2\().8h, v27.8h, v21.8h // Y2 + R2
add \g1\().8h, v26.8h, v22.8h // Y1 + G1
add \g2\().8h, v27.8h, v23.8h // Y2 + G2
add \b1\().8h, v26.8h, v24.8h // Y1 + B1
add \b2\().8h, v27.8h, v25.8h // Y2 + B2
// sqrshrun: rounding shift right by 1 with saturation to unsigned 8 bit.
sqrshrun \r1\().8b, \r1\().8h, #1 // clip_u8((Y1 + R1) >> 1)
sqrshrun \r2\().8b, \r2\().8h, #1 // clip_u8((Y2 + R2) >> 1)
sqrshrun \g1\().8b, \g1\().8h, #1 // clip_u8((Y1 + G1) >> 1)
sqrshrun \g2\().8b, \g2\().8h, #1 // clip_u8((Y2 + G2) >> 1)
sqrshrun \b1\().8b, \b1\().8h, #1 // clip_u8((Y1 + B1) >> 1)
sqrshrun \b2\().8b, \b2\().8h, #1 // clip_u8((Y2 + B2) >> 1)
.endm
// compute_rgb plus a constant alpha channel copied from v30 (set to 255
// in the prologue of the functions that use it).
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
compute_rgb \r1, \g1, \b1, \r2, \g2, \b2
mov \a1\().8b, v30.8b
mov \a2\().8b, v30.8b
.endm
// compute_rgb plus the alpha samples previously loaded into v28/v29.
.macro compute_rgba_alpha r1 g1 b1 a1 r2 g2 b2 a2
compute_rgb \r1, \g1, \b1, \r2, \g2, \b2
mov \a1\().8b, v28.8b // real alpha (first 8 pixels)
mov \a2\().8b, v29.8b // real alpha (next 8 pixels)
.endm
// Chroma -> RGB offsets, computed once per pixel column for both luma rows.
// In: v18/v19 (widened chroma from load_chroma_<ifmt>), v31 = 128*(1<<3),
// v1.h[0..3] = coefficients loaded from yuv2rgb_table (v2r, u2g, v2g, u2b).
// Out: v20-v25 (R1, R2, G1, G2, B1, B2).
// Clobbers v18/v19. Each of the 8 chroma samples covers two horizontally
// adjacent pixels, hence the zip1/zip2 of a vector with itself: it
// duplicates every lane, the low half going to the *1 register (pixels
// 0-7) and the high half to the *2 register (pixels 8-15).
.macro chroma_to_rgb_offsets
sub v18.8h, v18.8h, v31.8h // U*(1<<3) - 128*(1<<3)
sub v19.8h, v19.8h, v31.8h // V*(1<<3) - 128*(1<<3)
sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R)
sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g
sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g
sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B)
add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G)
zip2 v21.8h, v20.8h, v20.8h // R2
zip1 v20.8h, v20.8h, v20.8h // R1
zip2 v23.8h, v22.8h, v22.8h // G2
zip1 v22.8h, v22.8h, v22.8h // G1
zip2 v25.8h, v24.8h, v24.8h // B2
zip1 v24.8h, v24.8h, v24.8h // B1
.endm
// Load and scale 16 luma samples from \rsrcY into v26 (Y1) / v27 (Y2).
// v0 = y_coeff, v3 = y_offset (loop-invariant).
// \rsrcY is post-incremented by 16. v2 holds the raw bytes and is
// clobbered. Y1 covers pixels 0-7, Y2 pixels 8-15.
.macro load_luma rsrcY
ld1 {v2.16b}, [\rsrcY], #16 // load luma
ushll v26.8h, v2.8b, #3 // Y1*(1<<3)
ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3)
sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset
sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset
sqdmulh v26.8h, v26.8h, v0.8h // (Y1 * y_coeff) >> 15
sqdmulh v27.8h, v27.8h, v0.8h // (Y2 * y_coeff) >> 15
.endm
// Process one output row: load 16 luma px from \rsrcY, combine with the
// shared chroma offsets in v20-v25, and store 16 px in format \ofmt.
//   ifmt          input format; only yuva420p changes the behaviour (alpha
//                 is then loaded from \rsrcA instead of taken from v30)
//   ofmt          output format; selects exactly one of the store
//                 sequences below
//   rsrcY, rsrcA  luma / alpha row pointers, each post-incremented by 16.
//                 \rsrcA is only referenced when ifmt is yuva420p.
//   rdst0..rdst2  destination pointers, post-incremented by the bytes
//                 written. Only gbrp uses rdst1/rdst2.
// Packed callers pass the same dst three times.
// May clobber v2, v4-v7, v16-v19 and v26-v29. The 16bpp formats also write
// v8/v9, so the enclosing function must have saved d8/d9 beforehand.
// v20-v25 are only read, so a second invocation can reuse them.
.macro process_row ifmt, ofmt, rsrcY, rsrcA, rdst0, rdst1, rdst2
// Refresh rgb16/r_first/gshift/hshift/is_be for this ofmt; they drive the
// .if rgb16 section at the end and pack_rgb16_2l.
set_rgb16_predicates \ofmt
load_luma \rsrcY
.ifc \ifmt,yuva420p
ld1 {v28.8b, v29.8b}, [\rsrcA], #16 // 16 alpha bytes
.endif
// 32bpp formats: the channels are computed straight into the register
// that matches their byte position, so that st4 {v4-v7} / {v16-v19}
// interleaves them in output order.
.ifc \ofmt,argb // a r g b
compute_rgb v5, v6, v7, v17, v18, v19
.ifc \ifmt,yuva420p
mov v4.8b, v28.8b
mov v16.8b, v29.8b
.else
mov v4.8b, v30.8b
mov v16.8b, v30.8b
.endif
st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32
.endif
.ifc \ofmt,rgba // r g b a
compute_rgb v4, v5, v6, v16, v17, v18
.ifc \ifmt,yuva420p
mov v7.8b, v28.8b
mov v19.8b, v29.8b
.else
mov v7.8b, v30.8b
mov v19.8b, v30.8b
.endif
st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32
.endif
.ifc \ofmt,abgr // a b g r
compute_rgb v7, v6, v5, v19, v18, v17
.ifc \ifmt,yuva420p
mov v4.8b, v28.8b
mov v16.8b, v29.8b
.else
mov v4.8b, v30.8b
mov v16.8b, v30.8b
.endif
st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32
.endif
.ifc \ofmt,bgra // b g r a
compute_rgb v6, v5, v4, v18, v17, v16
.ifc \ifmt,yuva420p
mov v7.8b, v28.8b
mov v19.8b, v29.8b
.else
mov v7.8b, v30.8b
mov v19.8b, v30.8b
.endif
st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32
.endif
// 24bpp formats: same idea with st3 (8 pixels = 24 bytes per store).
.ifc \ofmt,rgb24
compute_rgb v4, v5, v6, v16, v17, v18
st3 { v4.8b, v5.8b, v6.8b}, [\rdst0], #24
st3 {v16.8b,v17.8b,v18.8b}, [\rdst0], #24
.endif
.ifc \ofmt,bgr24
compute_rgb v6, v5, v4, v18, v17, v16
st3 { v4.8b, v5.8b, v6.8b}, [\rdst0], #24
st3 {v16.8b,v17.8b,v18.8b}, [\rdst0], #24
.endif
// Planar: R -> v18/v19, G -> v4/v5, B -> v6/v7, so that each plane is
// written from a consecutive register pair:
// G to \rdst0, B to \rdst1, R to \rdst2.
.ifc \ofmt,gbrp
compute_rgb v18, v4, v6, v19, v5, v7
st1 { v4.8b, v5.8b }, [\rdst0], #16
st1 { v6.8b, v7.8b }, [\rdst1], #16
st1 { v18.8b, v19.8b }, [\rdst2], #16
.endif
// 16bpp formats: compute R/G/B into v4-v6 (pixels 0-7) and v16-v18
// (pixels 8-15), pack each group of 8 pixels into one vector of
// halfwords (v8, v9) and store 32 bytes.
.if rgb16
.ifc \ifmt,yuva420p
.error "yuva420p->rgb16 is dispatched through the yuv420p path (rgb16 has no alpha channel)"
.endif
compute_rgb v4, v5, v6, v16, v17, v18
.if r_first
// rgb*: (R << hshift) | (G << 5) | B
pack_rgb16_2l v8, v6, v5, v4, gshift, hshift
pack_rgb16_2l v9, v18, v17, v16, gshift, hshift
.else
// bgr*: (B << hshift) | (G << 5) | R
pack_rgb16_2l v8, v4, v5, v6, gshift, hshift
pack_rgb16_2l v9, v16, v17, v18, gshift, hshift
.endif
st1 { v8.8h, v9.8h}, [\rdst0], #32
.endif
.endm
// Private helper of set_rgb16_predicates: flag the current ofmt as a
// 16bpp format and record its layout.
//   r1st  1 when R sits in the high bits (rgb*), 0 for bgr*
//   gsh   right shift applied to the 8-bit G channel (2 for 565, 3 for 555)
//   hsh   left shift of the high channel (11 for 565, 10 for 555)
//   be    1 for the big-endian variants, 0 for little-endian
.macro set_rgb16_layout r1st, gsh, hsh, be
        .set            rgb16,   1
        .set            r_first, \r1st
        .set            gshift,  \gsh
        .set            hshift,  \hsh
        .set            is_be,   \be
.endm
// Translate \ofmt into assembly-time symbols, so that sibling macros can
// test .if rgb16 / .if r_first / .if is_be instead of repeating an .ifc
// cascade over all eight 16bpp format names. For every other ofmt all
// five symbols end up as 0. The symbols are global assembler state: they
// describe whichever ofmt this macro was last invoked with.
.macro set_rgb16_predicates ofmt
        // Default: not a 16bpp format.
        .set            rgb16,   0
        .set            r_first, 0
        .set            gshift,  0
        .set            hshift,  0
        .set            is_be,   0
        // Little-endian variants.
.ifc \ofmt,rgb565le
        set_rgb16_layout 1, 2, 11, 0
.endif
.ifc \ofmt,bgr565le
        set_rgb16_layout 0, 2, 11, 0
.endif
.ifc \ofmt,rgb555le
        set_rgb16_layout 1, 3, 10, 0
.endif
.ifc \ofmt,bgr555le
        set_rgb16_layout 0, 3, 10, 0
.endif
        // Big-endian variants.
.ifc \ofmt,rgb565be
        set_rgb16_layout 1, 2, 11, 1
.endif
.ifc \ofmt,bgr565be
        set_rgb16_layout 0, 2, 11, 1
.endif
.ifc \ofmt,rgb555be
        set_rgb16_layout 1, 3, 10, 1
.endif
.ifc \ofmt,bgr555be
        set_rgb16_layout 0, 3, 10, 1
.endif
.endm
// The 16bpp store sequences build their output in v8/v9. AAPCS64 makes
// the low 64 bits of v8-v15 (d8-d15) callee-saved, so functions emitting
// a 16bpp format must preserve d8/d9; no other ofmt touches v8-v15, which
// is why push and pop are conditional on rgb16.
// Both macros re-evaluate set_rgb16_predicates for \ofmt, so the rgb16 /
// r_first / gshift / hshift / is_be symbols describe \ofmt afterwards.
// The push moves sp by 16 bytes: stack arguments must be read before
// save_d8_d9_if_16bpp (or with adjusted offsets after it).
.macro save_d8_d9_if_16bpp ofmt
        set_rgb16_predicates \ofmt
.if rgb16
        stp             d8,  d9,  [sp, #-16]!                   // push d8/d9
.endif
.endm
.macro restore_d8_d9_if_16bpp ofmt
        set_rgb16_predicates \ofmt
.if rgb16
        ldp             d8,  d9,  [sp], #16                     // pop d8/d9
.endif
.endm
// Pack 8 pixels of 16bpp output. The three channels are extracted via ushr,
// widened to u16, then merged via shift-left-insert:
// dst = (high << high_shl) | (mid << 5) | low
// For RGB565LE/BE pass (B, G, R) as (low, mid, high), g_shr=2, high_shl=11.
// For BGR565LE/BE pass (R, G, B), g_shr=2, high_shl=11.
// For RGB555LE/BE pass (B, G, R), g_shr=3, high_shl=10.
// For BGR555LE/BE pass (R, G, B), g_shr=3, high_shl=10.
// dst and the channel arguments are bare register names; the channels are
// read as .8b and dst is written as .8h.
// Endianness is not a parameter: the macro tests the is_be symbol, so
// set_rgb16_predicates must have been evaluated for the intended ofmt
// before this macro is expanded.
// Clobbers v20-v23.
.macro pack_rgb16 dst, low_ch, mid_ch, high_ch, g_shr, high_shl
ushr v20.8b, \high_ch\().8b, #3
ushr v21.8b, \mid_ch\().8b, #\g_shr
ushr v22.8b, \low_ch\().8b, #3
uxtl \dst\().8h, v22.8b
uxtl v23.8h, v21.8b
sli \dst\().8h, v23.8h, #5
uxtl v23.8h, v20.8b
sli \dst\().8h, v23.8h, #\high_shl
// Big-endian output: swap the two bytes of every halfword before the store.
.if is_be
rev16 \dst\().16b, \dst\().16b
.endif
.endm
// As pack_rgb16 but uses v26-v29 as scratch (luma temps, dead after
// compute_rgb), so v20-v25 chroma contributions survive for the
// second luma row. yuva420p->rgb16 is dispatched through the yuv420p
// path, so v28/v29 aliasing alpha is not a concern here.
// Same arguments and same dependence on the is_be symbol as pack_rgb16.
// Clobbers v26-v29.
.macro pack_rgb16_2l dst, low_ch, mid_ch, high_ch, g_shr, high_shl
ushr v26.8b, \high_ch\().8b, #3
ushr v27.8b, \mid_ch\().8b, #\g_shr
ushr v28.8b, \low_ch\().8b, #3
uxtl \dst\().8h, v28.8b
uxtl v29.8h, v27.8b
sli \dst\().8h, v29.8h, #5
uxtl v29.8h, v26.8b
sli \dst\().8h, v29.8h, #\high_shl
// Big-endian output: swap the two bytes of every halfword before the store.
.if is_be
rev16 \dst\().16b, \dst\().16b
.endif
.endm
// Single-row function template: emits ff_<ifmt>_to_<ofmt>_neon.
// The inner loop (label 2) converts 16 pixels per iteration, the outer
// loop (label 1) one output row per iteration. Both loops test their
// counter after the body, so at least one 16-pixel block of one row is
// always processed.
// The chroma -> RGB offset math and the luma scaling are written out
// inline and interleaved with the loads instead of going through
// chroma_to_rgb_offsets / load_luma.
// NOTE(review): the interleaving looks like manual instruction scheduling;
// keep the order unless re-benchmarked.
// Returns the original height in w0.
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
uxtw widthx, width // ensure upper 32 bits of widthx are zero
dup v3.8h, y_offset // broadcast y_offset before w2 is reused
dup v0.8h, y_coeff // broadcast y_coeff before w3 is reused
ld1 {v1.1d}, [table_ptr] // load yuv2rgb_table before x4 is reused
src_load_args_\ifmt
// Stack arguments are read here, before the optional d8/d9 push below
// moves sp.
dst_load_args_\ofmt
// Also sets rgb16/r_first/gshift/hshift/is_be for \ofmt; the .if rgb16
// section and pack_rgb16 in the loop body depend on that.
save_d8_d9_if_16bpp \ofmt
movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant)
movi v30.8b, #255 // alpha = 255 (loop-invariant)
mov orig_height, height
1:
mov cur_width, width
2:
load_chroma_\ifmt
sub v18.8h, v18.8h, v31.8h // U*(1<<3) - 128*(1<<3)
sub v19.8h, v19.8h, v31.8h // V*(1<<3) - 128*(1<<3)
sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R)
sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g
ld1 {v2.16b}, [srcY], #16 // load luma (interleaved)
.ifc \ifmt,yuva420p
ld1 {v28.8b, v29.8b}, [srcA], #16 // load 16 alpha bytes
.endif
sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g
sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B)
ushll v26.8h, v2.8b, #3 // Y1*(1<<3)
ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3)
add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G)
sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset
sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset
zip2 v21.8h, v20.8h, v20.8h // R2
zip1 v20.8h, v20.8h, v20.8h // R1
sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
zip2 v23.8h, v22.8h, v22.8h // G2
zip1 v22.8h, v22.8h, v22.8h // G1
zip2 v25.8h, v24.8h, v24.8h // B2
zip1 v24.8h, v24.8h, v24.8h // B1
// 32bpp formats: only compute here, with every channel placed in the
// register matching its byte position (the numbers in the comments are
// the positions of r, g, b, a). The shared st4 pair is in the final .else
// branch further down.
.ifc \ofmt,argb // 1 2 3 0
.ifc \ifmt,yuva420p
compute_rgba_alpha v5,v6,v7,v4, v17,v18,v19,v16
.else
compute_rgba v5,v6,v7,v4, v17,v18,v19,v16
.endif
.endif
.ifc \ofmt,rgba // 0 1 2 3
.ifc \ifmt,yuva420p
compute_rgba_alpha v4,v5,v6,v7, v16,v17,v18,v19
.else
compute_rgba v4,v5,v6,v7, v16,v17,v18,v19
.endif
.endif
.ifc \ofmt,abgr // 3 2 1 0
.ifc \ifmt,yuva420p
compute_rgba_alpha v7,v6,v5,v4, v19,v18,v17,v16
.else
compute_rgba v7,v6,v5,v4, v19,v18,v17,v16
.endif
.endif
.ifc \ofmt,bgra // 2 1 0 3
.ifc \ifmt,yuva420p
compute_rgba_alpha v6,v5,v4,v7, v18,v17,v16,v19
.else
compute_rgba v6,v5,v4,v7, v18,v17,v16,v19
.endif
.endif
// Store selection: rgb24, else bgr24, else gbrp, else 16bpp, else the
// 32bpp st4 pair for whatever was computed above.
.ifc \ofmt,rgb24
compute_rgb v4,v5,v6, v16,v17,v18
st3 { v4.8b, v5.8b, v6.8b}, [dst0], #24
st3 {v16.8b,v17.8b,v18.8b}, [dst0], #24
.else
.ifc \ofmt,bgr24
compute_rgb v6,v5,v4, v18,v17,v16
st3 { v4.8b, v5.8b, v6.8b}, [dst0], #24
st3 {v16.8b,v17.8b,v18.8b}, [dst0], #24
.else
.ifc \ofmt,gbrp
// R -> v18/v19, G -> v4/v5, B -> v6/v7; G goes to dst0, B to dst1,
// R to dst2.
compute_rgb v18,v4,v6, v19,v5,v7
st1 { v4.8b, v5.8b }, [dst0], #16
st1 { v6.8b, v7.8b }, [dst1], #16
st1 { v18.8b, v19.8b }, [dst2], #16
.else
.if rgb16
compute_rgb v4,v5,v6, v16,v17,v18
// pack_rgb16 clobbers v20-v23; the chroma offsets held there have
// already been consumed by compute_rgb and are recomputed next iteration.
.if r_first
// rgb*: (R << hshift) | (G << 5) | B
pack_rgb16 v8, v6, v5, v4, gshift, hshift
pack_rgb16 v9, v18, v17, v16, gshift, hshift
.else
// bgr*: (B << hshift) | (G << 5) | R
pack_rgb16 v8, v4, v5, v6, gshift, hshift
pack_rgb16 v9, v16, v17, v18, gshift, hshift
.endif
st1 { v8.8h, v9.8h}, [dst0], #32
.else
st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [dst0], #32
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [dst0], #32
.endif
.endif
.endif
.endif
subs cur_width, cur_width, #16 // cur_width -= 16
b.gt 2b
// Row end: skip the padding of every plane, then let increment_<ifmt>
// advance or rewind chroma (it must see height before the decrement).
add dst0, dst0, dstPadding0 // dst0 += padding
.ifc \ofmt,gbrp
add dst1, dst1, dstPadding1 // dst1 += padding1
add dst2, dst2, dstPadding2 // dst2 += padding2
.endif
add srcY, srcY, srcPaddingY // srcY += paddingY
increment_\ifmt
subs height, height, #1 // height -= 1
b.gt 1b
mov w0, orig_height // return orig_height
restore_d8_d9_if_16bpp \ofmt
ret
endfunc
.endm
// Emit the single-row functions from \ifmt to each of the seven 8-bit
// per channel outputs (four 32bpp orders, gbrp, rgb24, bgr24).
.macro declare_rgb_funcs ifmt
declare_func \ifmt, argb
declare_func \ifmt, rgba
declare_func \ifmt, abgr
declare_func \ifmt, bgra
declare_func \ifmt, gbrp
declare_func \ifmt, rgb24
declare_func \ifmt, bgr24
.endm
// 2-lines-at-a-time variant of declare_func for the single-dst-pointer
// packed outputs (argb/rgba/abgr/bgra/rgb24/bgr24 and the eight 16bpp
// rgb565/bgr565/rgb555/bgr555 LE/BE formats) with vertically-
// subsampled inputs (nv12/nv21/yuv420p). Two consecutive output rows
// share one chroma row, so the chroma -> RGB offsets (v20-v25) are
// computed once and applied to both luma rows.
//
// Precondition: slice height is even. SET_FF_YUVX_TO_RGBX_FUNC gates
// on !(src_h & 1); scale_internal()'s macro_height_src check in
// libswscale/swscale.c rejects any odd srcSliceH for vertically-
// subsampled sources (chrSrcVSubSample > 0).
// The outer loop subtracts 2 from height per iteration and always
// processes a full row pair.
// Returns the original height in w0.
.macro declare_2l_packed ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
uxtw widthx, width
dup v3.8h, y_offset
dup v0.8h, y_coeff
ld1 {v1.1d}, [table_ptr]
src_load_args_\ifmt\()_2l
// Reads [sp]; must stay ahead of the optional d8/d9 push below.
dst_load_args_\ofmt\()_2l
save_d8_d9_if_16bpp \ofmt
movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant)
movi v30.8b, #255 // alpha = 255 (loop-invariant)
mov orig_height, height
1:
mov cur_width, width
2:
load_chroma_\ifmt
chroma_to_rgb_offsets
// The alpha-pointer argument is a placeholder: process_row references it
// only when ifmt is yuva420p, which has its own template
// (declare_2l_yuva).
process_row \ifmt, \ofmt, srcY, srcY, dst0, dst0, dst0
process_row \ifmt, \ofmt, l2_srcY, l2_srcY, l2_dst0, l2_dst0, l2_dst0
subs cur_width, cur_width, #16
b.gt 2b
dst_increment_packed_2l
src_increment_\ifmt\()_2l
subs height, height, #2
b.gt 1b
mov w0, orig_height
restore_d8_d9_if_16bpp \ofmt
ret
endfunc
.endm
// Emit the 2-lines-at-a-time functions from \ifmt to the six 24/32bpp
// packed outputs. gbrp is emitted separately by declare_2l_gbrp.
.macro declare_rgb_funcs_2l_packed ifmt
declare_2l_packed \ifmt, argb
declare_2l_packed \ifmt, rgba
declare_2l_packed \ifmt, abgr
declare_2l_packed \ifmt, bgra
declare_2l_packed \ifmt, rgb24
declare_2l_packed \ifmt, bgr24
.endm
// 2-lines-at-a-time variant for the gbrp planar output. Six dst pointers
// (three per row) exhaust the caller-saved registers, so x19/x20 are
// spilled AAPCS-style. Stack args for the line-1 dst1/dst2/linesize are
// read after the spill, so dst_load_args_planar_2l uses the shifted
// offsets.
// The constant-alpha register v30 is not initialised here; the gbrp branch
// of process_row does not read it.
// Same even-height precondition as the other 2-line templates: height is
// decremented by 2 per outer iteration. Returns the original height in w0.
.macro declare_2l_gbrp ifmt
function ff_\ifmt\()_to_gbrp_neon, export=1
uxtw widthx, width
dup v3.8h, y_offset
dup v0.8h, y_coeff
ld1 {v1.1d}, [table_ptr]
stp x19, x20, [sp, #-0x10]! // callee-saved (line2 planar ptrs)
src_load_args_\ifmt\()_2l
dst_load_args_planar_2l 16 // 16 = bytes pushed above
movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant)
mov orig_height, height
1:
mov cur_width, width
2:
load_chroma_\ifmt
chroma_to_rgb_offsets
// Second argument pair: the alpha pointer is a placeholder (process_row
// only references it for yuva420p).
process_row \ifmt, gbrp, srcY, srcY, dst0, dst1, dst2
process_row \ifmt, gbrp, l2_srcY, l2_srcY, l2_dst0, l2_dst1, l2_dst2
subs cur_width, cur_width, #16
b.gt 2b
dst_increment_planar_2l
src_increment_\ifmt\()_2l
subs height, height, #2
b.gt 1b
mov w0, orig_height
ldp x19, x20, [sp], #0x10 // restore callee-saved
ret
endfunc
.endm
// Vertically-subsampled inputs: both packed RGB and gbrp go through the
// 2-lines path. yuv422p has full-height chroma -- no sharing, so it
// keeps the single-row path for every ofmt.
// Each (ifmt, ofmt) pair must be instantiated through exactly one
// template: declare_func and the declare_2l_* templates emit the same
// symbol name ff_<ifmt>_to_<ofmt>_neon.
declare_rgb_funcs_2l_packed nv12
declare_2l_gbrp nv12
declare_rgb_funcs_2l_packed nv21
declare_2l_gbrp nv21
declare_rgb_funcs_2l_packed yuv420p
declare_2l_gbrp yuv420p
declare_rgb_funcs yuv422p
// 16bpp little-endian outputs, single-row template.
.macro declare_rgb16le_funcs ifmt
declare_func \ifmt, rgb565le
declare_func \ifmt, bgr565le
declare_func \ifmt, rgb555le
declare_func \ifmt, bgr555le
.endm
// 16bpp little-endian outputs, 2-lines-at-a-time template.
.macro declare_rgb16le_funcs_2l ifmt
declare_2l_packed \ifmt, rgb565le
declare_2l_packed \ifmt, bgr565le
declare_2l_packed \ifmt, rgb555le
declare_2l_packed \ifmt, bgr555le
.endm
// 16bpp big-endian outputs, single-row template.
.macro declare_rgb16be_funcs ifmt
declare_func \ifmt, rgb565be
declare_func \ifmt, bgr565be
declare_func \ifmt, rgb555be
declare_func \ifmt, bgr555be
.endm
// 16bpp big-endian outputs, 2-lines-at-a-time template.
.macro declare_rgb16be_funcs_2l ifmt
declare_2l_packed \ifmt, rgb565be
declare_2l_packed \ifmt, bgr565be
declare_2l_packed \ifmt, rgb555be
declare_2l_packed \ifmt, bgr555be
.endm
// Subsampled inputs take the 2-line rgb16 path; yuv422p stays single-row.
declare_rgb16le_funcs_2l nv12
declare_rgb16be_funcs_2l nv12
declare_rgb16le_funcs_2l nv21
declare_rgb16be_funcs_2l nv21
declare_rgb16le_funcs_2l yuv420p
declare_rgb16be_funcs_2l yuv420p
declare_rgb16le_funcs yuv422p
declare_rgb16be_funcs yuv422p
// Single-row functions from \ifmt to the four 32bpp outputs, intended for
// inputs that carry an alpha plane.
// NOTE(review): this macro is not invoked anywhere in the visible part of
// this file; the yuva420p functions are emitted by declare_yuva_funcs_2l.
// Invoking both for yuva420p would define each ff_yuva420p_to_<ofmt>_neon
// symbol twice.
.macro declare_yuva_funcs ifmt
declare_func \ifmt, argb
declare_func \ifmt, rgba
declare_func \ifmt, abgr
declare_func \ifmt, bgra
.endm
// 2-lines-at-a-time path for yuva420p -> {argb,rgba,abgr,bgra}. Chroma
// is vertically subsampled and shared between the two output rows; the
// alpha plane is full resolution, so each row loads its own 16 alpha
// bytes via process_row's \rsrcA arg (srcA / l2_srcA). The constant
// alpha (v30) is never read in this path, so its prologue movi is
// omitted.
// No d8/d9 save is emitted: the 32bpp branches of process_row do not
// write v8/v9.
// Same even-height precondition as the other 2-line templates: height is
// decremented by 2 per outer iteration. Returns the original height in w0.
.macro declare_2l_yuva ofmt
.ifc \ofmt,gbrp
.error "yuva420p->gbrp is dispatched through the yuv420p path (gbrp has no alpha channel)"
.endif
function ff_yuva420p_to_\ofmt\()_neon, export=1
uxtw widthx, width
dup v3.8h, y_offset
dup v0.8h, y_coeff
ld1 {v1.1d}, [table_ptr]
src_load_args_yuva420p_2l
dst_load_args_\ofmt\()_2l
movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant)
mov orig_height, height
1:
mov cur_width, width
2:
// Chroma layout is the same as for yuv420p.
load_chroma_yuv420p
chroma_to_rgb_offsets
process_row yuva420p, \ofmt, srcY, srcA, dst0, dst0, dst0
process_row yuva420p, \ofmt, l2_srcY, l2_srcA, l2_dst0, l2_dst0, l2_dst0
subs cur_width, cur_width, #16
b.gt 2b
dst_increment_packed_2l
src_increment_yuva420p_2l
subs height, height, #2
b.gt 1b
mov w0, orig_height
ret
endfunc
.endm
// Emit the four yuva420p -> 32bpp functions through the 2-lines-at-a-time
// alpha template, then instantiate them.
.macro declare_yuva_funcs_2l
declare_2l_yuva argb
declare_2l_yuva rgba
declare_2l_yuva abgr
declare_2l_yuva bgra
.endm
declare_yuva_funcs_2l