libswscale/arm: Switch consistent indentation to common style

Some of these files aligned instructions to 4/24 columns, while
we commonly indent arm/aarch64 assembly to 8/24 columns.
Some of these files also used a different alignment for the
operands.
This commit is contained in:
Martin Storsjö
2026-04-29 13:49:27 +03:00
parent c5a3cb00b7
commit 9653588441
6 changed files with 416 additions and 416 deletions
+44 -44
View File
@@ -22,48 +22,48 @@
#include "libavutil/arm/asm.S"
/*
 * ff_hscale_8_to_15_neon: horizontal filter over 8-bit source samples that
 * writes two 16-bit results per outer-loop iteration.
 *
 * Register roles, as used below:
 *   r1  = destination pointer (post-incremented by the final vst1.32)
 *   r2  = dstW, decremented by 2 per outer iteration
 *   r3  = source base pointer; copied into r0 for every output pair, so the
 *         incoming value of r0 is never read
 *   r4  = filter, r5 = filterPos, r6 = filterSize (loaded from the stack)
 *   r10 = filter row of the second output (filter + filterSize * 2 bytes)
 *
 * Stack arguments start at sp + 104 because the prologue pushes ten core
 * registers (40 bytes) and q4-q7 (64 bytes).
 * The inner loop consumes 8 taps per iteration for both outputs; the two
 * sums are narrowed to 16 bits with a saturating right shift by 7.
 * NOTE(review): the "subs r7, #8 / bgt" loop presumably relies on filterSize
 * being a multiple of 8 - confirm against the caller.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
function ff_hscale_8_to_15_neon, export=1
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ filter
ldr r5, [sp, #108] @ filterPos
ldr r6, [sp, #112] @ filterSize
add r10, r4, r6, lsl #1 @ filter2 = filter + filterSize * 2
1: ldr r8, [r5], #4 @ filterPos[0]
ldr r9, [r5], #4 @ filterPos[1]
vmov.s32 q4, #0 @ val accumulator
vmov.s32 q5, #0 @ val accumulator
mov r7, r6 @ tmpfilterSize = filterSize
mov r0, r3 @ srcp
2: add r11, r0, r8 @ srcp + filterPos[0]
add r12, r0, r9 @ srcp + filterPos[1]
vld1.8 d0, [r11] @ srcp[filterPos[0] + {0..7}]
vld1.8 d2, [r12] @ srcp[filterPos[1] + {0..7}]
vld1.16 {q2}, [r4]! @ load 8x16-bit filter values
vld1.16 {q3}, [r10]! @ load 8x16-bit filter values
vmovl.u8 q0, d0 @ unpack src values to 16-bit
vmovl.u8 q1, d2 @ unpack src values to 16-bit
vmull.s16 q8, d0, d4 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 1)
vmull.s16 q9, d1, d5 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 2)
vmull.s16 q10, d2, d6 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 1)
vmull.s16 q11, d3, d7 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 2)
vpadd.s32 d16, d16, d17 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
vpadd.s32 d17, d18, d19 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
vpadd.s32 d20, d20, d21 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
vpadd.s32 d21, d22, d23 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
vadd.s32 q4, q8 @ update val accumulator
vadd.s32 q5, q10 @ update val accumulator
add r0, #8 @ srcp += 8
subs r7, #8 @ tmpfilterSize -= 8
bgt 2b @ loop until tmpfilterSize is consumed
mov r4, r10 @ filter = filter2
add r10, r10, r6, lsl #1 @ filter2 += filterSize * 2
vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 1)
vpadd.s32 d9, d10, d11 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 2)
vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 4x32-bit sums into 2x32-bit
vqshrn.s32 d8, q4, #7 @ shift and clip the 2x16-bit final values
vst1.32 {d8[0]},[r1]! @ write destination
subs r2, #2 @ dstW -= 2
bgt 1b @ loop until end of line
vpop {q4-q7}
pop {r4-r12, pc}
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ filter
ldr r5, [sp, #108] @ filterPos
ldr r6, [sp, #112] @ filterSize
add r10, r4, r6, lsl #1 @ filter2 = filter + filterSize * 2
1: ldr r8, [r5], #4 @ filterPos[0]
ldr r9, [r5], #4 @ filterPos[1]
vmov.s32 q4, #0 @ val accumulator
vmov.s32 q5, #0 @ val accumulator
mov r7, r6 @ tmpfilterSize = filterSize
mov r0, r3 @ srcp
2: add r11, r0, r8 @ srcp + filterPos[0]
add r12, r0, r9 @ srcp + filterPos[1]
vld1.8 d0, [r11] @ srcp[filterPos[0] + {0..7}]
vld1.8 d2, [r12] @ srcp[filterPos[1] + {0..7}]
vld1.16 {q2}, [r4]! @ load 8x16-bit filter values
vld1.16 {q3}, [r10]! @ load 8x16-bit filter values
vmovl.u8 q0, d0 @ unpack src values to 16-bit
vmovl.u8 q1, d2 @ unpack src values to 16-bit
vmull.s16 q8, d0, d4 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 1)
vmull.s16 q9, d1, d5 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 2)
vmull.s16 q10, d2, d6 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 1)
vmull.s16 q11, d3, d7 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 2)
vpadd.s32 d16, d16, d17 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
vpadd.s32 d17, d18, d19 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
vpadd.s32 d20, d20, d21 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
vpadd.s32 d21, d22, d23 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
vadd.s32 q4, q8 @ update val accumulator
vadd.s32 q5, q10 @ update val accumulator
add r0, #8 @ srcp += 8
subs r7, #8 @ tmpfilterSize -= 8
bgt 2b @ loop until tmpfilterSize is consumed
mov r4, r10 @ filter = filter2
add r10, r10, r6, lsl #1 @ filter2 += filterSize * 2
vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 1)
vpadd.s32 d9, d10, d11 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 2)
vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 4x32-bit sums into 2x32-bit
vqshrn.s32 d8, q4, #7 @ shift and clip the 2x16-bit final values
vst1.32 {d8[0]},[r1]! @ write destination
subs r2, #2 @ dstW -= 2
bgt 1b @ loop until end of line
vpop {q4-q7}
pop {r4-r12, pc}
endfunc
+52 -52
View File
@@ -22,56 +22,56 @@
#include "libavutil/arm/asm.S"
/*
 * ff_yuv2planeX_8_neon: vertical multi-tap filter producing 8 output bytes
 * per outer-loop iteration from several 16-bit source lines.
 *
 * Register roles, as used below:
 *   r0 = filter coefficients (copied to r10 for each 8-pixel column group)
 *   r1 = filterSize (copied to r8, consumed 2 taps per inner iteration)
 *   r2 = array of source line pointers (copied to r9; two pointers are
 *        fetched per inner iteration)
 *   r3 = destination pointer (post-incremented by 8 bytes per store)
 *   r4 = dstW, r5 = dither, r6 = offset (loaded from the stack)
 *   r7 = i, the current column index
 *
 * Stack arguments start at sp + 104: ten core registers (40 bytes) plus
 * q4-q7 (64 bytes) are pushed by the prologue.
 * The accumulators start from the dither values shifted left by 12; a
 * non-zero offset rotates the 8 dither bytes by 3 first. The final sum is
 * shifted right by 19, then saturated to unsigned 16 bits and to unsigned
 * 8 bits.
 * NOTE(review): the inner loop steps two taps at a time, so filterSize is
 * presumably even - confirm against the caller.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
function ff_yuv2planeX_8_neon, export=1
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ dstW
ldr r5, [sp, #108] @ dither
ldr r6, [sp, #112] @ offset
vld1.8 {d0}, [r5] @ load 8x8-bit dither values
cmp r6, #0 @ check offsetting which can be 0 or 3 only
beq 1f
vext.u8 d0, d0, d0, #3 @ honor offsetting which can be 3 only
1: vmovl.u8 q0, d0 @ extend dither to 16-bit
vshll.u16 q1, d0, #12 @ extend dither to 32-bit with left shift by 12 (part 1)
vshll.u16 q2, d1, #12 @ extend dither to 32-bit with left shift by 12 (part 2)
mov r7, #0 @ i = 0
2: vmov.u8 q3, q1 @ initialize accumulator with dithering values (part 1)
vmov.u8 q4, q2 @ initialize accumulator with dithering values (part 2)
mov r8, r1 @ tmpFilterSize = filterSize
mov r9, r2 @ srcp
mov r10, r0 @ filterp
3: ldr r11, [r9], #4 @ get pointer @ src[j]
ldr r12, [r9], #4 @ get pointer @ src[j+1]
add r11, r11, r7, lsl #1 @ &src[j][i]
add r12, r12, r7, lsl #1 @ &src[j+1][i]
vld1.16 {q5}, [r11] @ read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
vld1.16 {q6}, [r12] @ read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
ldr r11, [r10], #4 @ read 2x16-bit coeffs (X, Y) at (filter[j], filter[j+1])
vmov.16 q7, q5 @ copy 8x16-bit @ src[j ][i + {0..7}] for following inplace zip instruction
vmov.16 q8, q6 @ copy 8x16-bit @ src[j+1][i + {0..7}] for following inplace zip instruction
vzip.16 q7, q8 @ A,I,B,J,C,K,D,L,E,M,F,N,G,O,H,P
vdup.32 q15, r11 @ X,Y,X,Y,X,Y,X,Y
vmull.s16 q9, d14, d30 @ A*X,I*Y,B*X,J*Y
vmull.s16 q10, d15, d31 @ C*X,K*Y,D*X,L*Y
vmull.s16 q11, d16, d30 @ E*X,M*Y,F*X,N*Y
vmull.s16 q12, d17, d31 @ G*X,O*Y,H*X,P*Y
vpadd.s32 d10, d18, d19 @ A*X+I*Y,B*X+J*Y
vpadd.s32 d11, d20, d21 @ C*X+K*Y,D*X+L*Y
vpadd.s32 d12, d22, d23 @ E*X+M*Y,F*X+N*Y
vpadd.s32 d13, d24, d25 @ G*X+O*Y,H*X+P*Y
vadd.s32 q3, q5 @ update val accumulator (part 1)
vadd.s32 q4, q6 @ update val accumulator (part 2)
subs r8, #2 @ tmpFilterSize -= 2
bgt 3b @ loop until filterSize is consumed
vshr.s32 q3, q3, #19 @ val>>19 (part 1)
vshr.s32 q4, q4, #19 @ val>>19 (part 2)
vqmovun.s32 d6, q3 @ clip16(val>>19) (part 1)
vqmovun.s32 d7, q4 @ clip16(val>>19) (part 2)
vqmovn.u16 d6, q3 @ merge part 1 and part 2
vst1.8 {d6}, [r3]! @ write destination
add r7, #8 @ i += 8
subs r4, r4, #8 @ dstW -= 8
bgt 2b @ loop until width is consumed
vpop {q4-q7}
pop {r4-r12, pc}
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ dstW
ldr r5, [sp, #108] @ dither
ldr r6, [sp, #112] @ offset
vld1.8 {d0}, [r5] @ load 8x8-bit dither values
cmp r6, #0 @ check offsetting which can be 0 or 3 only
beq 1f
vext.u8 d0, d0, d0, #3 @ honor offsetting which can be 3 only
1: vmovl.u8 q0, d0 @ extend dither to 16-bit
vshll.u16 q1, d0, #12 @ extend dither to 32-bit with left shift by 12 (part 1)
vshll.u16 q2, d1, #12 @ extend dither to 32-bit with left shift by 12 (part 2)
mov r7, #0 @ i = 0
2: vmov.u8 q3, q1 @ initialize accumulator with dithering values (part 1)
vmov.u8 q4, q2 @ initialize accumulator with dithering values (part 2)
mov r8, r1 @ tmpFilterSize = filterSize
mov r9, r2 @ srcp
mov r10, r0 @ filterp
3: ldr r11, [r9], #4 @ get pointer @ src[j]
ldr r12, [r9], #4 @ get pointer @ src[j+1]
add r11, r11, r7, lsl #1 @ &src[j][i]
add r12, r12, r7, lsl #1 @ &src[j+1][i]
vld1.16 {q5}, [r11] @ read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
vld1.16 {q6}, [r12] @ read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
ldr r11, [r10], #4 @ read 2x16-bit coeffs (X, Y) at (filter[j], filter[j+1])
vmov.16 q7, q5 @ copy 8x16-bit @ src[j ][i + {0..7}] for following inplace zip instruction
vmov.16 q8, q6 @ copy 8x16-bit @ src[j+1][i + {0..7}] for following inplace zip instruction
vzip.16 q7, q8 @ A,I,B,J,C,K,D,L,E,M,F,N,G,O,H,P
vdup.32 q15, r11 @ X,Y,X,Y,X,Y,X,Y
vmull.s16 q9, d14, d30 @ A*X,I*Y,B*X,J*Y
vmull.s16 q10, d15, d31 @ C*X,K*Y,D*X,L*Y
vmull.s16 q11, d16, d30 @ E*X,M*Y,F*X,N*Y
vmull.s16 q12, d17, d31 @ G*X,O*Y,H*X,P*Y
vpadd.s32 d10, d18, d19 @ A*X+I*Y,B*X+J*Y
vpadd.s32 d11, d20, d21 @ C*X+K*Y,D*X+L*Y
vpadd.s32 d12, d22, d23 @ E*X+M*Y,F*X+N*Y
vpadd.s32 d13, d24, d25 @ G*X+O*Y,H*X+P*Y
vadd.s32 q3, q5 @ update val accumulator (part 1)
vadd.s32 q4, q6 @ update val accumulator (part 2)
subs r8, #2 @ tmpFilterSize -= 2
bgt 3b @ loop until filterSize is consumed
vshr.s32 q3, q3, #19 @ val>>19 (part 1)
vshr.s32 q4, q4, #19 @ val>>19 (part 2)
vqmovun.s32 d6, q3 @ clip16(val>>19) (part 1)
vqmovun.s32 d7, q4 @ clip16(val>>19) (part 2)
vqmovn.u16 d6, q3 @ merge part 1 and part 2
vst1.8 {d6}, [r3]! @ write destination
add r7, #8 @ i += 8
subs r4, r4, #8 @ dstW -= 8
bgt 2b @ loop until width is consumed
vpop {q4-q7}
pop {r4-r12, pc}
endfunc
+23 -23
View File
@@ -36,34 +36,34 @@ alias y16x16_h, q14
alias_qw y8x16, q15
/*
 * init (16-bit precision variant): load the coefficient table at \src and
 * set up the constant registers.
 *
 * Nine 32-bit words are read in total (six by the first vld3 with
 * writeback, three by the lane load) and de-interleaved by three into
 * q13/q14/q15. Each is then narrowed to 16 bits with a rounding right shift
 * by 7 into CO_R / CO_G / CO_B. BIAS_Y is filled with 16 and BIAS_U with 128.
 * \src is advanced by the first load only. The upper lane of each q*_h
 * register is not loaded here.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro init src
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
vrshrn.i32 CO_R, q13, #7
vrshrn.i32 CO_G, q14, #7
vrshrn.i32 CO_B, q15, #7
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
vrshrn.i32 CO_R, q13, #7
vrshrn.i32 CO_G, q14, #7
vrshrn.i32 CO_B, q15, #7
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
.endm
/*
 * compute_y_16x1_step (16-bit precision variant): one colour-channel term of
 * the luma sum for 16 pixels.
 *
 *   \action  multiply instruction to apply (the callers pass vmul for the
 *            first term and vmla for the accumulating ones)
 *   \s8x16   q-register alias holding 16 unsigned 8-bit samples
 *   \coeff   coefficient operand handed to \action
 *
 * Both halves of \s8x16 are widened to 16 bits into n16x16_l/_h, then
 * \action combines them with \coeff into y16x16_l/_h.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro compute_y_16x1_step action, s8x16, coeff
vmovl.u8 n16x16_l, \s8x16\()_l
vmovl.u8 n16x16_h, \s8x16\()_h
vmovl.u8 n16x16_l, \s8x16\()_l
vmovl.u8 n16x16_h, \s8x16\()_h
\action y16x16_l, n16x16_l, \coeff
\action y16x16_h, n16x16_h, \coeff
\action y16x16_l, n16x16_l, \coeff
\action y16x16_h, n16x16_h, \coeff
.endm
/*
 * compute_y_16x1 (16-bit precision variant): luma for 16 pixels.
 *
 * Accumulates r8x16 * CO_RY + g8x16 * CO_GY + b8x16 * CO_BY in 16-bit lanes
 * (vmul for the first term, vmla for the rest), narrows the sums to 8 bits
 * with a rounding right shift by 8, and adds BIAS_Y with a plain
 * (non-saturating) 8-bit add. The result is left in y8x16.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro compute_y_16x1
compute_y_16x1_step vmul, r8x16, CO_RY
compute_y_16x1_step vmla, g8x16, CO_GY
compute_y_16x1_step vmla, b8x16, CO_BY
compute_y_16x1_step vmul, r8x16, CO_RY
compute_y_16x1_step vmla, g8x16, CO_GY
compute_y_16x1_step vmla, b8x16, CO_BY
vrshrn.i16 y8x16_l, y16x16_l, #8
vrshrn.i16 y8x16_h, y16x16_h, #8
vrshrn.i16 y8x16_l, y16x16_l, #8
vrshrn.i16 y8x16_h, y16x16_h, #8
vadd.u8 y8x16, y8x16, BIAS_Y
vadd.u8 y8x16, y8x16, BIAS_Y
.endm
alias c16x8, q15
@@ -71,13 +71,13 @@ alias_qw c8x8x2, q10
/*
 * compute_chroma_8x1 (16-bit precision variant): one chroma plane for 8
 * downsampled pixels.
 *
 *   \c  lower-case plane letter, selects the destination alias \c8x8
 *   \C  upper-case plane letter, selects CO_R\C / CO_G\C / CO_B\C and BIAS_\C
 *
 * c16x8 = r16x8 * CO_R\C + g16x8 * CO_G\C + b16x8 * CO_B\C in 16-bit lanes;
 * the sum is narrowed to 8 bits with a rounding right shift by 8 and the
 * bias is added with a plain 8-bit add.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro compute_chroma_8x1 c, C
vmul c16x8, r16x8, CO_R\C
vmla c16x8, g16x8, CO_G\C
vmla c16x8, b16x8, CO_B\C
vmul c16x8, r16x8, CO_R\C
vmla c16x8, g16x8, CO_G\C
vmla c16x8, b16x8, CO_B\C
vrshrn.i16 \c\()8x8, c16x8, #8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
vrshrn.i16 \c\()8x8, c16x8, #8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
.endm
loop_420sp rgbx, nv12, init, kernel_420_16x2, 16
loop_420sp rgbx, nv12, init, kernel_420_16x2, 16
#endif
+33 -33
View File
@@ -48,27 +48,27 @@ alias y8x16, y16x16_e
/*
 * init (32-bit precision variant): load the coefficient table at \src and
 * set up the constant registers.
 *
 * Nine 32-bit words are read (six by the first vld3 with writeback, three
 * by the lane load) and de-interleaved by three into q13/q14/q15. Unlike
 * the 16-bit variant there is no scaling shift: vmovn keeps the low 16 bits
 * of each word in CO_R / CO_G / CO_B. BIAS_Y is filled with 16 and BIAS_U
 * with 128.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro init src
// load s32x3x3, narrow to s16x3x3
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
// load s32x3x3, narrow to s16x3x3
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
vmovn.i32 CO_R, q13
vmovn.i32 CO_G, q14
vmovn.i32 CO_B, q15
vmovn.i32 CO_R, q13
vmovn.i32 CO_G, q14
vmovn.i32 CO_B, q15
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
.endm
/*
 * compute_y_16x1_step (32-bit precision variant): one colour-channel term of
 * the luma sum for 16 pixels, accumulated in 32-bit lanes.
 *
 *   \action  widening multiply to apply (the callers pass vmull for the
 *            first term and vmlal for the accumulating ones)
 *   \s8x16   q-register alias holding 16 unsigned 8-bit samples; it is
 *            modified in place by the vtrn below
 *   \coeff   coefficient operand handed to \action
 *
 * Transposing \s8x16 against a zeroed n16x16_o zero-extends the bytes to 16
 * bits without a separate widening step: the even-indexed bytes stay in
 * \s8x16 and the odd-indexed bytes land in n16x16_o. The four \action lines
 * then produce the even (y32x16_el/_eh) and odd (y32x16_ol/_oh) products.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro compute_y_16x1_step action, s8x16, coeff
vmov.u8 n16x16_o, #0
vtrn.u8 \s8x16, n16x16_o
vmov.u8 n16x16_o, #0
vtrn.u8 \s8x16, n16x16_o
\action y32x16_el, \s8x16\()_l, \coeff
\action y32x16_eh, \s8x16\()_h, \coeff
\action y32x16_ol, n16x16_ol, \coeff
\action y32x16_oh, n16x16_oh, \coeff
\action y32x16_el, \s8x16\()_l, \coeff
\action y32x16_eh, \s8x16\()_h, \coeff
\action y32x16_ol, n16x16_ol, \coeff
\action y32x16_oh, n16x16_oh, \coeff
.endm
/*
@@ -77,17 +77,17 @@ alias y8x16, y16x16_e
* clobber: q11-q15, r8x16, g8x16, b8x16
*/
/*
 * compute_y_16x1 (32-bit precision variant): luma for 16 pixels.
 *
 * The three channel terms are accumulated in 32-bit lanes, split into
 * even- and odd-indexed pixels by compute_y_16x1_step. Each accumulator is
 * narrowed to 16 bits with a rounding right shift by 15. The vtrn.8 then
 * interleaves the even and odd results byte-wise so that y16x16_e holds the
 * 16 luma bytes in pixel order, and BIAS_Y is added with a plain 8-bit add
 * through the y8x16 alias.
 * NOTE(review): this relies on y8x16 naming the same register as y16x16_e,
 * as the alias line in the hunk header suggests - confirm in the full file.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro compute_y_16x1
compute_y_16x1_step vmull, r8x16, CO_RY
compute_y_16x1_step vmlal, g8x16, CO_GY
compute_y_16x1_step vmlal, b8x16, CO_BY
compute_y_16x1_step vmull, r8x16, CO_RY
compute_y_16x1_step vmlal, g8x16, CO_GY
compute_y_16x1_step vmlal, b8x16, CO_BY
vrshrn.i32 y16x16_el, y32x16_el, #15
vrshrn.i32 y16x16_eh, y32x16_eh, #15
vrshrn.i32 y16x16_ol, y32x16_ol, #15
vrshrn.i32 y16x16_oh, y32x16_oh, #15
vrshrn.i32 y16x16_el, y32x16_el, #15
vrshrn.i32 y16x16_eh, y32x16_eh, #15
vrshrn.i32 y16x16_ol, y32x16_ol, #15
vrshrn.i32 y16x16_oh, y32x16_oh, #15
vtrn.8 y16x16_e, y16x16_o
vadd.u8 y8x16, y8x16, BIAS_Y
vtrn.8 y16x16_e, y16x16_o
vadd.u8 y8x16, y8x16, BIAS_Y
.endm
alias c32x8_l, q14
@@ -97,8 +97,8 @@ alias_qw c16x8, q13
alias_qw c8x8x2, q10
/*
 * compute_chroma_8x1_step: one colour-channel term of a chroma sum.
 * Applies \action (vmull or vmlal in the callers) to the low and high
 * halves of \s16x8 with \coeff, writing c32x8_l and c32x8_h.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro compute_chroma_8x1_step action, s16x8, coeff
\action c32x8_l, \s16x8\()_l, \coeff
\action c32x8_h, \s16x8\()_h, \coeff
\action c32x8_l, \s16x8\()_l, \coeff
\action c32x8_h, \s16x8\()_h, \coeff
.endm
/*
@@ -107,16 +107,16 @@ alias_qw c8x8x2, q10
* clobber: q14-q15
*/
/*
 * compute_chroma_8x1 (32-bit precision variant): one chroma plane for 8
 * downsampled pixels.
 *
 *   \c  lower-case plane letter, selects the destination alias \c8x8
 *   \C  upper-case plane letter, selects CO_R\C / CO_G\C / CO_B\C and BIAS_\C
 *
 * The three channel terms are accumulated in 32-bit lanes, narrowed to 16
 * bits with a rounding right shift by 15, truncated to their low byte by
 * vmovn (no saturation), and finally biased with a plain 8-bit add.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro compute_chroma_8x1 c, C
compute_chroma_8x1_step vmull, r16x8, CO_R\C
compute_chroma_8x1_step vmlal, g16x8, CO_G\C
compute_chroma_8x1_step vmlal, b16x8, CO_B\C
compute_chroma_8x1_step vmull, r16x8, CO_R\C
compute_chroma_8x1_step vmlal, g16x8, CO_G\C
compute_chroma_8x1_step vmlal, b16x8, CO_B\C
vrshrn.i32 c16x8_l, c32x8_l, #15
vrshrn.i32 c16x8_h, c32x8_h, #15
vmovn.i16 \c\()8x8, c16x8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
vrshrn.i32 c16x8_l, c32x8_l, #15
vrshrn.i32 c16x8_h, c32x8_h, #15
vmovn.i16 \c\()8x8, c16x8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
.endm
loop_420sp rgbx, nv12, init, kernel_420_16x2, 32
loop_420sp rgbx, nv12, init, kernel_420_16x2, 32
#endif
+109 -109
View File
@@ -31,10 +31,10 @@
.altmacro
/*
 * alias_dw_all: recursively define q<N>_l / q<N>_h aliases for the two
 * d-register halves of each q register, from q\qw up to q15.
 *
 *   \qw    index of the q register to alias in this step
 *   \dw_l  index of the d register forming its low half
 *   \dw_h  index of the d register forming its high half
 *
 * The %(...) arguments of the recursive call are evaluated as expressions,
 * which needs the .altmacro mode enabled just before this macro.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro alias_dw_all qw, dw_l, dw_h
alias q\qw\()_l, d\dw_l
alias q\qw\()_h, d\dw_h
alias q\qw\()_l, d\dw_l
alias q\qw\()_h, d\dw_h
.if \qw < 15
alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
.endif
.endm
@@ -43,23 +43,23 @@ alias_dw_all 0, 0, 1
.noaltmacro
/*
 * alias_qw: give a q register a symbolic name together with matching
 * \name_l / \name_h names for its halves (built on the q<N>_l / q<N>_h
 * aliases).
 *
 *   \name  symbolic name to define
 *   \qw    q register to bind it to
 *   \set   forwarded unchanged to the alias macro; defaults to 1.
 *          NOTE(review): presumably 1 defines and 0 removes the alias -
 *          the alias macro itself is outside this view.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro alias_qw name, qw, set=1
alias \name\(), \qw, \set
alias \name\()_l, \qw\()_l, \set
alias \name\()_h, \qw\()_h, \set
alias \name\(), \qw, \set
alias \name\()_l, \qw\()_l, \set
alias \name\()_h, \qw\()_h, \set
.endm
/*
 * prologue: save r4-r12 and lr (10 words, 40 bytes) followed by q4-q7
 * (64 bytes). load_arg depends on this exact 104-byte frame.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro prologue
push {r4-r12, lr}
vpush {q4-q7}
push {r4-r12, lr}
vpush {q4-q7}
.endm
/*
 * epilogue: undo the prologue - restore q4-q7, then r4-r12, and return by
 * popping the saved lr straight into pc.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro epilogue
vpop {q4-q7}
pop {r4-r12, pc}
vpop {q4-q7}
pop {r4-r12, pc}
.endm
/*
 * load_arg: load stack-passed argument number \ix (0-based) into \reg.
 *
 * The offset skips the frame built by the prologue macro (10 * 4 bytes of
 * core registers plus 4 * 16 bytes of q registers); "\ix - 4" accounts for
 * the first four arguments not being on the stack. Only valid for \ix >= 4
 * and while sp is unchanged since the prologue.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro load_arg reg, ix
ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
.endm
@@ -69,167 +69,167 @@ alias_dw_all 0, 0, 1
* int32_t coeff_table[9]);
*/
/*
 * alias_loop_420sp: symbolic names for the core registers used by
 * loop_420sp. \set is forwarded to every alias invocation.
 *
 * Several names deliberately share a register:
 *   src0 = src (r0), y0 = y (r1)
 *   header = width (r3)
 *   c_padding = c_stride (r6)
 *   y1 and coeff_table are both r12
 * so the value behind one name is lost once the other is written.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro alias_loop_420sp set=1
alias src, r0, \set
alias src0, src, \set
alias y, r1, \set
alias y0, y, \set
alias chroma, r2, \set
alias width, r3, \set
alias header, width, \set
alias src, r0, \set
alias src0, src, \set
alias y, r1, \set
alias y0, y, \set
alias chroma, r2, \set
alias width, r3, \set
alias header, width, \set
alias height, r4, \set
alias y_stride, r5, \set
alias c_stride, r6, \set
alias c_padding, c_stride, \set
alias src_stride, r7, \set
alias height, r4, \set
alias y_stride, r5, \set
alias c_stride, r6, \set
alias c_padding, c_stride, \set
alias src_stride, r7, \set
alias y0_end, r8, \set
alias y0_end, r8, \set
alias src_padding,r9, \set
alias y_padding, r10, \set
alias src_padding,r9, \set
alias y_padding, r10, \set
alias src1, r11, \set
alias y1, r12, \set
alias src1, r11, \set
alias y1, r12, \set
alias coeff_table,r12, \set
alias coeff_table,r12, \set
.endm
/*
 * loop_420sp: emit the exported function
 * \s_fmt\()_to_\d_fmt\()_neon_\precision, which walks the image two rows at
 * a time and calls \kernel for each group of pixels.
 *
 *   \s_fmt, \d_fmt  source / destination format names, forwarded to \kernel
 *   \init           macro run once with the coefficient-table pointer
 *   \kernel         macro processing one 2-row group; invoked once with an
 *                   extra count argument for the leading partial group and
 *                   without it for every full group
 *   \precision      suffix of the generated function name
 *
 * Ordering constraints that follow from the shared registers:
 *   - coeff_table shares r12 with y1, so \init runs before y1 is computed.
 *   - header shares r3 with width, so every use of the full width (the
 *     three padding values and y0_end) comes before "and header, width, #15"
 *     replaces it with width modulo 16.
 *   - c_padding overwrites c_stride in place.
 *
 * Per row pair: if header is non-zero the kernel first handles that many
 * leading pixels; the full kernel then repeats until y0 reaches y0_end.
 * Label 2 advances all pointers to the next row pair and the loop ends when
 * height, decremented by 2, is no longer positive.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro loop_420sp s_fmt, d_fmt, init, kernel, precision
function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
prologue
prologue
alias_loop_420sp
alias_loop_420sp
load_arg height, 4
load_arg y_stride, 5
load_arg c_stride, 6
load_arg src_stride, 7
load_arg coeff_table, 8
load_arg height, 4
load_arg y_stride, 5
load_arg c_stride, 6
load_arg src_stride, 7
load_arg coeff_table, 8
\init coeff_table
\init coeff_table
sub y_padding, y_stride, width
sub c_padding, c_stride, width
sub src_padding, src_stride, width, lsl #2
sub y_padding, y_stride, width
sub c_padding, c_stride, width
sub src_padding, src_stride, width, lsl #2
add y0_end, y0, width
and header, width, #15
add y0_end, y0, width
and header, width, #15
add y1, y0, y_stride
add src1, src0, src_stride
add y1, y0, y_stride
add src1, src0, src_stride
0:
cmp header, #0
beq 1f
cmp header, #0
beq 1f
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header
1:
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma
cmp y0, y0_end
blt 1b
cmp y0, y0_end
blt 1b
2:
add y0, y1, y_padding
add y0_end, y1, y_stride
add chroma, chroma, c_padding
add src0, src1, src_padding
add y0, y1, y_padding
add y0_end, y1, y_stride
add chroma, chroma, c_padding
add src0, src1, src_padding
add y1, y0, y_stride
add src1, src0, src_stride
add y1, y0, y_stride
add src1, src0, src_stride
subs height, height, #2
subs height, height, #2
bgt 0b
bgt 0b
epilogue
epilogue
alias_loop_420sp 0
alias_loop_420sp 0
endfunc
.endm
/*
 * downsample: horizontal half of the chroma downsampling for one row.
 * Adds each pair of adjacent 8-bit samples of r8x16 / g8x16 / b8x16 into
 * 16-bit lanes of r16x8 / g16x8 / b16x8, overwriting the previous contents.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro downsample
vpaddl.u8 r16x8, r8x16
vpaddl.u8 g16x8, g8x16
vpaddl.u8 b16x8, b8x16
vpaddl.u8 r16x8, r8x16
vpaddl.u8 g16x8, g8x16
vpaddl.u8 b16x8, b8x16
.endm
/*
 * downsample_ars2: accumulate and right shift by 2.
 * Adds the pairwise sums of the current row's 8-bit samples on top of the
 * 16-bit sums already in r16x8 / g16x8 / b16x8, then applies a rounding
 * right shift by 2. When it follows downsample on the previous row (as in
 * kernel_420_16x2) each lane ends up as the rounded mean of a 2x2 block.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro downsample_ars2
vpadal.u8 r16x8, r8x16
vpadal.u8 g16x8, g8x16
vpadal.u8 b16x8, b8x16
vpadal.u8 r16x8, r8x16
vpadal.u8 g16x8, g8x16
vpadal.u8 b16x8, b8x16
vrshr.u16 r16x8, r16x8, #2
vrshr.u16 g16x8, g16x8, #2
vrshr.u16 b16x8, b16x8, #2
vrshr.u16 r16x8, r16x8, #2
vrshr.u16 g16x8, g16x8, #2
vrshr.u16 b16x8, b16x8, #2
.endm
/*
 * store_y8_16x1: store the 16 luma bytes held in y8x16 at \dst.
 *
 *   \dst    destination pointer register, updated by the macro
 *   \count  optional register; when omitted \dst advances by the 16 bytes
 *           written, otherwise a full 16 bytes are still written but \dst
 *           advances by \count only
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro store_y8_16x1 dst, count
.ifc "\count",""
vstmia \dst!, {y8x16}
vstmia \dst!, {y8x16}
.else
vstmia \dst, {y8x16}
add \dst, \dst, \count
vstmia \dst, {y8x16}
add \dst, \dst, \count
.endif
.endm
/*
 * store_chroma_nv12_8x1: store 8 chroma pairs interleaved as U,V,U,V,...
 * (16 bytes) at \dst. Without \count the pointer advances by the bytes
 * written; with \count it is post-incremented by that register instead.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro store_chroma_nv12_8x1 dst, count
.ifc "\count",""
vst2.i8 {u8x8, v8x8}, [\dst]!
vst2.i8 {u8x8, v8x8}, [\dst]!
.else
vst2.i8 {u8x8, v8x8}, [\dst], \count
vst2.i8 {u8x8, v8x8}, [\dst], \count
.endif
.endm
/*
 * store_chroma_nv21_8x1: store 8 chroma pairs interleaved as V,U,V,U,...
 * (16 bytes) at \dst - the same as the nv12 variant with the two planes
 * swapped. Without \count the pointer advances by the bytes written; with
 * \count it is post-incremented by that register instead.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro store_chroma_nv21_8x1 dst, count
.ifc "\count",""
vst2.i8 {v8x8, u8x8}, [\dst]!
vst2.i8 {v8x8, u8x8}, [\dst]!
.else
vst2.i8 {v8x8, u8x8}, [\dst], \count
vst2.i8 {v8x8, u8x8}, [\dst], \count
.endif
.endm
/*
 * load_8888_16x1: load 16 packed 4-byte pixels from \src and de-interleave
 * them into four 16-byte channel registers.
 *
 *   \a..\d  channel letters in memory order; channel X goes to X8x16
 *   \src    source pointer register, updated by the macro
 *   \count  optional register holding a pixel count
 *
 * Two vld4.8 loads of 8 pixels (32 bytes) each fill the low and high
 * halves. Without \count both loads write back, so \src advances by 64
 * bytes. With \count all 16 pixels are still read, but the pointer is
 * rewound past the first load and advanced by \count * 4 bytes, i.e. by
 * \count pixels.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro load_8888_16x1 a, b, c, d, src, count
.ifc "\count",""
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
.else
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
sub \src, \src, #32
add \src, \src, \count, lsl #2
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
sub \src, \src, #32
add \src, \src, \count, lsl #2
.endif
.endm
/*
 * load_rgbx_16x1: load 16 pixels whose bytes are ordered R, G, B, X in
 * memory; thin wrapper around load_8888_16x1.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro load_rgbx_16x1 src, count
load_8888_16x1 r, g, b, x, \src, \count
load_8888_16x1 r, g, b, x, \src, \count
.endm
/*
 * load_bgrx_16x1: load 16 pixels whose bytes are ordered B, G, R, X in
 * memory; thin wrapper around load_8888_16x1.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro load_bgrx_16x1 src, count
load_8888_16x1 b, g, r, x, \src, \count
load_8888_16x1 b, g, r, x, \src, \count
.endm
/*
 * alias_src_rgbx: bind the source channel register names for R, G, B, X
 * byte order; \set is forwarded to alias_src_8888.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro alias_src_rgbx set=1
alias_src_8888 r, g, b, x, \set
alias_src_8888 r, g, b, x, \set
.endm
/*
 * alias_src_bgrx: bind the source channel register names for B, G, R, X
 * byte order; \set is forwarded to alias_src_8888.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro alias_src_bgrx set=1
alias_src_8888 b, g, r, x, \set
alias_src_8888 b, g, r, x, \set
.endm
/*
 * alias_dst_nv12: name the two halves of c8x8x2 for NV12 output - U in the
 * low half, V in the high half. \set is forwarded to alias.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro alias_dst_nv12 set=1
alias u8x8, c8x8x2_l, \set
alias v8x8, c8x8x2_h, \set
alias u8x8, c8x8x2_l, \set
alias v8x8, c8x8x2_h, \set
.endm
/*
 * alias_dst_nv21: name the two halves of c8x8x2 for NV21 output - V in the
 * low half, U in the high half. \set is forwarded to alias.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro alias_dst_nv21 set=1
alias v8x8, c8x8x2_l, \set
alias u8x8, c8x8x2_h, \set
alias v8x8, c8x8x2_l, \set
alias u8x8, c8x8x2_h, \set
.endm
@@ -259,33 +259,33 @@ alias BIAS_Y, q2
/* q3-q6 R8G8B8X8 x16 */
/*
 * alias_src_8888: bind the four source channel names to q3-q6 in the given
 * order, so \a8x16 = q3, \b8x16 = q4, \c8x16 = q5 and \d8x16 = q6 (plus
 * the _l / _h half names created by alias_qw). \set has no default here
 * and is forwarded to alias_qw.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro alias_src_8888 a, b, c, d, set
alias_qw \a\()8x16, q3, \set
alias_qw \b\()8x16, q4, \set
alias_qw \c\()8x16, q5, \set
alias_qw \d\()8x16, q6, \set
alias_qw \a\()8x16, q3, \set
alias_qw \b\()8x16, q4, \set
alias_qw \c\()8x16, q5, \set
alias_qw \d\()8x16, q6, \set
.endm
/*
 * kernel_420_16x2: convert a block of 16 pixels by 2 rows from packed RGB
 * to luma plus 4:2:0 interleaved chroma.
 *
 *   \rgb_fmt       source format name; selects load_<fmt>_16x1 and
 *                  alias_src_<fmt>
 *   \yuv_fmt       destination format name; selects store_chroma_<fmt>_8x1
 *                  and alias_dst_<fmt>
 *   \rgb0, \rgb1   source pointers for the upper and lower row
 *   \y0, \y1       luma destination pointers for the upper and lower row
 *   \chroma        chroma destination pointer
 *   \count         optional; forwarded to the load and store macros, which
 *                  then advance their pointers by \count pixels instead of
 *                  a full group
 *
 * Sequence: the upper row is loaded, its horizontal pair sums are kept and
 * its luma is stored; the lower row is loaded, its pair sums are added and
 * rounded into 2x2 means, and its luma is stored; finally U and V are
 * computed from the means and stored. The format aliases are set on entry
 * and passed 0 again on exit.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
alias_src_\rgb_fmt
alias_dst_\yuv_fmt
alias_src_\rgb_fmt
alias_dst_\yuv_fmt
load_\rgb_fmt\()_16x1 \rgb0, \count
load_\rgb_fmt\()_16x1 \rgb0, \count
downsample
compute_y_16x1
store_y8_16x1 \y0, \count
downsample
compute_y_16x1
store_y8_16x1 \y0, \count
load_\rgb_fmt\()_16x1 \rgb1, \count
downsample_ars2
compute_y_16x1
store_y8_16x1 \y1, \count
load_\rgb_fmt\()_16x1 \rgb1, \count
downsample_ars2
compute_y_16x1
store_y8_16x1 \y1, \count
compute_chroma_8x1 u, U
compute_chroma_8x1 v, V
compute_chroma_8x1 u, U
compute_chroma_8x1 v, V
store_chroma_\yuv_fmt\()_8x1 \chroma, \count
store_chroma_\yuv_fmt\()_8x1 \chroma, \count
alias_dst_\yuv_fmt 0
alias_src_\rgb_fmt 0
alias_dst_\yuv_fmt 0
alias_src_\rgb_fmt 0
.endm
+155 -155
View File
@@ -23,254 +23,254 @@
/*
 * compute_premult: turn the chroma samples into the per-channel offsets
 * that are later added to luma.
 *
 * In:   q14 = U << 3, q15 = V << 3, q11 = 128 << 3,
 *       d1[0..3] = v2r, u2g, v2g, u2b coefficients
 * Out:  q8 = red offset, q9 = green offset, q10 = blue offset
 * Also writes q5 (scratch), q14 and q15 (chroma with the 128 bias removed).
 *
 * The multiplies are vqdmulh, i.e. saturating doubling multiplies that keep
 * the high half of the product.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro compute_premult
vsub.u16 q14,q11 @ q14 = U * (1 << 3) - 128 * (1 << 3)
vsub.u16 q15,q11 @ q15 = V * (1 << 3) - 128 * (1 << 3)
vqdmulh.s16 q8, q15, d1[0] @ q8 = V * v2r
vqdmulh.s16 q9, q14, d1[1] @ q9 = U * u2g
vqdmulh.s16 q5, q15, d1[2] @ q5 = V * v2g
vadd.s16 q9, q5 @ q9 = U * u2g + V * v2g
vqdmulh.s16 q10,q14, d1[3] @ q10 = U * u2b
vsub.u16 q14,q11 @ q14 = U * (1 << 3) - 128 * (1 << 3)
vsub.u16 q15,q11 @ q15 = V * (1 << 3) - 128 * (1 << 3)
vqdmulh.s16 q8, q15, d1[0] @ q8 = V * v2r
vqdmulh.s16 q9, q14, d1[1] @ q9 = U * u2g
vqdmulh.s16 q5, q15, d1[2] @ q5 = V * v2g
vadd.s16 q9, q5 @ q9 = U * u2g + V * v2g
vqdmulh.s16 q10,q14, d1[3] @ q10 = U * u2b
.endm
/*
 * compute_color: produce one 8-bit colour channel for 16 pixels.
 *
 *   \dst_comp1, \dst_comp2  d registers receiving the two 8-pixel halves
 *   \pre                    q register with the channel's chroma offset
 *
 * Adds \pre to the scaled luma in q14 and q15, then narrows each sum to
 * unsigned 8 bits with a saturating, rounding right shift by 1.
 * Uses q1 and q2 as scratch.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro compute_color dst_comp1 dst_comp2 pre
vadd.s16 q1, q14, \pre
vadd.s16 q2, q15, \pre
vqrshrun.s16 \dst_comp1, q1, #1
vqrshrun.s16 \dst_comp2, q2, #1
vadd.s16 q1, q14, \pre
vadd.s16 q2, q15, \pre
vqrshrun.s16 \dst_comp1, q1, #1
vqrshrun.s16 \dst_comp2, q2, #1
.endm
/*
 * compute_rgba: fill the eight destination d registers for 16 pixels.
 *
 *   \r1 \g1 \b1 \a1  registers for the first 8-pixel half
 *   \r2 \g2 \b2 \a2  registers for the second 8-pixel half
 *
 * Red, green and blue come from compute_color with the offsets in q8, q9
 * and q10; both alpha registers are set to 255. The caller picks the
 * register order to obtain the desired byte layout.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
compute_color \r1, \r2, q8
compute_color \g1, \g2, q9
compute_color \b1, \b2, q10
vmov.u8 \a1, #255
vmov.u8 \a2, #255
compute_color \r1, \r2, q8
compute_color \g1, \g2, q9
compute_color \b1, \b2, q10
vmov.u8 \a1, #255
vmov.u8 \a2, #255
.endm
/*
 * compute: convert 16 luma samples plus the prepared chroma offsets into 16
 * four-byte output pixels and store them at \dst.
 *
 *   \dst   destination pointer register; advanced by both stores
 *   \ofmt  output byte order: argb, rgba, abgr or bgra
 *
 * In:  d14/d15 = luma (de-interleaved by the caller's vld2), q12 = y_offset,
 *      q13 = y_coeff, q8-q10 = chroma offsets from compute_premult
 * Luma is scaled by 8, offset, and multiplied by y_coeff into q14/q15.
 * Exactly one .ifc branch then calls compute_rgba with the register order
 * matching \ofmt, filling d6-d13. The vzip.8 steps re-interleave the two
 * de-interleaved halves into pixel order before the two vst4.8 stores.
 * Overwrites q14 and q15.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro compute dst ofmt
vshll.u8 q14, d14, #3 @ q14 = Y * (1 << 3)
vshll.u8 q15, d15, #3 @ q15 = Y * (1 << 3)
vsub.s16 q14, q12 @ q14 = (Y - y_offset)
vsub.s16 q15, q12 @ q15 = (Y - y_offset)
vqdmulh.s16 q14, q13 @ q14 = (Y - y_offset) * y_coeff
vqdmulh.s16 q15, q13 @ q15 = (Y - y_offset) * y_coeff
vshll.u8 q14, d14, #3 @ q14 = Y * (1 << 3)
vshll.u8 q15, d15, #3 @ q15 = Y * (1 << 3)
vsub.s16 q14, q12 @ q14 = (Y - y_offset)
vsub.s16 q15, q12 @ q15 = (Y - y_offset)
vqdmulh.s16 q14, q13 @ q14 = (Y - y_offset) * y_coeff
vqdmulh.s16 q15, q13 @ q15 = (Y - y_offset) * y_coeff
.ifc \ofmt,argb
compute_rgba d7, d8, d9, d6, d11, d12, d13, d10
compute_rgba d7, d8, d9, d6, d11, d12, d13, d10
.endif
.ifc \ofmt,rgba
compute_rgba d6, d7, d8, d9, d10, d11, d12, d13
compute_rgba d6, d7, d8, d9, d10, d11, d12, d13
.endif
.ifc \ofmt,abgr
compute_rgba d9, d8, d7, d6, d13, d12, d11, d10
compute_rgba d9, d8, d7, d6, d13, d12, d11, d10
.endif
.ifc \ofmt,bgra
compute_rgba d8, d7, d6, d9, d12, d11, d10, d13
compute_rgba d8, d7, d6, d9, d12, d11, d10, d13
.endif
vzip.8 d6, d10 @ d6 = R1R2R3R4R5R6R7R8 d10 = R9R10R11R12R13R14R15R16
vzip.8 d7, d11 @ d7 = G1G2G3G4G5G6G7G8 d11 = G9G10G11G12G13G14G15G16
vzip.8 d8, d12 @ d8 = B1B2B3B4B5B6B7B8 d12 = B9B10B11B12B13B14B15B16
vzip.8 d9, d13 @ d9 = A1A2A3A4A5A6A7A8 d13 = A9A10A11A12A13A14A15A16
vst4.8 {q3, q4}, [\dst]!
vst4.8 {q5, q6}, [\dst]!
vzip.8 d6, d10 @ d6 = R1R2R3R4R5R6R7R8 d10 = R9R10R11R12R13R14R15R16
vzip.8 d7, d11 @ d7 = G1G2G3G4G5G6G7G8 d11 = G9G10G11G12G13G14G15G16
vzip.8 d8, d12 @ d8 = B1B2B3B4B5B6B7B8 d12 = B9B10B11B12B13B14B15B16
vzip.8 d9, d13 @ d9 = A1A2A3A4A5A6A7A8 d13 = A9A10A11A12A13A14A15A16
vst4.8 {q3, q4}, [\dst]!
vst4.8 {q5, q6}, [\dst]!
.endm
/*
 * process_1l_internal: convert 16 pixels of one line.
 * Loads 16 luma bytes from \src (post-incremented), splitting even and odd
 * samples into d14 and d15, then runs compute to write the pixels at \dst.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro process_1l_internal dst src ofmt
vld2.8 {d14, d15}, [\src]! @ q7 = Y (interleaved)
compute \dst, \ofmt
vld2.8 {d14, d15}, [\src]! @ q7 = Y (interleaved)
compute \dst, \ofmt
.endm
/*
 * process_1l: convert 16 pixels of a single line - prepare the chroma
 * offsets, then process luma from r4 into the destination at r2.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro process_1l ofmt
compute_premult
process_1l_internal r2, r4, \ofmt
compute_premult
process_1l_internal r2, r4, \ofmt
.endm
/*
 * process_2l: convert 16 pixels on each of two lines that share the same
 * chroma. The chroma offsets are prepared once, then applied to the first
 * line (luma r4 -> dst r2) and to the second line (luma r12 -> dst r11).
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro process_2l ofmt
compute_premult
process_1l_internal r2, r4, \ofmt
process_1l_internal r11,r12,\ofmt
compute_premult
process_1l_internal r2, r4, \ofmt
process_1l_internal r11,r12,\ofmt
.endm
/*
 * load_args_nv12: function prologue and argument setup for the NV12 input
 * variant.
 *
 * Saves r4-r12/lr and q4-q7 (104 bytes, hence the stack arguments starting
 * at sp + 104) and loads the stack arguments. On exit:
 *   r0 = width and r2 = dst (untouched register arguments)
 *   r3 = dst padding for two lines, r4 = srcY, r5 = luma padding for two
 *   lines, r6 = srcC, r7 = chroma padding, r9 = y_offset,
 *   r11 = second destination line, r12 = second luma line,
 *   d0 = y_coeff in every 16-bit lane, d1 = four 16-bit table entries
 * r8 and r10 are only used as temporaries for the table pointer and y_coeff.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro load_args_nv12
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcC
ldr r7, [sp, #116] @ r7 = linesizeC
ldr r8, [sp, #120] @ r8 = table
ldr r9, [sp, #124] @ r9 = y_offset
ldr r10,[sp, #128] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
add r11, r2, r3 @ r11 = dst + linesize (dst2)
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
lsl r3, r3, #1
lsl r5, r5, #1
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
sub r7, r7, r0 @ r7 = linesizeC - width (paddingC)
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcC
ldr r7, [sp, #116] @ r7 = linesizeC
ldr r8, [sp, #120] @ r8 = table
ldr r9, [sp, #124] @ r9 = y_offset
ldr r10,[sp, #128] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
add r11, r2, r3 @ r11 = dst + linesize (dst2)
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
lsl r3, r3, #1
lsl r5, r5, #1
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
sub r7, r7, r0 @ r7 = linesizeC - width (paddingC)
.endm
/*
 * load_args_nv21: NV21 takes the same arguments as NV12, so this simply
 * expands load_args_nv12.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro load_args_nv21
load_args_nv12
load_args_nv12
.endm
/*
 * load_args_yuv420p: function prologue and argument setup for the planar
 * 4:2:0 input variant.
 *
 * Saves r4-r12/lr and q4-q7 (104 bytes) and loads the stack arguments.
 * On exit:
 *   r3 = dst padding for two lines, r4 = srcY, r5 = luma padding for two
 *   lines, r6 = srcU, r10 = srcV, r9 = y_offset,
 *   r11 = second destination line, r12 = second luma line,
 *   d0 = y_coeff in every 16-bit lane, d1 = four 16-bit table entries
 * r10 first carries y_coeff and is reloaded with srcV on the last line,
 * after the vdup has consumed it. The two chroma line sizes (sp + 116 and
 * sp + 124) are not loaded here; increment_and_test_yuv420p reads them
 * from the stack each time.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro load_args_yuv420p
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcU
ldr r8, [sp, #128] @ r8 = table
ldr r9, [sp, #132] @ r9 = y_offset
ldr r10,[sp, #136] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
add r11, r2, r3 @ r11 = dst + linesize (dst2)
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
lsl r3, r3, #1
lsl r5, r5, #1
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
ldr r10,[sp, #120] @ r10 = srcV
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcU
ldr r8, [sp, #128] @ r8 = table
ldr r9, [sp, #132] @ r9 = y_offset
ldr r10,[sp, #136] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
add r11, r2, r3 @ r11 = dst + linesize (dst2)
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
lsl r3, r3, #1
lsl r5, r5, #1
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
ldr r10,[sp, #120] @ r10 = srcV
.endm
/*
 * load_args_yuv422p: function prologue and argument setup for the planar
 * 4:2:2 input variant, which is processed one line at a time.
 *
 * Saves r4-r12/lr and q4-q7 (104 bytes) and loads the stack arguments.
 * On exit:
 *   r3 = dst padding, r4 = srcY, r5 = luma padding, r6 = srcU,
 *   r7 = U padding, r10 = srcV, r12 = V padding, r9 = y_offset,
 *   d0 = y_coeff in every 16-bit lane, d1 = four 16-bit table entries
 * Unlike the two-line variants no second-line pointers are prepared and the
 * line sizes are not doubled. r10 first carries y_coeff and is reloaded with
 * srcV on the last line.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro load_args_yuv422p
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcU
ldr r7, [sp, #116] @ r7 = linesizeU
ldr r12,[sp, #124] @ r12 = linesizeV
ldr r8, [sp, #128] @ r8 = table
ldr r9, [sp, #132] @ r9 = y_offset
ldr r10,[sp, #136] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
sub r3, r3, r0, lsl #2 @ r3 = linesize - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY - width (paddingY)
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
sub r12,r12,r0, lsr #1 @ r12 = linesizeV - width / 2 (paddingV)
ldr r10,[sp, #120] @ r10 = srcV
push {r4-r12, lr}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r5, [sp, #108] @ r5 = linesizeY
ldr r6, [sp, #112] @ r6 = srcU
ldr r7, [sp, #116] @ r7 = linesizeU
ldr r12,[sp, #124] @ r12 = linesizeV
ldr r8, [sp, #128] @ r8 = table
ldr r9, [sp, #132] @ r9 = y_offset
ldr r10,[sp, #136] @ r10 = y_coeff
vdup.16 d0, r10 @ d0 = y_coeff
vld1.16 {d1}, [r8] @ d1 = *table
sub r3, r3, r0, lsl #2 @ r3 = linesize - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY - width (paddingY)
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
sub r12,r12,r0, lsr #1 @ r12 = linesizeV - width / 2 (paddingV)
ldr r10,[sp, #120] @ r10 = srcV
.endm
/*
 * load_chroma_nv12: fetch 8 interleaved chroma pairs from r6
 * (post-incremented by 16 bytes) and widen them with a left shift by 3:
 * the first byte of each pair becomes U in q14, the second V in q15.
 * Also issues a prefetch hint 192 bytes ahead of r12.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro load_chroma_nv12
pld [r12, #64*3]
pld [r12, #64*3]
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
.endm
/*
 * load_chroma_nv21: same as load_chroma_nv12 but with the byte order of
 * each pair swapped - the second byte becomes U in q14 and the first
 * becomes V in q15. r6 is post-incremented by 16 bytes; a prefetch hint is
 * issued 192 bytes ahead of r12.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro load_chroma_nv21
pld [r12, #64*3]
pld [r12, #64*3]
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
vshll.u8 q14, d3, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d2, #3 @ q15 = V * (1 << 3)
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
vshll.u8 q14, d3, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d2, #3 @ q15 = V * (1 << 3)
.endm
/*
 * load_chroma_yuv420p: fetch 8 chroma samples from each of the two planes
 * (r6 and r10, both post-incremented by 8 bytes) and widen them with a
 * left shift by 3 into q14 and q15. Prefetch hints are issued 192 bytes
 * ahead of r10 and r12.
 * NOTE(review): the inline "red"/"blue" labels on the two loads look
 * swapped relative to the U/V names used on the following lines (d2 feeds
 * the register documented as U) - confirm and fix the wording upstream.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro load_chroma_yuv420p
pld [r10, #64*3]
pld [r12, #64*3]
pld [r10, #64*3]
pld [r12, #64*3]
vld1.8 d2, [r6]! @ d2: chroma red line
vld1.8 d3, [r10]! @ d3: chroma blue line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
vld1.8 d2, [r6]! @ d2: chroma red line
vld1.8 d3, [r10]! @ d3: chroma blue line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
.endm
/*
 * load_chroma_yuv422p: fetch 8 chroma samples from each of the two planes
 * (r6 and r10, both post-incremented by 8 bytes) and widen them with a
 * left shift by 3 into q14 and q15. Only r10 gets a prefetch hint here;
 * in this variant r12 holds a padding value rather than a pointer.
 * NOTE(review): the inline "red"/"blue" labels on the two loads look
 * swapped relative to the U/V names used on the following lines - confirm.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro load_chroma_yuv422p
pld [r10, #64*3]
pld [r10, #64*3]
vld1.8 d2, [r6]! @ d2: chroma red line
vld1.8 d3, [r10]! @ d3: chroma blue line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
vld1.8 d2, [r6]! @ d2: chroma red line
vld1.8 d3, [r10]! @ d3: chroma blue line
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
.endm
/*
 * increment_and_test_nv12: end-of-row-pair bookkeeping for NV12.
 * Skips the padding on the second destination line, the second luma line
 * and the chroma line, then decrements the height in r1 by 2. The flags
 * set by the final subs are consumed by the branch that follows the macro.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro increment_and_test_nv12
add r11, r11, r3 @ dst2 += padding
add r12, r12, r5 @ srcY2 += paddingY
add r6, r6, r7 @ srcC += paddingC
subs r1, r1, #2 @ height -= 2
add r11, r11, r3 @ dst2 += padding
add r12, r12, r5 @ srcY2 += paddingY
add r6, r6, r7 @ srcC += paddingC
subs r1, r1, #2 @ height -= 2
.endm
/*
 * increment_and_test_nv21: identical bookkeeping to NV12; expands
 * increment_and_test_nv12.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro increment_and_test_nv21
increment_and_test_nv12
increment_and_test_nv12
.endm
/*
 * increment_and_test_yuv420p: end-of-row-pair bookkeeping for planar 4:2:0.
 * Skips the padding on the second destination line and second luma line,
 * then advances srcU (r6) and srcV (r10) past their padding. The two chroma
 * line sizes are re-read from the stack on every call and r7 is used as the
 * scratch register for both padding values. Finally the height in r1 is
 * decremented by 2; the resulting flags are consumed by the branch that
 * follows the macro.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro increment_and_test_yuv420p
add r11, r11, r3 @ dst2 += padding
add r12, r12, r5 @ srcY2 += paddingY
ldr r7, [sp, #116] @ r7 = linesizeU
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
add r6, r6, r7 @ srcU += paddingU
ldr r7, [sp, #124] @ r7 = linesizeV
sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV)
add r10, r10, r7 @ srcV += paddingV
subs r1, r1, #2 @ height -= 2
add r11, r11, r3 @ dst2 += padding
add r12, r12, r5 @ srcY2 += paddingY
ldr r7, [sp, #116] @ r7 = linesizeU
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
add r6, r6, r7 @ srcU += paddingU
ldr r7, [sp, #124] @ r7 = linesizeV
sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV)
add r10, r10, r7 @ srcV += paddingV
subs r1, r1, #2 @ height -= 2
.endm
/*
 * increment_and_test_yuv422p: end-of-row bookkeeping for planar 4:2:2.
 * Advances srcU (r6) and srcV (r10) past their precomputed padding (r7 and
 * r12) and decrements the height in r1 by 1, since this variant handles a
 * single line per pass. The flags are consumed by the branch that follows.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro increment_and_test_yuv422p
add r6, r6, r7 @ srcU += paddingU
add r10,r10,r12 @ srcV += paddingV
subs r1, r1, #1 @ height -= 1
add r6, r6, r7 @ srcU += paddingU
add r10,r10,r12 @ srcV += paddingV
subs r1, r1, #1 @ height -= 1
.endm
/*
 * process_nv12: NV12 shares one chroma line between two luma lines, so it
 * uses the two-line path.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro process_nv12 ofmt
process_2l \ofmt
process_2l \ofmt
.endm
/*
 * process_nv21: NV21 uses the two-line path, exactly like NV12.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro process_nv21 ofmt
process_2l \ofmt
process_2l \ofmt
.endm
/*
 * process_yuv420p: planar 4:2:0 uses the two-line path.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro process_yuv420p ofmt
process_2l \ofmt
process_2l \ofmt
.endm
/*
 * process_yuv422p: planar 4:2:2 has its own chroma for every line, so it
 * uses the single-line path.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro process_yuv422p ofmt
process_1l \ofmt
process_1l \ofmt
.endm
/*
 * declare_func: emit the exported function ff_\ifmt\()_to_\ofmt\()_neon.
 *
 *   \ifmt  input format; selects the load_args_, load_chroma_, process_
 *          and increment_and_test_ macro variants
 *   \ofmt  output byte order, forwarded to process_\ifmt
 *
 * Constants set up once: q11 = 128 << 3 (chroma bias), q12 = y_offset,
 * q13 = y_coeff (both halves copied from d0).
 * Loop structure: label 1 starts a row (or row pair) and reloads the
 * remaining width into r8; label 2 converts 16 pixels per iteration. After
 * a row the destination (r2) and luma (r4) pointers skip their padding and
 * increment_and_test_\ifmt updates the rest and sets the flags for the
 * outer branch.
 * NOTE(review): the inner loop steps by 16, so width is presumably a
 * multiple of 16 - confirm against the caller.
 * NOTE(review): "vmov.i8 d10, #128" looks like a dead store: d10 is half of
 * q5, which compute_premult overwrites before anything reads it - confirm
 * whether it can be dropped.
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
load_args_\ifmt
vmov.u16 q11, #1024 @ q11 = 128 * (1 << 3)
vdup.16 q12, r9 @ q12 = y_offset
vmov d26, d0 @ q13 = y_coeff
vmov d27, d0 @ q13 = y_coeff
load_args_\ifmt
vmov.u16 q11, #1024 @ q11 = 128 * (1 << 3)
vdup.16 q12, r9 @ q12 = y_offset
vmov d26, d0 @ q13 = y_coeff
vmov d27, d0 @ q13 = y_coeff
1:
mov r8, r0 @ r8 = width
mov r8, r0 @ r8 = width
2:
pld [r6, #64*3]
pld [r4, #64*3]
vmov.i8 d10, #128
load_chroma_\ifmt
process_\ifmt \ofmt
subs r8, r8, #16 @ width -= 16
bgt 2b
add r2, r2, r3 @ dst += padding
add r4, r4, r5 @ srcY += paddingY
increment_and_test_\ifmt
bgt 1b
vpop {q4-q7}
pop {r4-r12, pc}
pld [r6, #64*3]
pld [r4, #64*3]
vmov.i8 d10, #128
load_chroma_\ifmt
process_\ifmt \ofmt
subs r8, r8, #16 @ width -= 16
bgt 2b
add r2, r2, r3 @ dst += padding
add r4, r4, r5 @ srcY += paddingY
increment_and_test_\ifmt
bgt 1b
vpop {q4-q7}
pop {r4-r12, pc}
endfunc
.endm
/*
 * declare_rgb_funcs: instantiate declare_func for input format \ifmt once
 * per supported output byte order (argb, rgba, abgr, bgra).
 *
 * Diff view: every changed line is listed twice (old indentation, then new).
 */
.macro declare_rgb_funcs ifmt
declare_func \ifmt, argb
declare_func \ifmt, rgba
declare_func \ifmt, abgr
declare_func \ifmt, bgra
declare_func \ifmt, argb
declare_func \ifmt, rgba
declare_func \ifmt, abgr
declare_func \ifmt, bgra
.endm
declare_rgb_funcs nv12