mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2026-06-11 08:13:06 +00:00
libswscale/arm: Switch consistent indentation to common style
Some of these files aligned instructions to 4/24 columns, while we commonly indent arm/aarch64 assembly to 8/24 columns. Some of these files also used a different alignment for the operands.
This commit is contained in:
+44
-44
@@ -22,48 +22,48 @@
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ ff_hscale_8_to_15_neon: horizontal scaling of 8-bit source samples; each
@ 32-bit sum is narrowed to 16 bits with a saturating shift right by 7.
@ Register/stack usage as read from the code below:
@   r1 = destination pointer, r2 = dstW, r3 = source pointer
@   [sp, #104] = filter, [sp, #108] = filterPos, [sp, #112] = filterSize
@   (104 = 10 pushed core registers * 4 + 4 pushed q registers * 16)
@ The incoming r0 is never read; r0 is reused as the running source pointer.
@ Loop 1 produces two outputs per pass and loop 2 consumes 8 filter taps per
@ pass, so this presumably expects dstW to be a multiple of 2 and filterSize
@ a multiple of 8 -- TODO confirm against the caller.
@ NOTE(review): this listing appears to be a rendered diff of a reindent-only
@ commit, so the body is shown twice (removed lines, then the re-added lines).
function ff_hscale_8_to_15_neon, export=1
|
||||
push {r4-r12, lr}
|
||||
vpush {q4-q7}
|
||||
ldr r4, [sp, #104] @ filter
|
||||
ldr r5, [sp, #108] @ filterPos
|
||||
ldr r6, [sp, #112] @ filterSize
|
||||
add r10, r4, r6, lsl #1 @ filter2 = filter + filterSize * 2
|
||||
1: ldr r8, [r5], #4 @ filterPos[0]
|
||||
ldr r9, [r5], #4 @ filterPos[1]
|
||||
vmov.s32 q4, #0 @ val accumulator
|
||||
vmov.s32 q5, #0 @ val accumulator
|
||||
mov r7, r6 @ tmpfilterSize = filterSize
|
||||
mov r0, r3 @ srcp
|
||||
2: add r11, r0, r8 @ srcp + filterPos[0]
|
||||
add r12, r0, r9 @ srcp + filterPos[1]
|
||||
vld1.8 d0, [r11] @ srcp[filterPos[0] + {0..7}]
|
||||
vld1.8 d2, [r12] @ srcp[filterPos[1] + {0..7}]
|
||||
vld1.16 {q2}, [r4]! @ load 8x16-bit filter values
|
||||
vld1.16 {q3}, [r10]! @ load 8x16-bit filter values
|
||||
vmovl.u8 q0, d0 @ unpack src values to 16-bit
|
||||
vmovl.u8 q1, d2 @ unpack src values to 16-bit
|
||||
vmull.s16 q8, d0, d4 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 1)
|
||||
vmull.s16 q9, d1, d5 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 2)
|
||||
vmull.s16 q10, d2, d6 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 1)
|
||||
vmull.s16 q11, d3, d7 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 2)
|
||||
vpadd.s32 d16, d16, d17 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
|
||||
vpadd.s32 d17, d18, d19 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
|
||||
vpadd.s32 d20, d20, d21 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
|
||||
vpadd.s32 d21, d22, d23 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
|
||||
vadd.s32 q4, q8 @ update val accumulator
|
||||
vadd.s32 q5, q10 @ update val accumulator
|
||||
add r0, #8 @ srcp += 8
|
||||
subs r7, #8 @ tmpfilterSize -= 8
|
||||
bgt 2b @ loop until tmpfilterSize is consumed
|
||||
mov r4, r10 @ filter = filter2
|
||||
add r10, r10, r6, lsl #1 @ filter2 += filterSize * 2
|
||||
vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 1)
|
||||
vpadd.s32 d9, d10, d11 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 2)
|
||||
vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 4x32-bit sums into 2x32-bit
|
||||
vqshrn.s32 d8, q4, #7 @ shift and clip the 2x16-bit final values
|
||||
vst1.32 {d8[0]},[r1]! @ write destination
|
||||
subs r2, #2 @ dstW -= 2
|
||||
bgt 1b @ loop until end of line
|
||||
vpop {q4-q7}
|
||||
pop {r4-r12, pc}
|
||||
push {r4-r12, lr}
|
||||
vpush {q4-q7}
|
||||
ldr r4, [sp, #104] @ filter
|
||||
ldr r5, [sp, #108] @ filterPos
|
||||
ldr r6, [sp, #112] @ filterSize
|
||||
add r10, r4, r6, lsl #1 @ filter2 = filter + filterSize * 2
|
||||
1: ldr r8, [r5], #4 @ filterPos[0]
|
||||
ldr r9, [r5], #4 @ filterPos[1]
|
||||
vmov.s32 q4, #0 @ val accumulator
|
||||
vmov.s32 q5, #0 @ val accumulator
|
||||
mov r7, r6 @ tmpfilterSize = filterSize
|
||||
mov r0, r3 @ srcp
|
||||
2: add r11, r0, r8 @ srcp + filterPos[0]
|
||||
add r12, r0, r9 @ srcp + filterPos[1]
|
||||
vld1.8 d0, [r11] @ srcp[filterPos[0] + {0..7}]
|
||||
vld1.8 d2, [r12] @ srcp[filterPos[1] + {0..7}]
|
||||
vld1.16 {q2}, [r4]! @ load 8x16-bit filter values
|
||||
vld1.16 {q3}, [r10]! @ load 8x16-bit filter values
|
||||
vmovl.u8 q0, d0 @ unpack src values to 16-bit
|
||||
vmovl.u8 q1, d2 @ unpack src values to 16-bit
|
||||
vmull.s16 q8, d0, d4 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 1)
|
||||
vmull.s16 q9, d1, d5 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 2)
|
||||
vmull.s16 q10, d2, d6 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 1)
|
||||
vmull.s16 q11, d3, d7 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 2)
|
||||
vpadd.s32 d16, d16, d17 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
|
||||
vpadd.s32 d17, d18, d19 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
|
||||
vpadd.s32 d20, d20, d21 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
|
||||
vpadd.s32 d21, d22, d23 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
|
||||
vadd.s32 q4, q8 @ update val accumulator
|
||||
vadd.s32 q5, q10 @ update val accumulator
|
||||
add r0, #8 @ srcp += 8
|
||||
subs r7, #8 @ tmpfilterSize -= 8
|
||||
bgt 2b @ loop until tmpfilterSize is consumed
|
||||
mov r4, r10 @ filter = filter2
|
||||
add r10, r10, r6, lsl #1 @ filter2 += filterSize * 2
|
||||
vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 1)
|
||||
vpadd.s32 d9, d10, d11 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 2)
|
||||
vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 4x32-bit sums into 2x32-bit
|
||||
vqshrn.s32 d8, q4, #7 @ shift and clip the 2x16-bit final values
|
||||
vst1.32 {d8[0]},[r1]! @ write destination
|
||||
subs r2, #2 @ dstW -= 2
|
||||
bgt 1b @ loop until end of line
|
||||
vpop {q4-q7}
|
||||
pop {r4-r12, pc}
|
||||
endfunc
|
||||
|
||||
+52
-52
@@ -22,56 +22,56 @@
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ ff_yuv2planeX_8_neon: vertical filter. Sums filterSize source rows of 16-bit
@ samples weighted by 16-bit coefficients, adds dither, and writes 8-bit output.
@ Register/stack usage as read from the code below:
@   r0 = filter, r1 = filterSize, r2 = src (array of row pointers), r3 = dest
@   [sp, #104] = dstW, [sp, #108] = dither, [sp, #112] = offset
@   (104 = 10 pushed core registers * 4 + 4 pushed q registers * 16)
@ Loop 2 produces 8 output bytes per pass, starting each accumulator from
@ dither << 12 (the 8 dither bytes are rotated by 3 when offset is non-zero).
@ Loop 3 consumes two rows and two coefficients per pass. The sum is shifted
@ right by 19, then saturated to unsigned 16-bit and to unsigned 8-bit.
@ Presumably expects filterSize to be even and dstW a multiple of 8
@ -- TODO confirm against the caller.
@ NOTE(review): this listing appears to be a rendered diff of a reindent-only
@ commit, so the body is shown twice (removed lines, then the re-added lines).
function ff_yuv2planeX_8_neon, export=1
|
||||
push {r4-r12, lr}
|
||||
vpush {q4-q7}
|
||||
ldr r4, [sp, #104] @ dstW
|
||||
ldr r5, [sp, #108] @ dither
|
||||
ldr r6, [sp, #112] @ offset
|
||||
vld1.8 {d0}, [r5] @ load 8x8-bit dither values
|
||||
cmp r6, #0 @ check offsetting which can be 0 or 3 only
|
||||
beq 1f
|
||||
vext.u8 d0, d0, d0, #3 @ honor offsetting which can be 3 only
|
||||
1: vmovl.u8 q0, d0 @ extend dither to 16-bit
|
||||
vshll.u16 q1, d0, #12 @ extend dither to 32-bit with left shift by 12 (part 1)
|
||||
vshll.u16 q2, d1, #12 @ extend dither to 32-bit with left shift by 12 (part 2)
|
||||
mov r7, #0 @ i = 0
|
||||
2: vmov.u8 q3, q1 @ initialize accumulator with dithering values (part 1)
|
||||
vmov.u8 q4, q2 @ initialize accumulator with dithering values (part 2)
|
||||
mov r8, r1 @ tmpFilterSize = filterSize
|
||||
mov r9, r2 @ srcp
|
||||
mov r10, r0 @ filterp
|
||||
3: ldr r11, [r9], #4 @ get pointer @ src[j]
|
||||
ldr r12, [r9], #4 @ get pointer @ src[j+1]
|
||||
add r11, r11, r7, lsl #1 @ &src[j][i]
|
||||
add r12, r12, r7, lsl #1 @ &src[j+1][i]
|
||||
vld1.16 {q5}, [r11] @ read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
|
||||
vld1.16 {q6}, [r12] @ read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
|
||||
ldr r11, [r10], #4 @ read 2x16-bit coeffs (X, Y) at (filter[j], filter[j+1])
|
||||
vmov.16 q7, q5 @ copy 8x16-bit @ src[j ][i + {0..7}] for following inplace zip instruction
|
||||
vmov.16 q8, q6 @ copy 8x16-bit @ src[j+1][i + {0..7}] for following inplace zip instruction
|
||||
vzip.16 q7, q8 @ A,I,B,J,C,K,D,L,E,M,F,N,G,O,H,P
|
||||
vdup.32 q15, r11 @ X,Y,X,Y,X,Y,X,Y
|
||||
vmull.s16 q9, d14, d30 @ A*X,I*Y,B*X,J*Y
|
||||
vmull.s16 q10, d15, d31 @ C*X,K*Y,D*X,L*Y
|
||||
vmull.s16 q11, d16, d30 @ E*X,M*Y,F*X,N*Y
|
||||
vmull.s16 q12, d17, d31 @ G*X,O*Y,H*X,P*Y
|
||||
vpadd.s32 d10, d18, d19 @ A*X+I*Y,B*X+J*Y
|
||||
vpadd.s32 d11, d20, d21 @ C*X+K*Y,D*X+L*Y
|
||||
vpadd.s32 d12, d22, d23 @ E*X+M*Y,F*X+N*Y
|
||||
vpadd.s32 d13, d24, d25 @ G*X+O*Y,H*X+P*Y
|
||||
vadd.s32 q3, q5 @ update val accumulator (part 1)
|
||||
vadd.s32 q4, q6 @ update val accumulator (part 2)
|
||||
subs r8, #2 @ tmpFilterSize -= 2
|
||||
bgt 3b @ loop until filterSize is consumed
|
||||
vshr.s32 q3, q3, #19 @ val>>19 (part 1)
|
||||
vshr.s32 q4, q4, #19 @ val>>19 (part 2)
|
||||
vqmovun.s32 d6, q3 @ clip16(val>>19) (part 1)
|
||||
vqmovun.s32 d7, q4 @ clip16(val>>19) (part 2)
|
||||
vqmovn.u16 d6, q3 @ merge part 1 and part 2
|
||||
vst1.8 {d6}, [r3]! @ write destination
|
||||
add r7, #8 @ i += 8
|
||||
subs r4, r4, #8 @ dstW -= 8
|
||||
bgt 2b @ loop until width is consumed
|
||||
vpop {q4-q7}
|
||||
pop {r4-r12, pc}
|
||||
push {r4-r12, lr}
|
||||
vpush {q4-q7}
|
||||
ldr r4, [sp, #104] @ dstW
|
||||
ldr r5, [sp, #108] @ dither
|
||||
ldr r6, [sp, #112] @ offset
|
||||
vld1.8 {d0}, [r5] @ load 8x8-bit dither values
|
||||
cmp r6, #0 @ check offsetting which can be 0 or 3 only
|
||||
beq 1f
|
||||
vext.u8 d0, d0, d0, #3 @ honor offsetting which can be 3 only
|
||||
1: vmovl.u8 q0, d0 @ extend dither to 16-bit
|
||||
vshll.u16 q1, d0, #12 @ extend dither to 32-bit with left shift by 12 (part 1)
|
||||
vshll.u16 q2, d1, #12 @ extend dither to 32-bit with left shift by 12 (part 2)
|
||||
mov r7, #0 @ i = 0
|
||||
2: vmov.u8 q3, q1 @ initialize accumulator with dithering values (part 1)
|
||||
vmov.u8 q4, q2 @ initialize accumulator with dithering values (part 2)
|
||||
mov r8, r1 @ tmpFilterSize = filterSize
|
||||
mov r9, r2 @ srcp
|
||||
mov r10, r0 @ filterp
|
||||
3: ldr r11, [r9], #4 @ get pointer @ src[j]
|
||||
ldr r12, [r9], #4 @ get pointer @ src[j+1]
|
||||
add r11, r11, r7, lsl #1 @ &src[j][i]
|
||||
add r12, r12, r7, lsl #1 @ &src[j+1][i]
|
||||
vld1.16 {q5}, [r11] @ read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
|
||||
vld1.16 {q6}, [r12] @ read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
|
||||
ldr r11, [r10], #4 @ read 2x16-bit coeffs (X, Y) at (filter[j], filter[j+1])
|
||||
vmov.16 q7, q5 @ copy 8x16-bit @ src[j ][i + {0..7}] for following inplace zip instruction
|
||||
vmov.16 q8, q6 @ copy 8x16-bit @ src[j+1][i + {0..7}] for following inplace zip instruction
|
||||
vzip.16 q7, q8 @ A,I,B,J,C,K,D,L,E,M,F,N,G,O,H,P
|
||||
vdup.32 q15, r11 @ X,Y,X,Y,X,Y,X,Y
|
||||
vmull.s16 q9, d14, d30 @ A*X,I*Y,B*X,J*Y
|
||||
vmull.s16 q10, d15, d31 @ C*X,K*Y,D*X,L*Y
|
||||
vmull.s16 q11, d16, d30 @ E*X,M*Y,F*X,N*Y
|
||||
vmull.s16 q12, d17, d31 @ G*X,O*Y,H*X,P*Y
|
||||
vpadd.s32 d10, d18, d19 @ A*X+I*Y,B*X+J*Y
|
||||
vpadd.s32 d11, d20, d21 @ C*X+K*Y,D*X+L*Y
|
||||
vpadd.s32 d12, d22, d23 @ E*X+M*Y,F*X+N*Y
|
||||
vpadd.s32 d13, d24, d25 @ G*X+O*Y,H*X+P*Y
|
||||
vadd.s32 q3, q5 @ update val accumulator (part 1)
|
||||
vadd.s32 q4, q6 @ update val accumulator (part 2)
|
||||
subs r8, #2 @ tmpFilterSize -= 2
|
||||
bgt 3b @ loop until filterSize is consumed
|
||||
vshr.s32 q3, q3, #19 @ val>>19 (part 1)
|
||||
vshr.s32 q4, q4, #19 @ val>>19 (part 2)
|
||||
vqmovun.s32 d6, q3 @ clip16(val>>19) (part 1)
|
||||
vqmovun.s32 d7, q4 @ clip16(val>>19) (part 2)
|
||||
vqmovn.u16 d6, q3 @ merge part 1 and part 2
|
||||
vst1.8 {d6}, [r3]! @ write destination
|
||||
add r7, #8 @ i += 8
|
||||
subs r4, r4, #8 @ dstW -= 8
|
||||
bgt 2b @ loop until width is consumed
|
||||
vpop {q4-q7}
|
||||
pop {r4-r12, pc}
|
||||
endfunc
|
||||
|
||||
@@ -36,34 +36,34 @@ alias y16x16_h, q14
|
||||
alias_qw y8x16, q15
|
||||
|
||||
/*
 * init src: load the 32-bit coefficient table at src and set the bias bytes.
 * The two vld3 loads de-interleave nine 32-bit words into q13/q14/q15
 * (words 0,3,6 / 1,4,7 / 2,5,8, assuming qN_l/qN_h name the low/high halves
 * of qN; lane 3 of each register is not loaded). Each register is then
 * narrowed to 16-bit with a rounding shift right by 7 into CO_R/CO_G/CO_B.
 * BIAS_Y is filled with 16 and BIAS_U with 128 in every byte.
 * src is advanced past the first six words only (post-increment on load 1).
 * CO_* and BIAS_* are register aliases defined outside this hunk.
 */
.macro init src
|
||||
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
|
||||
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
|
||||
vrshrn.i32 CO_R, q13, #7
|
||||
vrshrn.i32 CO_G, q14, #7
|
||||
vrshrn.i32 CO_B, q15, #7
|
||||
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
|
||||
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
|
||||
vrshrn.i32 CO_R, q13, #7
|
||||
vrshrn.i32 CO_G, q14, #7
|
||||
vrshrn.i32 CO_B, q15, #7
|
||||
|
||||
vmov.u8 BIAS_Y, #16
|
||||
vmov.u8 BIAS_U, #128
|
||||
vmov.u8 BIAS_Y, #16
|
||||
vmov.u8 BIAS_U, #128
|
||||
.endm
|
||||
|
||||
|
||||
/*
 * compute_y_16x1_step action, s8x16, coeff: one colour-channel term of the
 * luma sum for 16 pixels using 16-bit intermediates.
 *   action - multiply instruction to apply (compute_y_16x1 passes vmul for
 *            the first term and vmla for the accumulating terms)
 *   s8x16  - alias prefix of the 16x8-bit source channel (its _l/_h halves)
 *   coeff  - coefficient operand for that channel
 * The source bytes are zero-extended to 16-bit in n16x16_l/_h, then
 * combined with coeff into y16x16_l/_h.
 */
.macro compute_y_16x1_step action, s8x16, coeff
|
||||
vmovl.u8 n16x16_l, \s8x16\()_l
|
||||
vmovl.u8 n16x16_h, \s8x16\()_h
|
||||
vmovl.u8 n16x16_l, \s8x16\()_l
|
||||
vmovl.u8 n16x16_h, \s8x16\()_h
|
||||
|
||||
\action y16x16_l, n16x16_l, \coeff
|
||||
\action y16x16_h, n16x16_h, \coeff
|
||||
\action y16x16_l, n16x16_l, \coeff
|
||||
\action y16x16_h, n16x16_h, \coeff
|
||||
.endm
|
||||
|
||||
/*
 * compute_y_16x1: luma for 16 pixels with 16-bit intermediates.
 * y16x16 = r8x16 * CO_RY + g8x16 * CO_GY + b8x16 * CO_BY (vmul, then vmla
 * twice), narrowed to bytes with a rounding shift right by 8, then BIAS_Y is
 * added to every byte. The result is left in y8x16.
 */
.macro compute_y_16x1
|
||||
compute_y_16x1_step vmul, r8x16, CO_RY
|
||||
compute_y_16x1_step vmla, g8x16, CO_GY
|
||||
compute_y_16x1_step vmla, b8x16, CO_BY
|
||||
compute_y_16x1_step vmul, r8x16, CO_RY
|
||||
compute_y_16x1_step vmla, g8x16, CO_GY
|
||||
compute_y_16x1_step vmla, b8x16, CO_BY
|
||||
|
||||
vrshrn.i16 y8x16_l, y16x16_l, #8
|
||||
vrshrn.i16 y8x16_h, y16x16_h, #8
|
||||
vrshrn.i16 y8x16_l, y16x16_l, #8
|
||||
vrshrn.i16 y8x16_h, y16x16_h, #8
|
||||
|
||||
vadd.u8 y8x16, y8x16, BIAS_Y
|
||||
vadd.u8 y8x16, y8x16, BIAS_Y
|
||||
.endm
|
||||
|
||||
alias c16x8, q15
|
||||
@@ -71,13 +71,13 @@ alias_qw c8x8x2, q10
|
||||
|
||||
|
||||
/*
 * compute_chroma_8x1 c, C: one chroma plane for 8 samples with 16-bit
 * intermediates.
 *   c - lower-case plane letter; selects the destination alias <c>8x8
 *   C - upper-case plane letter; selects CO_R<C>/CO_G<C>/CO_B<C> and BIAS_<C>
 * c16x8 = r16x8 * CO_R<C> + g16x8 * CO_G<C> + b16x8 * CO_B<C>, narrowed to
 * bytes with a rounding shift right by 8, then BIAS_<C> is added.
 */
.macro compute_chroma_8x1 c, C
|
||||
vmul c16x8, r16x8, CO_R\C
|
||||
vmla c16x8, g16x8, CO_G\C
|
||||
vmla c16x8, b16x8, CO_B\C
|
||||
vmul c16x8, r16x8, CO_R\C
|
||||
vmla c16x8, g16x8, CO_G\C
|
||||
vmla c16x8, b16x8, CO_B\C
|
||||
|
||||
vrshrn.i16 \c\()8x8, c16x8, #8
|
||||
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
|
||||
vrshrn.i16 \c\()8x8, c16x8, #8
|
||||
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
|
||||
.endm
|
||||
|
||||
loop_420sp rgbx, nv12, init, kernel_420_16x2, 16
|
||||
loop_420sp rgbx, nv12, init, kernel_420_16x2, 16
|
||||
#endif
|
||||
|
||||
@@ -48,27 +48,27 @@ alias y8x16, y16x16_e
|
||||
|
||||
|
||||
/*
 * init src: load the 32-bit coefficient table at src and set the bias bytes.
 * The two vld3 loads de-interleave nine 32-bit words into q13/q14/q15
 * (words 0,3,6 / 1,4,7 / 2,5,8, assuming qN_l/qN_h name the low/high halves
 * of qN; lane 3 of each register is not loaded). Unlike a rounding narrow,
 * vmovn keeps only the low 16 bits of each word when filling CO_R/CO_G/CO_B.
 * BIAS_Y is filled with 16 and BIAS_U with 128 in every byte.
 * src is advanced past the first six words only (post-increment on load 1).
 * CO_* and BIAS_* are register aliases defined outside this hunk.
 */
.macro init src
|
||||
// load s32x3x3, narrow to s16x3x3
|
||||
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
|
||||
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
|
||||
// load s32x3x3, narrow to s16x3x3
|
||||
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
|
||||
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
|
||||
|
||||
vmovn.i32 CO_R, q13
|
||||
vmovn.i32 CO_G, q14
|
||||
vmovn.i32 CO_B, q15
|
||||
vmovn.i32 CO_R, q13
|
||||
vmovn.i32 CO_G, q14
|
||||
vmovn.i32 CO_B, q15
|
||||
|
||||
vmov.u8 BIAS_Y, #16
|
||||
vmov.u8 BIAS_U, #128
|
||||
vmov.u8 BIAS_Y, #16
|
||||
vmov.u8 BIAS_U, #128
|
||||
.endm
|
||||
|
||||
|
||||
/*
 * compute_y_16x1_step action, s8x16, coeff: one colour-channel term of the
 * luma sum for 16 pixels using 32-bit intermediates.
 *   action - widening multiply to apply (compute_y_16x1 passes vmull for the
 *            first term and vmlal for the accumulating terms)
 *   s8x16  - alias of the 16x8-bit source channel; it is overwritten
 *   coeff  - coefficient operand for that channel
 * n16x16_o is zeroed and byte-transposed with the source, which leaves the
 * even-indexed bytes zero-extended to 16-bit in the source register and the
 * odd-indexed bytes zero-extended in n16x16_o. Both are then multiplied into
 * the y32x16_e* (even) and y32x16_o* (odd) accumulators.
 */
.macro compute_y_16x1_step action, s8x16, coeff
|
||||
vmov.u8 n16x16_o, #0
|
||||
vtrn.u8 \s8x16, n16x16_o
|
||||
vmov.u8 n16x16_o, #0
|
||||
vtrn.u8 \s8x16, n16x16_o
|
||||
|
||||
\action y32x16_el, \s8x16\()_l, \coeff
|
||||
\action y32x16_eh, \s8x16\()_h, \coeff
|
||||
\action y32x16_ol, n16x16_ol, \coeff
|
||||
\action y32x16_oh, n16x16_oh, \coeff
|
||||
\action y32x16_el, \s8x16\()_l, \coeff
|
||||
\action y32x16_eh, \s8x16\()_h, \coeff
|
||||
\action y32x16_ol, n16x16_ol, \coeff
|
||||
\action y32x16_oh, n16x16_oh, \coeff
|
||||
.endm
|
||||
|
||||
/*
|
||||
@@ -77,17 +77,17 @@ alias y8x16, y16x16_e
|
||||
* clobber: q11-q15, r8x16, g8x16, b8x16
|
||||
*/
|
||||
/*
 * Steps of compute_y_16x1 (32-bit intermediates):
 *  1. accumulate r8x16*CO_RY + g8x16*CO_GY + b8x16*CO_BY into the even/odd
 *     32-bit accumulators (vmull, then vmlal twice);
 *  2. narrow each accumulator to 16-bit with a rounding shift right by 15;
 *  3. byte-transpose the even and odd halves back together (vtrn.8);
 *  4. add BIAS_Y to every byte of y8x16.
 */
.macro compute_y_16x1
|
||||
compute_y_16x1_step vmull, r8x16, CO_RY
|
||||
compute_y_16x1_step vmlal, g8x16, CO_GY
|
||||
compute_y_16x1_step vmlal, b8x16, CO_BY
|
||||
compute_y_16x1_step vmull, r8x16, CO_RY
|
||||
compute_y_16x1_step vmlal, g8x16, CO_GY
|
||||
compute_y_16x1_step vmlal, b8x16, CO_BY
|
||||
|
||||
vrshrn.i32 y16x16_el, y32x16_el, #15
|
||||
vrshrn.i32 y16x16_eh, y32x16_eh, #15
|
||||
vrshrn.i32 y16x16_ol, y32x16_ol, #15
|
||||
vrshrn.i32 y16x16_oh, y32x16_oh, #15
|
||||
vrshrn.i32 y16x16_el, y32x16_el, #15
|
||||
vrshrn.i32 y16x16_eh, y32x16_eh, #15
|
||||
vrshrn.i32 y16x16_ol, y32x16_ol, #15
|
||||
vrshrn.i32 y16x16_oh, y32x16_oh, #15
|
||||
|
||||
vtrn.8 y16x16_e, y16x16_o
|
||||
vadd.u8 y8x16, y8x16, BIAS_Y
|
||||
vtrn.8 y16x16_e, y16x16_o
|
||||
vadd.u8 y8x16, y8x16, BIAS_Y
|
||||
.endm
|
||||
|
||||
alias c32x8_l, q14
|
||||
@@ -97,8 +97,8 @@ alias_qw c16x8, q13
|
||||
alias_qw c8x8x2, q10
|
||||
|
||||
/*
 * compute_chroma_8x1_step action, s16x8, coeff: apply the widening multiply
 * given by action to the low and high halves of the 8x16-bit source s16x8
 * with coeff, targeting the 32-bit accumulators c32x8_l and c32x8_h.
 */
.macro compute_chroma_8x1_step action, s16x8, coeff
|
||||
\action c32x8_l, \s16x8\()_l, \coeff
|
||||
\action c32x8_h, \s16x8\()_h, \coeff
|
||||
\action c32x8_l, \s16x8\()_l, \coeff
|
||||
\action c32x8_h, \s16x8\()_h, \coeff
|
||||
.endm
|
||||
|
||||
/*
|
||||
@@ -107,16 +107,16 @@ alias_qw c8x8x2, q10
|
||||
* clobber: q14-q15
|
||||
*/
|
||||
/*
 * Parameters and steps of compute_chroma_8x1 (32-bit intermediates):
 *   c - lower-case plane letter; selects the destination alias <c>8x8
 *   C - upper-case plane letter; selects CO_R<C>/CO_G<C>/CO_B<C> and BIAS_<C>
 *  1. c32x8 = r16x8*CO_R<C> + g16x8*CO_G<C> + b16x8*CO_B<C>
 *     (vmull, then vmlal twice);
 *  2. narrow to 16-bit with a rounding shift right by 15;
 *  3. keep the low byte of each 16-bit lane (vmovn) and add BIAS_<C>.
 */
.macro compute_chroma_8x1 c, C
|
||||
compute_chroma_8x1_step vmull, r16x8, CO_R\C
|
||||
compute_chroma_8x1_step vmlal, g16x8, CO_G\C
|
||||
compute_chroma_8x1_step vmlal, b16x8, CO_B\C
|
||||
compute_chroma_8x1_step vmull, r16x8, CO_R\C
|
||||
compute_chroma_8x1_step vmlal, g16x8, CO_G\C
|
||||
compute_chroma_8x1_step vmlal, b16x8, CO_B\C
|
||||
|
||||
vrshrn.i32 c16x8_l, c32x8_l, #15
|
||||
vrshrn.i32 c16x8_h, c32x8_h, #15
|
||||
vmovn.i16 \c\()8x8, c16x8
|
||||
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
|
||||
vrshrn.i32 c16x8_l, c32x8_l, #15
|
||||
vrshrn.i32 c16x8_h, c32x8_h, #15
|
||||
vmovn.i16 \c\()8x8, c16x8
|
||||
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
|
||||
.endm
|
||||
|
||||
|
||||
loop_420sp rgbx, nv12, init, kernel_420_16x2, 32
|
||||
loop_420sp rgbx, nv12, init, kernel_420_16x2, 32
|
||||
#endif
|
||||
|
||||
@@ -31,10 +31,10 @@
|
||||
.altmacro
|
||||
|
||||
/*
 * alias_dw_all qw, dw_l, dw_h: define q<qw>_l as d<dw_l> and q<qw>_h as
 * d<dw_h>, then recurse with qw + 1 and both d indices + 2 while qw < 15.
 * The %(...) arguments are evaluated to numbers before being passed on
 * (this relies on the .altmacro mode enabled just above this macro).
 */
.macro alias_dw_all qw, dw_l, dw_h
|
||||
alias q\qw\()_l, d\dw_l
|
||||
alias q\qw\()_h, d\dw_h
|
||||
alias q\qw\()_l, d\dw_l
|
||||
alias q\qw\()_h, d\dw_h
|
||||
.if \qw < 15
|
||||
alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
|
||||
alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
|
||||
.endif
|
||||
.endm
|
||||
|
||||
@@ -43,23 +43,23 @@ alias_dw_all 0, 0, 1
|
||||
.noaltmacro
|
||||
|
||||
/*
 * alias_qw name, qw, set=1: alias name to the q register qw, and name_l /
 * name_h to qw_l / qw_h. The set argument is forwarded unchanged to alias
 * (callers in this file pass 0 to undo an alias -- presumably; the alias
 * macro itself is defined outside this hunk).
 */
.macro alias_qw name, qw, set=1
|
||||
alias \name\(), \qw, \set
|
||||
alias \name\()_l, \qw\()_l, \set
|
||||
alias \name\()_h, \qw\()_h, \set
|
||||
alias \name\(), \qw, \set
|
||||
alias \name\()_l, \qw\()_l, \set
|
||||
alias \name\()_h, \qw\()_h, \set
|
||||
.endm
|
||||
|
||||
/*
 * prologue: save r4-r12 and lr (10 words) followed by q4-q7 (4 * 16 bytes).
 * The 104 bytes pushed here are the offset that load_arg assumes.
 */
.macro prologue
|
||||
push {r4-r12, lr}
|
||||
vpush {q4-q7}
|
||||
push {r4-r12, lr}
|
||||
vpush {q4-q7}
|
||||
.endm
|
||||
|
||||
/*
 * epilogue: restore q4-q7 and r4-r12 in the reverse order of prologue and
 * return by popping the saved lr value into pc.
 */
.macro epilogue
|
||||
vpop {q4-q7}
|
||||
pop {r4-r12, pc}
|
||||
vpop {q4-q7}
|
||||
pop {r4-r12, pc}
|
||||
.endm
|
||||
|
||||
/*
 * load_arg reg, ix: load stack-passed argument number ix (0-based, ix >= 4)
 * into reg. The offset skips the 10 core registers (10 * 4 bytes) and the
 * 4 q registers (4 * 16 bytes) saved by prologue, so it is only valid while
 * sp is unchanged since prologue.
 */
.macro load_arg reg, ix
|
||||
ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
|
||||
ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
|
||||
.endm
|
||||
|
||||
|
||||
@@ -69,167 +69,167 @@ alias_dw_all 0, 0, 1
|
||||
* int32_t coeff_table[9]);
|
||||
*/
|
||||
/*
 * alias_loop_420sp set=1: name the core registers used by loop_420sp.
 *   r0 src/src0, r1 y/y0, r2 chroma, r3 width (also header),
 *   r4 height, r5 y_stride, r6 c_stride (also c_padding), r7 src_stride,
 *   r8 y0_end, r9 src_padding, r10 y_padding, r11 src1,
 *   r12 y1 (also coeff_table)
 * Several names share a register on purpose: header overwrites width,
 * c_padding overwrites c_stride, and y1 overwrites coeff_table, so the
 * first name of each pair must be fully consumed before the second is set.
 * The set argument is forwarded to alias (0 is used to drop the names).
 */
.macro alias_loop_420sp set=1
|
||||
alias src, r0, \set
|
||||
alias src0, src, \set
|
||||
alias y, r1, \set
|
||||
alias y0, y, \set
|
||||
alias chroma, r2, \set
|
||||
alias width, r3, \set
|
||||
alias header, width, \set
|
||||
alias src, r0, \set
|
||||
alias src0, src, \set
|
||||
alias y, r1, \set
|
||||
alias y0, y, \set
|
||||
alias chroma, r2, \set
|
||||
alias width, r3, \set
|
||||
alias header, width, \set
|
||||
|
||||
alias height, r4, \set
|
||||
alias y_stride, r5, \set
|
||||
alias c_stride, r6, \set
|
||||
alias c_padding, c_stride, \set
|
||||
alias src_stride, r7, \set
|
||||
alias height, r4, \set
|
||||
alias y_stride, r5, \set
|
||||
alias c_stride, r6, \set
|
||||
alias c_padding, c_stride, \set
|
||||
alias src_stride, r7, \set
|
||||
|
||||
alias y0_end, r8, \set
|
||||
alias y0_end, r8, \set
|
||||
|
||||
alias src_padding,r9, \set
|
||||
alias y_padding, r10, \set
|
||||
alias src_padding,r9, \set
|
||||
alias y_padding, r10, \set
|
||||
|
||||
alias src1, r11, \set
|
||||
alias y1, r12, \set
|
||||
alias src1, r11, \set
|
||||
alias y1, r12, \set
|
||||
|
||||
alias coeff_table,r12, \set
|
||||
alias coeff_table,r12, \set
|
||||
.endm
|
||||
|
||||
|
||||
/*
 * loop_420sp s_fmt, d_fmt, init, kernel, precision: emit the exported
 * function <s_fmt>_to_<d_fmt>_neon_<precision>.
 *   init   - macro run once with the coefficient table pointer
 *   kernel - macro that converts one 16-pixel-wide, 2-row tile; its optional
 *            last argument is a pixel count for a partial advance
 * Generated code, as read from the body:
 *  - load height, y_stride, c_stride, src_stride and coeff_table from the
 *    stack (arguments 4 to 8), run init, then derive the per-row paddings
 *    (stride minus width; the source padding uses width * 4 bytes);
 *  - y0_end = y0 + width, header = width & 15 (this overwrites width, which
 *    shares a register with header);
 *  - label 0: per pair of rows, run the kernel once with count = header when
 *    header is non-zero, then label 1: repeat the kernel without a count
 *    until y0 reaches y0_end;
 *  - label 2: step all pointers to the next pair of rows and loop while
 *    height - 2 > 0.
 * NOTE(review): the end-of-row test uses blt, a signed comparison, on two
 * addresses -- confirm this is acceptable for the address ranges in use.
 * NOTE(review): this listing appears to be a rendered diff of a reindent-only
 * commit, so each statement is shown twice (removed line, then added line).
 */
.macro loop_420sp s_fmt, d_fmt, init, kernel, precision
|
||||
|
||||
function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
|
||||
prologue
|
||||
prologue
|
||||
|
||||
alias_loop_420sp
|
||||
alias_loop_420sp
|
||||
|
||||
load_arg height, 4
|
||||
load_arg y_stride, 5
|
||||
load_arg c_stride, 6
|
||||
load_arg src_stride, 7
|
||||
load_arg coeff_table, 8
|
||||
load_arg height, 4
|
||||
load_arg y_stride, 5
|
||||
load_arg c_stride, 6
|
||||
load_arg src_stride, 7
|
||||
load_arg coeff_table, 8
|
||||
|
||||
\init coeff_table
|
||||
\init coeff_table
|
||||
|
||||
sub y_padding, y_stride, width
|
||||
sub c_padding, c_stride, width
|
||||
sub src_padding, src_stride, width, lsl #2
|
||||
sub y_padding, y_stride, width
|
||||
sub c_padding, c_stride, width
|
||||
sub src_padding, src_stride, width, lsl #2
|
||||
|
||||
add y0_end, y0, width
|
||||
and header, width, #15
|
||||
add y0_end, y0, width
|
||||
and header, width, #15
|
||||
|
||||
add y1, y0, y_stride
|
||||
add src1, src0, src_stride
|
||||
add y1, y0, y_stride
|
||||
add src1, src0, src_stride
|
||||
|
||||
0:
|
||||
cmp header, #0
|
||||
beq 1f
|
||||
cmp header, #0
|
||||
beq 1f
|
||||
|
||||
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header
|
||||
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header
|
||||
|
||||
1:
|
||||
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma
|
||||
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma
|
||||
|
||||
cmp y0, y0_end
|
||||
blt 1b
|
||||
cmp y0, y0_end
|
||||
blt 1b
|
||||
2:
|
||||
add y0, y1, y_padding
|
||||
add y0_end, y1, y_stride
|
||||
add chroma, chroma, c_padding
|
||||
add src0, src1, src_padding
|
||||
add y0, y1, y_padding
|
||||
add y0_end, y1, y_stride
|
||||
add chroma, chroma, c_padding
|
||||
add src0, src1, src_padding
|
||||
|
||||
add y1, y0, y_stride
|
||||
add src1, src0, src_stride
|
||||
add y1, y0, y_stride
|
||||
add src1, src0, src_stride
|
||||
|
||||
subs height, height, #2
|
||||
subs height, height, #2
|
||||
|
||||
bgt 0b
|
||||
bgt 0b
|
||||
|
||||
epilogue
|
||||
epilogue
|
||||
|
||||
alias_loop_420sp 0
|
||||
alias_loop_420sp 0
|
||||
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
/*
 * downsample: start the 2x2 chroma average. Adjacent byte pairs of each
 * 16x8-bit channel (r8x16, g8x16, b8x16) are summed into the 8x16-bit
 * registers r16x8, g16x8, b16x8, overwriting their previous contents.
 */
.macro downsample
|
||||
vpaddl.u8 r16x8, r8x16
|
||||
vpaddl.u8 g16x8, g8x16
|
||||
vpaddl.u8 b16x8, b8x16
|
||||
vpaddl.u8 r16x8, r8x16
|
||||
vpaddl.u8 g16x8, g8x16
|
||||
vpaddl.u8 b16x8, b8x16
|
||||
.endm
|
||||
|
||||
|
||||
/*
 * accumulate and right shift by 2
 *
 * downsample_ars2: finish the 2x2 chroma average. Adjacent byte pairs of
 * each channel are added onto the sums already held in r16x8/g16x8/b16x8
 * (as left by downsample for the other row), then each 16-bit sum is
 * divided by 4 with rounding.
 * NOTE(review): in this rendered diff each line is listed twice (old and new
 * indentation); the description above is for a single copy of the body.
 */
|
||||
.macro downsample_ars2
|
||||
vpadal.u8 r16x8, r8x16
|
||||
vpadal.u8 g16x8, g8x16
|
||||
vpadal.u8 b16x8, b8x16
|
||||
vpadal.u8 r16x8, r8x16
|
||||
vpadal.u8 g16x8, g8x16
|
||||
vpadal.u8 b16x8, b8x16
|
||||
|
||||
vrshr.u16 r16x8, r16x8, #2
|
||||
vrshr.u16 g16x8, g16x8, #2
|
||||
vrshr.u16 b16x8, b16x8, #2
|
||||
vrshr.u16 r16x8, r16x8, #2
|
||||
vrshr.u16 g16x8, g16x8, #2
|
||||
vrshr.u16 b16x8, b16x8, #2
|
||||
.endm
|
||||
|
||||
/*
 * store_y8_16x1 dst, count: store the 16 luma bytes in y8x16 at dst.
 * Without count, dst is advanced past the stored register (writeback).
 * With count, all 16 bytes are still written but dst only advances by
 * count, so a following store overlaps the tail of this one.
 */
.macro store_y8_16x1 dst, count
|
||||
.ifc "\count",""
|
||||
vstmia \dst!, {y8x16}
|
||||
vstmia \dst!, {y8x16}
|
||||
.else
|
||||
vstmia \dst, {y8x16}
|
||||
add \dst, \dst, \count
|
||||
vstmia \dst, {y8x16}
|
||||
add \dst, \dst, \count
|
||||
.endif
|
||||
.endm
|
||||
|
||||
/*
 * store_chroma_nv12_8x1 dst, count: store 8 interleaved chroma pairs in
 * U,V order (vst2 interleaves u8x8 and v8x8). Without count, dst advances
 * by the bytes stored; with count, dst is post-incremented by the register
 * passed as count instead.
 */
.macro store_chroma_nv12_8x1 dst, count
|
||||
.ifc "\count",""
|
||||
vst2.i8 {u8x8, v8x8}, [\dst]!
|
||||
vst2.i8 {u8x8, v8x8}, [\dst]!
|
||||
.else
|
||||
vst2.i8 {u8x8, v8x8}, [\dst], \count
|
||||
vst2.i8 {u8x8, v8x8}, [\dst], \count
|
||||
.endif
|
||||
.endm
|
||||
|
||||
/*
 * store_chroma_nv21_8x1 dst, count: store 8 interleaved chroma pairs in
 * V,U order (vst2 interleaves v8x8 and u8x8). Without count, dst advances
 * by the bytes stored; with count, dst is post-incremented by the register
 * passed as count instead.
 */
.macro store_chroma_nv21_8x1 dst, count
|
||||
.ifc "\count",""
|
||||
vst2.i8 {v8x8, u8x8}, [\dst]!
|
||||
vst2.i8 {v8x8, u8x8}, [\dst]!
|
||||
.else
|
||||
vst2.i8 {v8x8, u8x8}, [\dst], \count
|
||||
vst2.i8 {v8x8, u8x8}, [\dst], \count
|
||||
.endif
|
||||
.endm
|
||||
|
||||
/*
 * load_8888_16x1 a, b, c, d, src, count: load 16 four-byte pixels from src
 * and de-interleave them into the channel registers <a>8x16, <b>8x16,
 * <c>8x16 and <d>8x16 (8 pixels into the _l halves, 8 into the _h halves).
 * Without count, src advances by the full 64 bytes.
 * With count, the same 64 bytes are read but src ends up advanced by
 * count * 4 bytes: the first load post-increments by 32, which is undone by
 * the sub before count << 2 is added.
 */
.macro load_8888_16x1 a, b, c, d, src, count
|
||||
.ifc "\count",""
|
||||
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
|
||||
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
|
||||
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
|
||||
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
|
||||
.else
|
||||
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
|
||||
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
|
||||
sub \src, \src, #32
|
||||
add \src, \src, \count, lsl #2
|
||||
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
|
||||
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
|
||||
sub \src, \src, #32
|
||||
add \src, \src, \count, lsl #2
|
||||
.endif
|
||||
.endm
|
||||
|
||||
/*
 * load_rgbx_16x1 src, count: load 16 pixels whose bytes are ordered
 * R, G, B, X by forwarding to load_8888_16x1 with that channel order.
 */
.macro load_rgbx_16x1 src, count
|
||||
load_8888_16x1 r, g, b, x, \src, \count
|
||||
load_8888_16x1 r, g, b, x, \src, \count
|
||||
.endm
|
||||
|
||||
/*
 * load_bgrx_16x1 src, count: load 16 pixels whose bytes are ordered
 * B, G, R, X by forwarding to load_8888_16x1 with that channel order.
 */
.macro load_bgrx_16x1 src, count
|
||||
load_8888_16x1 b, g, r, x, \src, \count
|
||||
load_8888_16x1 b, g, r, x, \src, \count
|
||||
.endm
|
||||
|
||||
/*
 * alias_src_rgbx set=1: assign the source channel aliases for R, G, B, X
 * byte order by forwarding to alias_src_8888 (set is passed through).
 */
.macro alias_src_rgbx set=1
|
||||
alias_src_8888 r, g, b, x, \set
|
||||
alias_src_8888 r, g, b, x, \set
|
||||
.endm
|
||||
|
||||
/*
 * alias_src_bgrx set=1: assign the source channel aliases for B, G, R, X
 * byte order by forwarding to alias_src_8888 (set is passed through).
 */
.macro alias_src_bgrx set=1
|
||||
alias_src_8888 b, g, r, x, \set
|
||||
alias_src_8888 b, g, r, x, \set
|
||||
.endm
|
||||
|
||||
/*
 * alias_dst_nv12 set=1: name the chroma output halves for NV12, with u8x8
 * as the low half and v8x8 as the high half of c8x8x2.
 */
.macro alias_dst_nv12 set=1
|
||||
alias u8x8, c8x8x2_l, \set
|
||||
alias v8x8, c8x8x2_h, \set
|
||||
alias u8x8, c8x8x2_l, \set
|
||||
alias v8x8, c8x8x2_h, \set
|
||||
.endm
|
||||
|
||||
/*
 * alias_dst_nv21 set=1: name the chroma output halves for NV21, with v8x8
 * as the low half and u8x8 as the high half of c8x8x2.
 */
.macro alias_dst_nv21 set=1
|
||||
alias v8x8, c8x8x2_l, \set
|
||||
alias u8x8, c8x8x2_h, \set
|
||||
alias v8x8, c8x8x2_l, \set
|
||||
alias u8x8, c8x8x2_h, \set
|
||||
.endm
|
||||
|
||||
|
||||
@@ -259,33 +259,33 @@ alias BIAS_Y, q2
|
||||
/* q3-q6 R8G8B8X8 x16 */
|
||||
|
||||
/*
 * alias_src_8888 a, b, c, d, set: bind the four channel names <a>8x16,
 * <b>8x16, <c>8x16, <d>8x16 (with their _l/_h halves, via alias_qw) to
 * q3, q4, q5 and q6 in that order. set is forwarded to alias_qw.
 */
.macro alias_src_8888 a, b, c, d, set
|
||||
alias_qw \a\()8x16, q3, \set
|
||||
alias_qw \b\()8x16, q4, \set
|
||||
alias_qw \c\()8x16, q5, \set
|
||||
alias_qw \d\()8x16, q6, \set
|
||||
alias_qw \a\()8x16, q3, \set
|
||||
alias_qw \b\()8x16, q4, \set
|
||||
alias_qw \c\()8x16, q5, \set
|
||||
alias_qw \d\()8x16, q6, \set
|
||||
.endm
|
||||
|
||||
/*
 * kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count:
 * convert one tile of 16 pixels by 2 rows to 4:2:0 semi-planar output.
 *   rgb_fmt/yuv_fmt - select the alias_src_*, load_*, alias_dst_* and
 *                     store_chroma_* macros by name
 *   rgb0/rgb1       - source pointers for the upper and lower row
 *   y0/y1           - luma destination pointers for the two rows
 *   chroma          - interleaved chroma destination pointer
 *   count           - optional; forwarded to the load and store macros to
 *                     advance the pointers by fewer than 16 pixels
 * Sequence: row 0 is loaded, its horizontal pair sums start the chroma
 * average (downsample) and its luma is stored; row 1 is loaded, its pair
 * sums are accumulated and divided by 4 (downsample_ars2) and its luma is
 * stored; then U and V are computed from the averaged samples and stored.
 * The aliases set at the top are dropped again at the end.
 * NOTE(review): this listing appears to be a rendered diff of a reindent-only
 * commit, so each statement is shown twice (removed line, then added line).
 */
.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
|
||||
alias_src_\rgb_fmt
|
||||
alias_dst_\yuv_fmt
|
||||
alias_src_\rgb_fmt
|
||||
alias_dst_\yuv_fmt
|
||||
|
||||
load_\rgb_fmt\()_16x1 \rgb0, \count
|
||||
load_\rgb_fmt\()_16x1 \rgb0, \count
|
||||
|
||||
downsample
|
||||
compute_y_16x1
|
||||
store_y8_16x1 \y0, \count
|
||||
downsample
|
||||
compute_y_16x1
|
||||
store_y8_16x1 \y0, \count
|
||||
|
||||
|
||||
load_\rgb_fmt\()_16x1 \rgb1, \count
|
||||
downsample_ars2
|
||||
compute_y_16x1
|
||||
store_y8_16x1 \y1, \count
|
||||
load_\rgb_fmt\()_16x1 \rgb1, \count
|
||||
downsample_ars2
|
||||
compute_y_16x1
|
||||
store_y8_16x1 \y1, \count
|
||||
|
||||
compute_chroma_8x1 u, U
|
||||
compute_chroma_8x1 v, V
|
||||
compute_chroma_8x1 u, U
|
||||
compute_chroma_8x1 v, V
|
||||
|
||||
store_chroma_\yuv_fmt\()_8x1 \chroma, \count
|
||||
store_chroma_\yuv_fmt\()_8x1 \chroma, \count
|
||||
|
||||
alias_dst_\yuv_fmt 0
|
||||
alias_src_\rgb_fmt 0
|
||||
alias_dst_\yuv_fmt 0
|
||||
alias_src_\rgb_fmt 0
|
||||
.endm
|
||||
|
||||
+155
-155
@@ -23,254 +23,254 @@
|
||||
|
||||
|
||||
/*
 * Chroma contribution to each colour channel, computed once per group of
 * pixels and reused for every luma row that shares the chroma samples.
 *   in : q14 = U << 3, q15 = V << 3 (from load_chroma_*),
 *        q11 = 128 << 3 (chroma bias), d1[0..3] = v2r, u2g, v2g, u2b
 *   out: q8  = V * v2r
 *        q9  = U * u2g + V * v2g
 *        q10 = U * u2b
 *   clobbers: q5 (scratch), q14 and q15 (left holding the bias-removed chroma)
 * vqdmulh is a saturating doubling multiply that keeps the high half, so the
 * products stay in 16-bit lanes.
 */
.macro compute_premult
|
||||
vsub.u16 q14,q11 @ q14 = U * (1 << 3) - 128 * (1 << 3)
|
||||
vsub.u16 q15,q11 @ q15 = V * (1 << 3) - 128 * (1 << 3)
|
||||
vqdmulh.s16 q8, q15, d1[0] @ q8 = V * v2r
|
||||
vqdmulh.s16 q9, q14, d1[1] @ q9 = U * u2g
|
||||
vqdmulh.s16 q5, q15, d1[2] @ q5 = V * v2g
|
||||
vadd.s16 q9, q5 @ q9 = U * u2g + V * v2g
|
||||
vqdmulh.s16 q10,q14, d1[3] @ q10 = U * u2b
|
||||
vsub.u16 q14,q11 @ q14 = U * (1 << 3) - 128 * (1 << 3)
|
||||
vsub.u16 q15,q11 @ q15 = V * (1 << 3) - 128 * (1 << 3)
|
||||
vqdmulh.s16 q8, q15, d1[0] @ q8 = V * v2r
|
||||
vqdmulh.s16 q9, q14, d1[1] @ q9 = U * u2g
|
||||
vqdmulh.s16 q5, q15, d1[2] @ q5 = V * v2g
|
||||
vadd.s16 q9, q5 @ q9 = U * u2g + V * v2g
|
||||
vqdmulh.s16 q10,q14, d1[3] @ q10 = U * u2b
|
||||
.endm
|
||||
|
||||
/*
 * Produce one 8-bit colour channel for 16 pixels.
 *   dst_comp1, dst_comp2 - d registers receiving the 8 + 8 result bytes
 *   pre                  - q register holding the chroma term for this
 *                          channel (q8, q9 or q10 from compute_premult)
 * Adds \pre to the scaled luma in q14 and q15 (set up by `compute`), then
 * narrows with vqrshrun #1: rounding shift right by one with saturation to
 * unsigned 8 bits.
 *   clobbers: q1, q2
 */
.macro compute_color dst_comp1 dst_comp2 pre
|
||||
vadd.s16 q1, q14, \pre
|
||||
vadd.s16 q2, q15, \pre
|
||||
vqrshrun.s16 \dst_comp1, q1, #1
|
||||
vqrshrun.s16 \dst_comp2, q2, #1
|
||||
vadd.s16 q1, q14, \pre
|
||||
vadd.s16 q2, q15, \pre
|
||||
vqrshrun.s16 \dst_comp1, q1, #1
|
||||
vqrshrun.s16 \dst_comp2, q2, #1
|
||||
.endm
|
||||
|
||||
/*
 * Fill all four output channels for 16 pixels.
 *   r1, g1, b1, a1 - d registers for the first 8-pixel half
 *   r2, g2, b2, a2 - d registers for the second 8-pixel half
 * Red uses the chroma term in q8, green q9, blue q10 (see compute_premult);
 * alpha is the constant 255. The caller chooses the byte order of the
 * output pixel purely by which d registers it passes for each channel.
 */
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
|
||||
compute_color \r1, \r2, q8
|
||||
compute_color \g1, \g2, q9
|
||||
compute_color \b1, \b2, q10
|
||||
vmov.u8 \a1, #255
|
||||
vmov.u8 \a2, #255
|
||||
compute_color \r1, \r2, q8
|
||||
compute_color \g1, \g2, q9
|
||||
compute_color \b1, \b2, q10
|
||||
vmov.u8 \a1, #255
|
||||
vmov.u8 \a2, #255
|
||||
.endm
|
||||
|
||||
/*
 * Convert 16 luma samples plus the precomputed chroma terms into 16 packed
 * 32-bit pixels and store them.
 *   dst  - core register holding the output pointer; advanced by the two
 *          post-incrementing vst4.8 stores
 *   ofmt - one of argb, rgba, abgr, bgra; selects which of d6..d9 / d10..d13
 *          receives each channel so that vst4.8 emits the bytes in that order
 *   in : d14, d15 = luma bytes (loaded by process_1l_internal),
 *        q12 = y_offset, q13 = y_coeff, q8..q10 = chroma terms
 * Steps: widen Y to 16 bit with a << 3 scale, subtract q12, multiply by q13
 * (vqdmulh), build the channels with compute_rgba, then vzip each channel's
 * two halves back together (the luma was split into two registers by vld2.8)
 * before the interleaving store.
 * Note: the R/G/B/A labels in the vzip comments describe the rgba case only;
 * for the other formats d6..d9 hold the channels in that format's order
 * (e.g. d6 is alpha for argb and abgr).
 *   clobbers: q1, q2 (via compute_color), q3..q6, q14, q15
 */
.macro compute dst ofmt
|
||||
vshll.u8 q14, d14, #3 @ q14 = Y * (1 << 3)
|
||||
vshll.u8 q15, d15, #3 @ q15 = Y * (1 << 3)
|
||||
vsub.s16 q14, q12 @ q14 = (Y - y_offset)
|
||||
vsub.s16 q15, q12 @ q15 = (Y - y_offset)
|
||||
vqdmulh.s16 q14, q13 @ q14 = (Y - y_offset) * y_coeff
|
||||
vqdmulh.s16 q15, q13 @ q15 = (Y - y_offset) * y_coeff
|
||||
vshll.u8 q14, d14, #3 @ q14 = Y * (1 << 3)
|
||||
vshll.u8 q15, d15, #3 @ q15 = Y * (1 << 3)
|
||||
vsub.s16 q14, q12 @ q14 = (Y - y_offset)
|
||||
vsub.s16 q15, q12 @ q15 = (Y - y_offset)
|
||||
vqdmulh.s16 q14, q13 @ q14 = (Y - y_offset) * y_coeff
|
||||
vqdmulh.s16 q15, q13 @ q15 = (Y - y_offset) * y_coeff
|
||||
|
||||
.ifc \ofmt,argb
|
||||
compute_rgba d7, d8, d9, d6, d11, d12, d13, d10
|
||||
compute_rgba d7, d8, d9, d6, d11, d12, d13, d10
|
||||
.endif
|
||||
|
||||
.ifc \ofmt,rgba
|
||||
compute_rgba d6, d7, d8, d9, d10, d11, d12, d13
|
||||
compute_rgba d6, d7, d8, d9, d10, d11, d12, d13
|
||||
.endif
|
||||
|
||||
.ifc \ofmt,abgr
|
||||
compute_rgba d9, d8, d7, d6, d13, d12, d11, d10
|
||||
compute_rgba d9, d8, d7, d6, d13, d12, d11, d10
|
||||
.endif
|
||||
|
||||
.ifc \ofmt,bgra
|
||||
compute_rgba d8, d7, d6, d9, d12, d11, d10, d13
|
||||
compute_rgba d8, d7, d6, d9, d12, d11, d10, d13
|
||||
.endif
|
||||
|
||||
vzip.8 d6, d10 @ d6 = R1R2R3R4R5R6R7R8 d10 = R9R10R11R12R13R14R15R16
|
||||
vzip.8 d7, d11 @ d7 = G1G2G3G4G5G6G7G8 d11 = G9G10G11G12G13G14G15G16
|
||||
vzip.8 d8, d12 @ d8 = B1B2B3B4B5B6B7B8 d12 = B9B10B11B12B13B14B15B16
|
||||
vzip.8 d9, d13 @ d9 = A1A2A3A4A5A6A7A8 d13 = A9A10A11A12A13A14A15A16
|
||||
vst4.8 {q3, q4}, [\dst]!
|
||||
vst4.8 {q5, q6}, [\dst]!
|
||||
vzip.8 d6, d10 @ d6 = R1R2R3R4R5R6R7R8 d10 = R9R10R11R12R13R14R15R16
|
||||
vzip.8 d7, d11 @ d7 = G1G2G3G4G5G6G7G8 d11 = G9G10G11G12G13G14G15G16
|
||||
vzip.8 d8, d12 @ d8 = B1B2B3B4B5B6B7B8 d12 = B9B10B11B12B13B14B15B16
|
||||
vzip.8 d9, d13 @ d9 = A1A2A3A4A5A6A7A8 d13 = A9A10A11A12A13A14A15A16
|
||||
vst4.8 {q3, q4}, [\dst]!
|
||||
vst4.8 {q5, q6}, [\dst]!
|
||||
.endm
|
||||
|
||||
/*
 * Convert 16 pixels of one row.
 *   dst  - register holding the output pointer (advanced by `compute`)
 *   src  - register holding the luma pointer (advanced by 16 bytes)
 *   ofmt - output byte order, forwarded to `compute`
 * vld2.8 de-interleaves the 16 luma bytes: even-indexed samples land in d14
 * and odd-indexed ones in d15, so each 16-bit lane later lines up with the
 * chroma sample shared by that horizontal pixel pair.
 * Requires compute_premult to have run for the current chroma samples.
 */
.macro process_1l_internal dst src ofmt
|
||||
vld2.8 {d14, d15}, [\src]! @ q7 = Y (interleaved)
|
||||
compute \dst, \ofmt
|
||||
vld2.8 {d14, d15}, [\src]! @ q7 = Y (interleaved)
|
||||
compute \dst, \ofmt
|
||||
.endm
|
||||
|
||||
/*
 * Single-row step: derive the chroma terms, then convert 16 pixels from the
 * luma pointer in r4 to the output pointer in r2. Used when each chroma row
 * belongs to exactly one luma row (process_yuv422p).
 *   ofmt - output byte order
 */
.macro process_1l ofmt
|
||||
compute_premult
|
||||
process_1l_internal r2, r4, \ofmt
|
||||
compute_premult
|
||||
process_1l_internal r2, r4, \ofmt
|
||||
.endm
|
||||
|
||||
/*
 * Two-row step: derive the chroma terms once, then convert 16 pixels of the
 * current row (r4 -> r2) and 16 pixels of the row below it (r12 -> r11),
 * reusing the same q8..q10 chroma terms for both. r11/r12 are the dst2/srcY2
 * pointers prepared by the load_args_* macros.
 *   ofmt - output byte order
 */
.macro process_2l ofmt
|
||||
compute_premult
|
||||
process_1l_internal r2, r4, \ofmt
|
||||
process_1l_internal r11,r12,\ofmt
|
||||
compute_premult
|
||||
process_1l_internal r2, r4, \ofmt
|
||||
process_1l_internal r11,r12,\ofmt
|
||||
.endm
|
||||
|
||||
/*
 * Prologue and argument setup for semi-planar input (also used for NV21
 * through load_args_nv21).
 * Saves r4-r12 and lr (10 words = 40 bytes) and q4-q7 (64 bytes); the
 * stack-passed arguments therefore start at [sp, #104].
 * Register arguments, per the inline comments: r0 = width, r2 = dst,
 * r3 = dst linesize (r1 = height is consumed by increment_and_test_*).
 * On exit:
 *   r4  = srcY            r12 = srcY + linesizeY (second row)
 *   r2  = dst             r11 = dst + linesize   (second row)
 *   r6  = srcC
 *   r3  = 2 * linesize  - 4 * width   (dst advance after two rows)
 *   r5  = 2 * linesizeY - width       (luma advance after two rows)
 *   r7  = linesizeC - width           (chroma advance after one row)
 *   r8  = table, r9 = y_offset, r10 = y_coeff
 *   d0  = y_coeff in every 16-bit lane, d1 = four 16-bit values read from table
 * The matching epilogue (vpop/pop) lives in declare_func.
 */
.macro load_args_nv12
|
||||
push {r4-r12, lr}
|
||||
vpush {q4-q7}
|
||||
ldr r4, [sp, #104] @ r4 = srcY
|
||||
ldr r5, [sp, #108] @ r5 = linesizeY
|
||||
ldr r6, [sp, #112] @ r6 = srcC
|
||||
ldr r7, [sp, #116] @ r7 = linesizeC
|
||||
ldr r8, [sp, #120] @ r8 = table
|
||||
ldr r9, [sp, #124] @ r9 = y_offset
|
||||
ldr r10,[sp, #128] @ r10 = y_coeff
|
||||
vdup.16 d0, r10 @ d0 = y_coeff
|
||||
vld1.16 {d1}, [r8] @ d1 = *table
|
||||
add r11, r2, r3 @ r11 = dst + linesize (dst2)
|
||||
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
|
||||
lsl r3, r3, #1
|
||||
lsl r5, r5, #1
|
||||
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
|
||||
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
|
||||
sub r7, r7, r0 @ r7 = linesizeC - width (paddingC)
|
||||
push {r4-r12, lr}
|
||||
vpush {q4-q7}
|
||||
ldr r4, [sp, #104] @ r4 = srcY
|
||||
ldr r5, [sp, #108] @ r5 = linesizeY
|
||||
ldr r6, [sp, #112] @ r6 = srcC
|
||||
ldr r7, [sp, #116] @ r7 = linesizeC
|
||||
ldr r8, [sp, #120] @ r8 = table
|
||||
ldr r9, [sp, #124] @ r9 = y_offset
|
||||
ldr r10,[sp, #128] @ r10 = y_coeff
|
||||
vdup.16 d0, r10 @ d0 = y_coeff
|
||||
vld1.16 {d1}, [r8] @ d1 = *table
|
||||
add r11, r2, r3 @ r11 = dst + linesize (dst2)
|
||||
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
|
||||
lsl r3, r3, #1
|
||||
lsl r5, r5, #1
|
||||
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
|
||||
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
|
||||
sub r7, r7, r0 @ r7 = linesizeC - width (paddingC)
|
||||
.endm
|
||||
|
||||
/*
 * NV21 takes the same arguments as NV12, so its prologue simply expands
 * load_args_nv12; the U/V swap is handled in load_chroma_nv21.
 */
.macro load_args_nv21
|
||||
load_args_nv12
|
||||
load_args_nv12
|
||||
.endm
|
||||
|
||||
/*
 * Prologue and argument setup for planar 4:2:0 input.
 * Saves r4-r12, lr (40 bytes) and q4-q7 (64 bytes), so stack arguments start
 * at [sp, #104]. Compared with the NV12 layout there are two extra stack
 * slots, which is why table / y_offset / y_coeff are read from #128 / #132 /
 * #136.
 * On exit:
 *   r4  = srcY            r12 = srcY + linesizeY (second row)
 *   r2  = dst             r11 = dst + linesize   (second row)
 *   r6  = srcU            r10 = srcV
 *   r3  = 2 * linesize  - 4 * width
 *   r5  = 2 * linesizeY - width
 *   r8  = table, r9 = y_offset
 *   d0  = y_coeff in every 16-bit lane, d1 = four 16-bit values read from table
 * r10 first carries y_coeff and is reloaded with srcV once the coefficient
 * has been copied into d0. The chroma line sizes at [sp, #116] and
 * [sp, #124] are not loaded here; increment_and_test_yuv420p reads them from
 * the stack on every row pair.
 */
.macro load_args_yuv420p
|
||||
push {r4-r12, lr}
|
||||
vpush {q4-q7}
|
||||
ldr r4, [sp, #104] @ r4 = srcY
|
||||
ldr r5, [sp, #108] @ r5 = linesizeY
|
||||
ldr r6, [sp, #112] @ r6 = srcU
|
||||
ldr r8, [sp, #128] @ r8 = table
|
||||
ldr r9, [sp, #132] @ r9 = y_offset
|
||||
ldr r10,[sp, #136] @ r10 = y_coeff
|
||||
vdup.16 d0, r10 @ d0 = y_coeff
|
||||
vld1.16 {d1}, [r8] @ d1 = *table
|
||||
add r11, r2, r3 @ r11 = dst + linesize (dst2)
|
||||
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
|
||||
lsl r3, r3, #1
|
||||
lsl r5, r5, #1
|
||||
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
|
||||
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
|
||||
ldr r10,[sp, #120] @ r10 = srcV
|
||||
push {r4-r12, lr}
|
||||
vpush {q4-q7}
|
||||
ldr r4, [sp, #104] @ r4 = srcY
|
||||
ldr r5, [sp, #108] @ r5 = linesizeY
|
||||
ldr r6, [sp, #112] @ r6 = srcU
|
||||
ldr r8, [sp, #128] @ r8 = table
|
||||
ldr r9, [sp, #132] @ r9 = y_offset
|
||||
ldr r10,[sp, #136] @ r10 = y_coeff
|
||||
vdup.16 d0, r10 @ d0 = y_coeff
|
||||
vld1.16 {d1}, [r8] @ d1 = *table
|
||||
add r11, r2, r3 @ r11 = dst + linesize (dst2)
|
||||
add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
|
||||
lsl r3, r3, #1
|
||||
lsl r5, r5, #1
|
||||
sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
|
||||
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
|
||||
ldr r10,[sp, #120] @ r10 = srcV
|
||||
.endm
|
||||
|
||||
/*
 * Prologue and argument setup for planar 4:2:2 input, which is processed one
 * row at a time (no dst2/srcY2 pointers, no doubling of the line sizes).
 * Saves r4-r12, lr (40 bytes) and q4-q7 (64 bytes), so stack arguments start
 * at [sp, #104].
 * On exit:
 *   r4  = srcY, r6 = srcU, r10 = srcV, r2 = dst
 *   r3  = linesize  - 4 * width
 *   r5  = linesizeY - width
 *   r7  = linesizeU - width / 2
 *   r12 = linesizeV - width / 2
 *   r8  = table, r9 = y_offset
 *   d0  = y_coeff in every 16-bit lane, d1 = four 16-bit values read from table
 * r10 first carries y_coeff and is reloaded with srcV once the coefficient
 * has been copied into d0.
 */
.macro load_args_yuv422p
|
||||
push {r4-r12, lr}
|
||||
vpush {q4-q7}
|
||||
ldr r4, [sp, #104] @ r4 = srcY
|
||||
ldr r5, [sp, #108] @ r5 = linesizeY
|
||||
ldr r6, [sp, #112] @ r6 = srcU
|
||||
ldr r7, [sp, #116] @ r7 = linesizeU
|
||||
ldr r12,[sp, #124] @ r12 = linesizeV
|
||||
ldr r8, [sp, #128] @ r8 = table
|
||||
ldr r9, [sp, #132] @ r9 = y_offset
|
||||
ldr r10,[sp, #136] @ r10 = y_coeff
|
||||
vdup.16 d0, r10 @ d0 = y_coeff
|
||||
vld1.16 {d1}, [r8] @ d1 = *table
|
||||
sub r3, r3, r0, lsl #2 @ r3 = linesize - width * 4 (padding)
|
||||
sub r5, r5, r0 @ r5 = linesizeY - width (paddingY)
|
||||
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
|
||||
sub r12,r12,r0, lsr #1 @ r12 = linesizeV - width / 2 (paddingV)
|
||||
ldr r10,[sp, #120] @ r10 = srcV
|
||||
push {r4-r12, lr}
|
||||
vpush {q4-q7}
|
||||
ldr r4, [sp, #104] @ r4 = srcY
|
||||
ldr r5, [sp, #108] @ r5 = linesizeY
|
||||
ldr r6, [sp, #112] @ r6 = srcU
|
||||
ldr r7, [sp, #116] @ r7 = linesizeU
|
||||
ldr r12,[sp, #124] @ r12 = linesizeV
|
||||
ldr r8, [sp, #128] @ r8 = table
|
||||
ldr r9, [sp, #132] @ r9 = y_offset
|
||||
ldr r10,[sp, #136] @ r10 = y_coeff
|
||||
vdup.16 d0, r10 @ d0 = y_coeff
|
||||
vld1.16 {d1}, [r8] @ d1 = *table
|
||||
sub r3, r3, r0, lsl #2 @ r3 = linesize - width * 4 (padding)
|
||||
sub r5, r5, r0 @ r5 = linesizeY - width (paddingY)
|
||||
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
|
||||
sub r12,r12,r0, lsr #1 @ r12 = linesizeV - width / 2 (paddingV)
|
||||
ldr r10,[sp, #120] @ r10 = srcV
|
||||
.endm
|
||||
|
||||
/*
 * Fetch 8 U/V pairs from the interleaved chroma plane at r6 (advancing r6 by
 * 16 bytes) and widen them to 16 bit with a << 3 scale.
 * vld2.8 de-interleaves: even bytes -> d2, odd bytes -> d3. For NV12 the
 * even bytes are U and the odd bytes are V.
 *   out: q14 = U << 3, q15 = V << 3
 * Also prefetches the second luma row (r12) 192 bytes ahead.
 */
.macro load_chroma_nv12
|
||||
pld [r12, #64*3]
|
||||
pld [r12, #64*3]
|
||||
|
||||
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
|
||||
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
|
||||
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
|
||||
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
|
||||
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
|
||||
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
|
||||
.endm
|
||||
|
||||
/*
 * NV21 counterpart of load_chroma_nv12: the same 16-byte de-interleaving load
 * from r6, but the byte order in memory is V,U, so U is taken from the odd
 * bytes (d3) and V from the even bytes (d2).
 *   out: q14 = U << 3, q15 = V << 3
 * Also prefetches the second luma row (r12) 192 bytes ahead.
 */
.macro load_chroma_nv21
|
||||
pld [r12, #64*3]
|
||||
pld [r12, #64*3]
|
||||
|
||||
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
|
||||
vshll.u8 q14, d3, #3 @ q14 = U * (1 << 3)
|
||||
vshll.u8 q15, d2, #3 @ q15 = V * (1 << 3)
|
||||
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
|
||||
vshll.u8 q14, d3, #3 @ q14 = U * (1 << 3)
|
||||
vshll.u8 q15, d2, #3 @ q15 = V * (1 << 3)
|
||||
.endm
|
||||
|
||||
/*
 * Fetch 8 U samples from r6 and 8 V samples from r10 (separate planes, each
 * pointer advanced by 8 bytes) and widen them to 16 bit with a << 3 scale.
 *   out: q14 = U << 3, q15 = V << 3
 * Also prefetches the V plane (r10) and the second luma row (r12) 192 bytes
 * ahead; the U plane (r6) is prefetched by declare_func.
 */
.macro load_chroma_yuv420p
|
||||
pld [r10, #64*3]
|
||||
pld [r12, #64*3]
|
||||
pld [r10, #64*3]
|
||||
pld [r12, #64*3]
|
||||
|
||||
vld1.8 d2, [r6]! @ d2: 8 U samples (srcU plane)
|
||||
vld1.8 d3, [r10]! @ d3: 8 V samples (srcV plane)
|
||||
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
|
||||
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
|
||||
vld1.8 d2, [r6]! @ d2: 8 U samples (srcU plane)
|
||||
vld1.8 d3, [r10]! @ d3: 8 V samples (srcV plane)
|
||||
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
|
||||
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
|
||||
.endm
|
||||
|
||||
/*
 * Fetch 8 U samples from r6 and 8 V samples from r10 (separate planes, each
 * pointer advanced by 8 bytes) and widen them to 16 bit with a << 3 scale.
 *   out: q14 = U << 3, q15 = V << 3
 * Only the V plane is prefetched here; r12 holds paddingV in the 4:2:2 path
 * (see load_args_yuv422p), not a luma pointer, so it is not touched.
 */
.macro load_chroma_yuv422p
|
||||
pld [r10, #64*3]
|
||||
pld [r10, #64*3]
|
||||
|
||||
vld1.8 d2, [r6]! @ d2: 8 U samples (srcU plane)
|
||||
vld1.8 d3, [r10]! @ d3: 8 V samples (srcV plane)
|
||||
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
|
||||
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
|
||||
vld1.8 d2, [r6]! @ d2: 8 U samples (srcU plane)
|
||||
vld1.8 d3, [r10]! @ d3: 8 V samples (srcV plane)
|
||||
vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
|
||||
vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
|
||||
.endm
|
||||
|
||||
/*
 * End-of-row-pair bookkeeping for semi-planar input: skip the padding on the
 * second-row pointers (dst2, srcY2) and on the chroma pointer, then subtract
 * the two finished rows from the height in r1.
 * The `subs` must stay last: declare_func branches on its flags (bgt 1b)
 * immediately after this macro.
 */
.macro increment_and_test_nv12
|
||||
add r11, r11, r3 @ dst2 += padding
|
||||
add r12, r12, r5 @ srcY2 += paddingY
|
||||
add r6, r6, r7 @ srcC += paddingC
|
||||
subs r1, r1, #2 @ height -= 2
|
||||
add r11, r11, r3 @ dst2 += padding
|
||||
add r12, r12, r5 @ srcY2 += paddingY
|
||||
add r6, r6, r7 @ srcC += paddingC
|
||||
subs r1, r1, #2 @ height -= 2
|
||||
.endm
|
||||
|
||||
/*
 * NV21 uses the same pointer layout as NV12, so the end-of-row-pair
 * bookkeeping is shared.
 */
.macro increment_and_test_nv21
|
||||
increment_and_test_nv12
|
||||
increment_and_test_nv12
|
||||
.endm
|
||||
|
||||
/*
 * End-of-row-pair bookkeeping for planar 4:2:0 input: skip the padding on
 * dst2, srcY2, srcU (r6) and srcV (r10), then subtract two rows from the
 * height in r1.
 * The chroma paddings are not kept in registers; r7 is used as scratch and
 * linesizeU / linesizeV are re-read from the caller's stack arguments at
 * [sp, #116] / [sp, #124] each time. Those offsets are only valid while sp is
 * exactly where load_args_yuv420p left it.
 * The `subs` must stay last: declare_func branches on its flags (bgt 1b)
 * immediately after this macro.
 */
.macro increment_and_test_yuv420p
|
||||
add r11, r11, r3 @ dst2 += padding
|
||||
add r12, r12, r5 @ srcY2 += paddingY
|
||||
ldr r7, [sp, #116] @ r7 = linesizeU
|
||||
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
|
||||
add r6, r6, r7 @ srcU += paddingU
|
||||
ldr r7, [sp, #124] @ r7 = linesizeV
|
||||
sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV)
|
||||
add r10, r10, r7 @ srcV += paddingV
|
||||
subs r1, r1, #2 @ height -= 2
|
||||
add r11, r11, r3 @ dst2 += padding
|
||||
add r12, r12, r5 @ srcY2 += paddingY
|
||||
ldr r7, [sp, #116] @ r7 = linesizeU
|
||||
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
|
||||
add r6, r6, r7 @ srcU += paddingU
|
||||
ldr r7, [sp, #124] @ r7 = linesizeV
|
||||
sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV)
|
||||
add r10, r10, r7 @ srcV += paddingV
|
||||
subs r1, r1, #2 @ height -= 2
|
||||
.endm
|
||||
|
||||
/*
 * End-of-row bookkeeping for planar 4:2:2 input: skip the padding on srcU
 * (r6, by r7) and srcV (r10, by r12), then subtract the single finished row
 * from the height in r1. dst and srcY are advanced by declare_func itself.
 * The `subs` must stay last: declare_func branches on its flags (bgt 1b)
 * immediately after this macro.
 */
.macro increment_and_test_yuv422p
|
||||
add r6, r6, r7 @ srcU += paddingU
|
||||
add r10,r10,r12 @ srcV += paddingV
|
||||
subs r1, r1, #1 @ height -= 1
|
||||
add r6, r6, r7 @ srcU += paddingU
|
||||
add r10,r10,r12 @ srcV += paddingV
|
||||
subs r1, r1, #1 @ height -= 1
|
||||
.endm
|
||||
|
||||
/*
 * Per-format dispatch used by declare_func (process_\ifmt): NV12 shares each
 * chroma row between two luma rows, so it uses the two-row step.
 *   ofmt - output byte order
 */
.macro process_nv12 ofmt
|
||||
process_2l \ofmt
|
||||
process_2l \ofmt
|
||||
.endm
|
||||
|
||||
/*
 * Per-format dispatch used by declare_func (process_\ifmt): NV21 uses the
 * two-row step, exactly like NV12.
 *   ofmt - output byte order
 */
.macro process_nv21 ofmt
|
||||
process_2l \ofmt
|
||||
process_2l \ofmt
|
||||
.endm
|
||||
|
||||
/*
 * Per-format dispatch used by declare_func (process_\ifmt): planar 4:2:0
 * uses the two-row step.
 *   ofmt - output byte order
 */
.macro process_yuv420p ofmt
|
||||
process_2l \ofmt
|
||||
process_2l \ofmt
|
||||
.endm
|
||||
|
||||
/*
 * Per-format dispatch used by declare_func (process_\ifmt): planar 4:2:2 has
 * one chroma row per luma row, so it uses the single-row step.
 *   ofmt - output byte order
 */
.macro process_yuv422p ofmt
|
||||
process_1l \ofmt
|
||||
process_1l \ofmt
|
||||
.endm
|
||||
|
||||
/*
 * Emit the exported function ff_<ifmt>_to_<ofmt>_neon.
 *   ifmt - input format; selects load_args_*, load_chroma_*, process_* and
 *          increment_and_test_*
 *   ofmt - output byte order (argb, rgba, abgr, bgra), forwarded to process_*
 * Structure:
 *   prologue (load_args_\ifmt) and constant setup:
 *     q11 = 128 << 3 (chroma bias), q12 = y_offset, q13 = y_coeff
 *   1: per row (or row pair): r8 = width
 *   2: per 16 pixels: load 8 chroma pairs, convert, r8 -= 16, loop while > 0
 *   then advance dst (r2) and srcY (r4) by their padding, let
 *   increment_and_test_\ifmt advance the remaining pointers and decrement the
 *   height in r1, and loop while it is still > 0.
 *   epilogue: restore q4-q7 and r4-r12, return by popping into pc.
 * The inner loop always handles 16 pixels, so a width that is not a multiple
 * of 16 is rounded up; the caller must allow for that - TODO confirm against
 * the C-side wrapper.
 * NOTE(review): `vmov.i8 d10, #128` looks dead - compute_premult overwrites
 * q5 (d10/d11) before anything reads d10. Confirm before removing.
 */
.macro declare_func ifmt ofmt
|
||||
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
|
||||
load_args_\ifmt
|
||||
vmov.u16 q11, #1024 @ q11 = 128 * (1 << 3)
|
||||
vdup.16 q12, r9 @ q12 = y_offset
|
||||
vmov d26, d0 @ q13 = y_coeff
|
||||
vmov d27, d0 @ q13 = y_coeff
|
||||
load_args_\ifmt
|
||||
vmov.u16 q11, #1024 @ q11 = 128 * (1 << 3)
|
||||
vdup.16 q12, r9 @ q12 = y_offset
|
||||
vmov d26, d0 @ q13 = y_coeff
|
||||
vmov d27, d0 @ q13 = y_coeff
|
||||
1:
|
||||
mov r8, r0 @ r8 = width
|
||||
mov r8, r0 @ r8 = width
|
||||
2:
|
||||
pld [r6, #64*3]
|
||||
pld [r4, #64*3]
|
||||
vmov.i8 d10, #128
|
||||
load_chroma_\ifmt
|
||||
process_\ifmt \ofmt
|
||||
subs r8, r8, #16 @ width -= 16
|
||||
bgt 2b
|
||||
add r2, r2, r3 @ dst += padding
|
||||
add r4, r4, r5 @ srcY += paddingY
|
||||
increment_and_test_\ifmt
|
||||
bgt 1b
|
||||
vpop {q4-q7}
|
||||
pop {r4-r12, pc}
|
||||
pld [r6, #64*3]
|
||||
pld [r4, #64*3]
|
||||
vmov.i8 d10, #128
|
||||
load_chroma_\ifmt
|
||||
process_\ifmt \ofmt
|
||||
subs r8, r8, #16 @ width -= 16
|
||||
bgt 2b
|
||||
add r2, r2, r3 @ dst += padding
|
||||
add r4, r4, r5 @ srcY += paddingY
|
||||
increment_and_test_\ifmt
|
||||
bgt 1b
|
||||
vpop {q4-q7}
|
||||
pop {r4-r12, pc}
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
/*
 * Emit the four output-order variants (argb, rgba, abgr, bgra) of the
 * converter for one input format.
 *   ifmt - input format, forwarded to declare_func
 */
.macro declare_rgb_funcs ifmt
|
||||
declare_func \ifmt, argb
|
||||
declare_func \ifmt, rgba
|
||||
declare_func \ifmt, abgr
|
||||
declare_func \ifmt, bgra
|
||||
declare_func \ifmt, argb
|
||||
declare_func \ifmt, rgba
|
||||
declare_func \ifmt, abgr
|
||||
declare_func \ifmt, bgra
|
||||
.endm
|
||||
|
||||
/* Instantiate ff_nv12_to_{argb,rgba,abgr,bgra}_neon. */
declare_rgb_funcs nv12
|
||||
|
||||
Reference in New Issue
Block a user