x86: add SSSE3 versions for filmgrain.fguv_32x32xn[422/444]

fguv_32x32xn_8bpc_420_csfl0_c: 14568.2
fguv_32x32xn_8bpc_420_csfl0_ssse3: 1162.3
fguv_32x32xn_8bpc_420_csfl1_c: 10682.0
fguv_32x32xn_8bpc_420_csfl1_ssse3: 910.3
fguv_32x32xn_8bpc_422_csfl0_c: 16370.5
fguv_32x32xn_8bpc_422_csfl0_ssse3: 1202.6
fguv_32x32xn_8bpc_422_csfl1_c: 11333.8
fguv_32x32xn_8bpc_422_csfl1_ssse3: 958.8
fguv_32x32xn_8bpc_444_csfl0_c: 12950.1
fguv_32x32xn_8bpc_444_csfl0_ssse3: 1133.6
fguv_32x32xn_8bpc_444_csfl1_c: 8806.7
fguv_32x32xn_8bpc_444_csfl1_ssse3: 731.0
This commit is contained in:
Ronald S. Bultje
2020-04-01 10:50:56 -04:00
parent b73acaa894
commit 4687c4696f
2 changed files with 299 additions and 58 deletions
+4
View File
@@ -34,6 +34,8 @@ decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_ssse3);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_ssse3);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_ssse3);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_ssse3);
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
@@ -56,6 +58,8 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_ssse3;
c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_ssse3;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+295 -58
View File
@@ -2071,13 +2071,14 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
.end_hv:
RET
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
INIT_XMM ssse3
%if ARCH_X86_32
; fguv_32x32xn_i<name>_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut,
;                            h, sby, luma, lstride, uv_pl, is_id)
; where <name> is this macro's first argument (420, 422 or 444)
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
cglobal fguv_32x32xn_i420, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
tmp, src, scaling, h, fg_data, picptr, unused
mov r0, r0m
mov r1, r2m
@@ -2100,7 +2101,7 @@ cglobal fguv_32x32xn_i420, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
mov [rsp+8*mmsize+13*gprsize], r2
mov [rsp+8*mmsize+14*gprsize], r4
%else
cglobal fguv_32x32xn_i420, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
tmp, src, scaling, h, fg_data, picptr, unused
%endif
mov srcq, srcm
@@ -2125,13 +2126,13 @@ cglobal fguv_32x32xn_i420, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
%define base r5-pb_mask
mov r5m, r5
%else
cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
lea r8, [pb_mask]
%define base r8-pb_mask
%endif
mov r6d, [fg_dataq+FGData.scaling_shift]
movd m2, [base+byte_blend+3]
pcmpeqw m2, m2
movd m3, [base+mul_bits+r6*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
lea tmpd, [r6d*2]
@@ -2143,6 +2144,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
movd m5, [base+min+r6*2]
cmovne r6d, tmpd
movd m4, [base+max+r6*2]
psrldq m2, 14+%2
punpcklwd m3, m3
punpcklwd m5, m5
punpcklwd m4, m4
@@ -2157,7 +2159,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
jne .csfl
%macro FGUV_32x32xN_LOOP 1 ; not-csfl
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
@@ -2183,10 +2185,18 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
test overlapd, overlapd
jz %%no_vertical_overlap
%if ARCH_X86_32
%if %2
movd m1, [base+pb_23_22]
%else
movd m1, [base+pb_27_17_17_27]
%endif
mova m0, [base+pw_1024]
%else
%if %2
movd m1, [pb_23_22]
%else
movd m1, [pb_27_17_17_27]
%endif
mova m0, [pw_1024]
%endif
pshufd m1, m1, q0000
@@ -2216,7 +2226,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%define luma_bakq lumaq
mov wq, r4m
%if %3
shl r10mp, 1
%endif
%else
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
@@ -2226,7 +2238,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov lumaq, r9mp
lea src_bakq, [srcq+wq]
lea luma_bakq, [lumaq+wq*2]
lea luma_bakq, [lumaq+wq*(1+%2)]
neg wq
sub r0mp, srcq
%if ARCH_X86_32
@@ -2237,7 +2249,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%else
mov r11mp, src_bakq
mov r10mp, strideq
mov r12mp, strideq
%endif
%%loop_x:
@@ -2266,8 +2278,8 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
ror offyd, 8
shr offxd, 12
and offyd, 0xf
imul offyd, 82
lea offyq, [offyq+offxq+498] ; offy*stride+offx
imul offyd, 164>>%3
lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
@@ -2276,6 +2288,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
%endif
%%loop_x_odd:
mov hd, r7m
mov grain_lutq, grain_lutmp
%%loop_y:
@@ -2283,6 +2296,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%if ARCH_X86_32
mov lumaq, r9mp
%endif
%if %2
mova m4, [lumaq+ 0]
mova m6, [lumaq+16]
mova m0, [srcq]
@@ -2300,9 +2314,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
%else
mova m4, [lumaq]
mova m0, [srcq]
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
%endif
pxor m2, m2
%endif
%if %1
%if %2
packuswb m4, m6 ; luma
%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -2314,6 +2339,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
%elif %2 == 0
punpckhbw m6, m4, m2
punpcklbw m4, m2
%endif
; scaling[luma_src]
@@ -2364,8 +2392,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
add srcq, r2mp
; we already incremented lumaq above
%else
add srcq, r10mp
add srcq, r12mp
%if %3
lea lumaq, [lumaq+lstrideq*2]
%else
add lumaq, lstrideq
%endif
%endif
add grain_lutq, 82
dec hw
@@ -2384,11 +2416,26 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%else
mov srcq, r11mp
%endif
lea lumaq, [luma_bakq+wq*2]
lea lumaq, [luma_bakq+wq*(1+%2)]
add srcq, wq
%if ARCH_X86_32
mov r4m, wq
mov r9m, lumaq
%endif
%if %2 == 0
; adjust top_offxy
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
%else
add r11d, 16
%endif
add offxyd, 16
btc dword r8m, 2
jc %%loop_x_even
test dword r8m, 2
jz %%loop_x_odd
jmp %%loop_x_odd_v_overlap
%%loop_x_even:
%endif
test dword r8m, 1
jz %%loop_x
@@ -2400,8 +2447,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
%if ARCH_X86_32
%if %2
lea r6, [offxyd+16]
mov [rsp+8*mmsize+0*gprsize], r6
%else
mov [rsp+8*mmsize+0*gprsize], offxyd
%endif
DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
@@ -2410,7 +2461,11 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
offx, offy, see, left_offxy, unused1, unused2, lstride
%if %2
lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
%else
mov left_offxyd, offyd
%endif
%endif
mov r6d, seed
or seed, 0xEFF4
@@ -2435,8 +2490,8 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
ror offyd, 8
shr offxd, 12
and offyd, 0xf
imul offyd, 82
lea offyq, [offyq+offxq+498] ; offy*stride+offx
imul offyd, 164>>%3
lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
@@ -2452,6 +2507,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%if ARCH_X86_32
mov lumaq, r9mp
%endif
%if %2
mova m4, [lumaq+ 0]
mova m6, [lumaq+16]
mova m0, [srcq]
@@ -2469,9 +2525,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
%else
mova m4, [lumaq]
mova m0, [srcq]
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
%endif
pxor m2, m2
%endif
%if %1
%if %2
packuswb m4, m6 ; luma
%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -2483,6 +2550,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
%elif %2 == 0
punpckhbw m6, m4, m2
punpcklbw m4, m2
%endif
; scaling[luma_src]
@@ -2547,8 +2617,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
add srcq, r2mp
; lumaq has already been incremented above
%else
add srcq, r10mp
add srcq, r12mp
%if %3
lea lumaq, [lumaq+lstrideq*2]
%else
add lumaq, lstrideq
%endif
%endif
add grain_lutq, 82
dec hw
@@ -2567,17 +2641,32 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%else
mov srcq, r11mp
%endif
lea lumaq, [luma_bakq+wq*2]
lea lumaq, [luma_bakq+wq*(1+%2)]
add srcq, wq
%if ARCH_X86_32
mov r4m, wq
mov r9m, lumaq
%endif
%if %2 == 0
xor dword r8m, 4
; adjust top_offxyd
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
%else
add r11d, 16
%endif
add offxyd, 16
%endif
; r8m = sbym
test dword r8m, 2
%if %2
jne %%loop_x_hv_overlap
jmp %%loop_x_h_overlap
%else
jne %%loop_x_odd_v_overlap
jmp %%loop_x_odd
%endif
%%end:
RET
@@ -2612,7 +2701,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov r3m, seed
mov wq, r4m
%if %3
shl r10mp, 1
%endif
%else
xor seed, sbyd ; (cur_seed << 16) | top_seed
@@ -2624,7 +2715,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov lumaq, r9mp
lea src_bakq, [srcq+wq]
lea luma_bakq, [lumaq+wq*2]
lea luma_bakq, [lumaq+wq*(1+%2)]
neg wq
sub r0mp, srcq
%if ARCH_X86_32
@@ -2635,7 +2726,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%else
mov r11mp, src_bakq
mov r10mp, strideq
mov r12mp, strideq
%endif
%%loop_x_v_overlap:
@@ -2674,9 +2765,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
ror offxd, 12
and offyd, 0xf000f
and offxd, 0xf000f
imul offyd, 82
imul offyd, 164>>%3
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
lea offyq, [offyq+offxq+0x10001*498+16*82]
lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
%if ARCH_X86_32
DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
@@ -2693,12 +2784,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%endif
%%loop_x_odd_v_overlap:
mov hd, r7m
mov grain_lutq, grain_lutmp
%if ARCH_X86_32
mov r5, r5m
mova m1, [base+pb_27_17]
%else
mova m1, [pb_27_17]
%endif
%%loop_y_v_overlap:
%if ARCH_X86_32
mov lumaq, r9mp
%endif
%if %2
mova m4, [lumaq+ 0]
mova m6, [lumaq+16]
mova m0, [srcq]
@@ -2716,9 +2815,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
%else
mova m4, [lumaq]
mova m0, [srcq]
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
%endif
pxor m2, m2
%endif
%if %1
%if %2
packuswb m4, m6 ; luma
%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -2730,6 +2840,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
%elif %2 == 0
punpckhbw m6, m4, m2
punpcklbw m4, m2
%endif
; scaling[luma_src]
@@ -2740,10 +2853,10 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
vpgatherdw m7, m4, scalingq, r12, r2
vpgatherdw m5, m6, scalingq, r12, r2
%endif
pcmpeqw m1, m1
psrlw m1, 8
pand m7, m1
pand m5, m1
pcmpeqw m4, m4
psrlw m4, 8
pand m7, m4
pand m5, m4
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
@@ -2753,17 +2866,22 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%else
movu m4, [grain_lutq+top_offxyq]
%endif
punpckhbw m1, m4, m3
punpckhbw m6, m4, m3
punpcklbw m4, m3
pmaddubsw m2, m9, m1
%if %3
pmaddubsw m2, m9, m6
pmaddubsw m3, m9, m4
%else
pmaddubsw m2, m1, m6
pmaddubsw m3, m1, m4
%endif
pmulhrsw m2, m8
pmulhrsw m3, m8
packsswb m3, m2
pxor m1, m1
pcmpgtb m1, m3
punpcklbw m2, m3, m1
punpckhbw m3, m1
pxor m6, m6
pcmpgtb m6, m3
punpcklbw m2, m3, m6
punpckhbw m3, m6
; noise = round2(scaling[luma_src] * grain, scaling_shift)
pmullw m2, m7
@@ -2773,7 +2891,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; unpack chroma_source
pxor m4, m4
punpckhbw m1, m0, m4
punpckhbw m6, m0, m4
punpcklbw m0, m4 ; m0-1: src as word
%if ARCH_X86_32
@@ -2782,12 +2900,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; dst = clip_pixel(src, noise)
paddw m0, m2
paddw m1, m3
paddw m6, m3
pmaxsw m0, m13
pmaxsw m1, m13
pmaxsw m6, m13
pminsw m0, m12
pminsw m1, m12
packuswb m0, m1
pminsw m6, m12
packuswb m0, m6
movifnidn dstq, dstmp
mova [dstq+srcq], m0
@@ -2797,10 +2915,24 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
add srcq, r2mp
; lumaq has already been incremented above
%else
add srcq, r10mp
add srcq, r12mp
%if %3
lea lumaq, [lumaq+lstrideq*2]
%else
add lumaq, lstrideq
%endif
%endif
add grain_lutq, 82
%if %3 == 0
btc hd, 16
%if ARCH_X86_32
mov r5, r5m
mova m1, [base+pb_17_27]
%else
mova m1, [pb_17_27]
%endif
jnc %%loop_y_v_overlap
%endif
jmp %%loop_y
%%end_y_v_overlap:
@@ -2817,25 +2949,40 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%else
mov srcq, r11mp
%endif
lea lumaq, [luma_bakq+wq*2]
lea lumaq, [luma_bakq+wq*(1+%2)]
add srcq, wq
%if ARCH_X86_32
mov r4m, wq
mov r9m, lumaq
%endif
%if %2
; since fg_dataq.overlap is guaranteed to be set, we never jump
; back to .loop_x_v_overlap, and instead always fall-through to
; h+v overlap
%else
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
add offxyd, 16
btc dword r8m, 2
jnc %%loop_x_odd_v_overlap
%endif
%%loop_x_hv_overlap:
%if ARCH_X86_32
DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
mov r6, [rsp+8*mmsize+1*gprsize]
%if %2
lea r0, [r3d+16]
add r6, 16
mov [rsp+8*mmsize+0*gprsize], r0 ; left_offxy
%else
mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
%endif
mov [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy
DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
@@ -2846,8 +2993,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
%if %2
lea topleft_offxyq, [top_offxyq+16]
lea left_offxyq, [offxyq+16]
%else
mov topleft_offxyq, top_offxyq
mov left_offxyq, offxyq
%endif
; we assume from the block above that bits 8-15 of tmpd are zero'ed
%endif
@@ -2881,9 +3033,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
ror offxd, 12
and offyd, 0xf000f
and offxd, 0xf000f
imul offyd, 82
imul offyd, 164>>%3
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
lea offyq, [offyq+offxq+0x10001*498+16*82]
lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
%if ARCH_X86_32
DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
@@ -2900,6 +3052,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov hd, r7m
mov grain_lutq, grain_lutmp
%if ARCH_X86_32
mov r5, r5m
mova m3, [base+pb_27_17]
%else
mova m3, [pb_27_17]
%endif
%%loop_y_hv_overlap:
; src
%if ARCH_X86_32
@@ -2907,6 +3065,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov lumaq, r9mp
%endif
%if %2
mova m4, [lumaq+ 0]
mova m6, [lumaq+16]
mova m0, [srcq]
@@ -2924,9 +3083,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
%else
mova m4, [lumaq]
mova m0, [srcq]
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
%endif
pxor m2, m2
%endif
%if %1
%if %2
packuswb m4, m6 ; luma
%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -2938,6 +3108,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
%elif %2 == 0
punpckhbw m6, m4, m2
punpcklbw m4, m2
%endif
; scaling[src]
@@ -2946,8 +3119,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
vpgatherdw m5, m6, scalingq, r0, r5
%else
movd m1, [grain_lutq+topleft_offxyq]
%if %3
vpgatherdw m7, m4, scalingq, r2, r12
vpgatherdw m5, m6, scalingq, r2, r12
%else
vpgatherdw m7, m4, scalingq, r2, r13
vpgatherdw m5, m6, scalingq, r2, r13
%endif
%endif
pcmpeqw m2, m2
psrlw m2, 8
@@ -2961,7 +3139,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
movd m1, [grain_lutq+r0]
mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy
%endif
movu m3, [grain_lutq+offxyq]
movu m2, [grain_lutq+offxyq]
%if ARCH_X86_32
movu m6, [grain_lutq+r5]
movd m4, [grain_lutq+r0]
@@ -2971,23 +3149,32 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%endif
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
punpcklbw m1, m6
punpcklbw m4, m3
punpcklbw m4, m2
%if %2
punpcklwd m4, m1
%else
punpckldq m4, m1
%endif
pmaddubsw m1, m9, m4
pmulhrsw m1, m8
packsswb m1, m1
pandn m4, m10, m3
pandn m3, m10, m6
psrldq m6, m1, 1
pandn m4, m10, m2
pandn m2, m10, m6
psrldq m6, m1, 2-%2
pand m1, m10
pand m6, m10
por m4, m1
por m3, m6
por m2, m6
; followed by v interpolation (top | cur -> cur)
punpckhbw m1, m3, m4
punpcklbw m3, m4
punpckhbw m1, m2, m4
punpcklbw m2, m4
%if %3
pmaddubsw m4, m9, m1
pmaddubsw m1, m9, m3
pmaddubsw m1, m9, m2
%else
pmaddubsw m4, m3, m1
pmaddubsw m1, m3, m2
%endif
pmulhrsw m4, m8
pmulhrsw m1, m8
packsswb m1, m4
@@ -3008,17 +3195,17 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; unpack chroma source
pxor m4, m4
punpckhbw m3, m0, m4
punpckhbw m5, m0, m4
punpcklbw m0, m4 ; m0-1: src as word
; dst = clip_pixel(src, noise)
paddw m0, m2
paddw m3, m1
paddw m5, m1
pmaxsw m0, m13
pmaxsw m3, m13
pmaxsw m5, m13
pminsw m0, m12
pminsw m3, m12
packuswb m0, m3
pminsw m5, m12
packuswb m0, m5
movifnidn dstq, dstmp
mova [dstq+srcq], m0
@@ -3026,12 +3213,36 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
add srcq, r2mp
; lumaq has been adjusted above already
%else
add srcq, r10mp
lea lumaq, [lumaq+lstrideq*2]
add srcq, r12mp
%if %3
lea lumaq, [lumaq+lstrideq*(1+%2)]
%else
add lumaq, r10mp
%endif
%endif
add grain_lutq, 82
dec hw
%if %3
jg %%loop_y_h_overlap
%else
jle %%end_y_hv_overlap
%if ARCH_X86_32
mov r5, r5m
mova m3, [base+pb_17_27]
%else
mova m3, [pb_17_27]
%endif
btc hd, 16
jnc %%loop_y_hv_overlap
%if ARCH_X86_64
mov lstrideq, r10mp
%endif
jmp %%loop_y_h_overlap
%%end_y_hv_overlap:
%if ARCH_X86_64
mov lstrideq, r10mp
%endif
%endif
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
@@ -3046,18 +3257,44 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%else
mov srcq, r11mp
%endif
lea lumaq, [luma_bakq+wq*2]
lea lumaq, [luma_bakq+wq*(1+%2)]
add srcq, wq
%if ARCH_X86_32
mov r4m, wq
mov r9m, lumaq
%endif
%if %2
jmp %%loop_x_hv_overlap
%else
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
add offxyd, 16
xor dword r8m, 4
jmp %%loop_x_odd_v_overlap
%endif
%%end_hv:
RET
%endmacro
FGUV_32x32xN_LOOP 1
%%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
FGUV_32x32xN_LOOP 0
%%FGUV_32x32xN_LOOP 0, %2, %3
%endmacro
; Instantiate the chroma film-grain function once per pixel layout.
; FGUV_FN arguments: name, ss_hor, ss_ver. The name is pasted into the symbol
; as fguv_32x32xn_i<name>; ss_hor/ss_ver select the %2/%3 code paths inside
; the macro (1 for a subsampled chroma dimension, judging by the
; 420/422/444 values passed here).
FGUV_FN 420, 1, 1
%if STACK_ALIGNMENT < mmsize
; NOTE(review): presumably re-declares the r0m..r12m stack-argument aliases
; that the previous instantiation's cglobal changed - confirm against x86inc.
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif
; 4:2:2 - horizontal flag set, vertical flag clear
FGUV_FN 422, 1, 0
%if STACK_ALIGNMENT < mmsize
; NOTE(review): same alias re-declaration as above, before the next cglobal.
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif
; 4:4:4 - both flags clear
FGUV_FN 444, 0, 0