mirror of
https://code.videolan.org/videolan/dav1d
synced 2026-06-11 04:03:05 +00:00
x86: add SSSE3 versions for filmgrain.fguv_32x32xn[422/444]
fguv_32x32xn_8bpc_420_csfl0_c: 14568.2
fguv_32x32xn_8bpc_420_csfl0_ssse3: 1162.3
fguv_32x32xn_8bpc_420_csfl1_c: 10682.0
fguv_32x32xn_8bpc_420_csfl1_ssse3: 910.3
fguv_32x32xn_8bpc_422_csfl0_c: 16370.5
fguv_32x32xn_8bpc_422_csfl0_ssse3: 1202.6
fguv_32x32xn_8bpc_422_csfl1_c: 11333.8
fguv_32x32xn_8bpc_422_csfl1_ssse3: 958.8
fguv_32x32xn_8bpc_444_csfl0_c: 12950.1
fguv_32x32xn_8bpc_444_csfl0_ssse3: 1133.6
fguv_32x32xn_8bpc_444_csfl1_c: 8806.7
fguv_32x32xn_8bpc_444_csfl1_ssse3: 731.0
This commit is contained in:
@@ -34,6 +34,8 @@ decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_ssse3);
|
||||
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_ssse3);
|
||||
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
|
||||
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);
|
||||
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_ssse3);
|
||||
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_ssse3);
|
||||
|
||||
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
|
||||
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
|
||||
@@ -56,6 +58,8 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_ssse3;
|
||||
c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_ssse3;
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_ssse3;
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
|
||||
|
||||
+295
-58
@@ -2071,13 +2071,14 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
|
||||
.end_hv:
|
||||
RET
|
||||
|
||||
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
|
||||
INIT_XMM ssse3
|
||||
%if ARCH_X86_32
|
||||
; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
|
||||
; sby, luma, lstride, uv_pl, is_id)
|
||||
%if STACK_ALIGNMENT < mmsize
|
||||
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
|
||||
cglobal fguv_32x32xn_i420, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
|
||||
cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
|
||||
tmp, src, scaling, h, fg_data, picptr, unused
|
||||
mov r0, r0m
|
||||
mov r1, r2m
|
||||
@@ -2100,7 +2101,7 @@ cglobal fguv_32x32xn_i420, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
|
||||
mov [rsp+8*mmsize+13*gprsize], r2
|
||||
mov [rsp+8*mmsize+14*gprsize], r4
|
||||
%else
|
||||
cglobal fguv_32x32xn_i420, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
|
||||
cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
|
||||
tmp, src, scaling, h, fg_data, picptr, unused
|
||||
%endif
|
||||
mov srcq, srcm
|
||||
@@ -2125,13 +2126,13 @@ cglobal fguv_32x32xn_i420, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
|
||||
%define base r5-pb_mask
|
||||
mov r5m, r5
|
||||
%else
|
||||
cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
|
||||
cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
|
||||
lea r8, [pb_mask]
|
||||
%define base r8-pb_mask
|
||||
%endif
|
||||
mov r6d, [fg_dataq+FGData.scaling_shift]
|
||||
movd m2, [base+byte_blend+3]
|
||||
pcmpeqw m2, m2
|
||||
movd m3, [base+mul_bits+r6*2-14]
|
||||
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
|
||||
lea tmpd, [r6d*2]
|
||||
@@ -2143,6 +2144,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
movd m5, [base+min+r6*2]
|
||||
cmovne r6d, tmpd
|
||||
movd m4, [base+max+r6*2]
|
||||
psrldq m2, 14+%2
|
||||
punpcklwd m3, m3
|
||||
punpcklwd m5, m5
|
||||
punpcklwd m4, m4
|
||||
@@ -2157,7 +2159,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
|
||||
jne .csfl
|
||||
|
||||
%macro FGUV_32x32xN_LOOP 1 ; not-csfl
|
||||
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
|
||||
%else
|
||||
@@ -2183,10 +2185,18 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
test overlapd, overlapd
|
||||
jz %%no_vertical_overlap
|
||||
%if ARCH_X86_32
|
||||
%if %2
|
||||
movd m1, [base+pb_23_22]
|
||||
%else
|
||||
movd m1, [base+pb_27_17_17_27]
|
||||
%endif
|
||||
mova m0, [base+pw_1024]
|
||||
%else
|
||||
%if %2
|
||||
movd m1, [pb_23_22]
|
||||
%else
|
||||
movd m1, [pb_27_17_17_27]
|
||||
%endif
|
||||
mova m0, [pw_1024]
|
||||
%endif
|
||||
pshufd m1, m1, q0000
|
||||
@@ -2216,7 +2226,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
%define luma_bakq lumaq
|
||||
|
||||
mov wq, r4m
|
||||
%if %3
|
||||
shl r10mp, 1
|
||||
%endif
|
||||
%else
|
||||
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
|
||||
unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
|
||||
@@ -2226,7 +2238,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
|
||||
mov lumaq, r9mp
|
||||
lea src_bakq, [srcq+wq]
|
||||
lea luma_bakq, [lumaq+wq*2]
|
||||
lea luma_bakq, [lumaq+wq*(1+%2)]
|
||||
neg wq
|
||||
sub r0mp, srcq
|
||||
%if ARCH_X86_32
|
||||
@@ -2237,7 +2249,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
|
||||
%else
|
||||
mov r11mp, src_bakq
|
||||
mov r10mp, strideq
|
||||
mov r12mp, strideq
|
||||
%endif
|
||||
|
||||
%%loop_x:
|
||||
@@ -2266,8 +2278,8 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
ror offyd, 8
|
||||
shr offxd, 12
|
||||
and offyd, 0xf
|
||||
imul offyd, 82
|
||||
lea offyq, [offyq+offxq+498] ; offy*stride+offx
|
||||
imul offyd, 164>>%3
|
||||
lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
|
||||
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
|
||||
@@ -2276,6 +2288,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
|
||||
%endif
|
||||
|
||||
%%loop_x_odd:
|
||||
mov hd, r7m
|
||||
mov grain_lutq, grain_lutmp
|
||||
%%loop_y:
|
||||
@@ -2283,6 +2296,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
%if ARCH_X86_32
|
||||
mov lumaq, r9mp
|
||||
%endif
|
||||
%if %2
|
||||
mova m4, [lumaq+ 0]
|
||||
mova m6, [lumaq+16]
|
||||
mova m0, [srcq]
|
||||
@@ -2300,9 +2314,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
pmaddubsw m6, m7
|
||||
pavgw m4, m2
|
||||
pavgw m6, m2
|
||||
%else
|
||||
mova m4, [lumaq]
|
||||
mova m0, [srcq]
|
||||
%if ARCH_X86_32
|
||||
add lumaq, r10mp
|
||||
mov r9mp, lumaq
|
||||
%endif
|
||||
pxor m2, m2
|
||||
%endif
|
||||
|
||||
%if %1
|
||||
%if %2
|
||||
packuswb m4, m6 ; luma
|
||||
%endif
|
||||
punpckhbw m6, m4, m0
|
||||
punpcklbw m4, m0 ; { luma, chroma }
|
||||
pmaddubsw m6, m14
|
||||
@@ -2314,6 +2339,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
packuswb m4, m6 ; pack+unpack = clip
|
||||
punpckhbw m6, m4, m2
|
||||
punpcklbw m4, m2
|
||||
%elif %2 == 0
|
||||
punpckhbw m6, m4, m2
|
||||
punpcklbw m4, m2
|
||||
%endif
|
||||
|
||||
; scaling[luma_src]
|
||||
@@ -2364,8 +2392,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
add srcq, r2mp
|
||||
; we already incremented lumaq above
|
||||
%else
|
||||
add srcq, r10mp
|
||||
add srcq, r12mp
|
||||
%if %3
|
||||
lea lumaq, [lumaq+lstrideq*2]
|
||||
%else
|
||||
add lumaq, lstrideq
|
||||
%endif
|
||||
%endif
|
||||
add grain_lutq, 82
|
||||
dec hw
|
||||
@@ -2384,11 +2416,26 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
%else
|
||||
mov srcq, r11mp
|
||||
%endif
|
||||
lea lumaq, [luma_bakq+wq*2]
|
||||
lea lumaq, [luma_bakq+wq*(1+%2)]
|
||||
add srcq, wq
|
||||
%if ARCH_X86_32
|
||||
mov r4m, wq
|
||||
mov r9m, lumaq
|
||||
%endif
|
||||
%if %2 == 0
|
||||
; adjust top_offxy
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+8*mmsize+1*gprsize], 16
|
||||
%else
|
||||
add r11d, 16
|
||||
%endif
|
||||
add offxyd, 16
|
||||
btc dword r8m, 2
|
||||
jc %%loop_x_even
|
||||
test dword r8m, 2
|
||||
jz %%loop_x_odd
|
||||
jmp %%loop_x_odd_v_overlap
|
||||
%%loop_x_even:
|
||||
%endif
|
||||
test dword r8m, 1
|
||||
jz %%loop_x
|
||||
@@ -2400,8 +2447,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
; horizontal overlap (without vertical overlap)
|
||||
%%loop_x_h_overlap:
|
||||
%if ARCH_X86_32
|
||||
%if %2
|
||||
lea r6, [offxyd+16]
|
||||
mov [rsp+8*mmsize+0*gprsize], r6
|
||||
%else
|
||||
mov [rsp+8*mmsize+0*gprsize], offxyd
|
||||
%endif
|
||||
|
||||
DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
|
||||
|
||||
@@ -2410,7 +2461,11 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
|
||||
offx, offy, see, left_offxy, unused1, unused2, lstride
|
||||
|
||||
%if %2
|
||||
lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
|
||||
%else
|
||||
mov left_offxyd, offyd
|
||||
%endif
|
||||
%endif
|
||||
mov r6d, seed
|
||||
or seed, 0xEFF4
|
||||
@@ -2435,8 +2490,8 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
ror offyd, 8
|
||||
shr offxd, 12
|
||||
and offyd, 0xf
|
||||
imul offyd, 82
|
||||
lea offyq, [offyq+offxq+498] ; offy*stride+offx
|
||||
imul offyd, 164>>%3
|
||||
lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
|
||||
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
|
||||
@@ -2452,6 +2507,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
%if ARCH_X86_32
|
||||
mov lumaq, r9mp
|
||||
%endif
|
||||
%if %2
|
||||
mova m4, [lumaq+ 0]
|
||||
mova m6, [lumaq+16]
|
||||
mova m0, [srcq]
|
||||
@@ -2469,9 +2525,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
pmaddubsw m6, m7
|
||||
pavgw m4, m2
|
||||
pavgw m6, m2
|
||||
%else
|
||||
mova m4, [lumaq]
|
||||
mova m0, [srcq]
|
||||
%if ARCH_X86_32
|
||||
add lumaq, r10mp
|
||||
mov r9mp, lumaq
|
||||
%endif
|
||||
pxor m2, m2
|
||||
%endif
|
||||
|
||||
%if %1
|
||||
%if %2
|
||||
packuswb m4, m6 ; luma
|
||||
%endif
|
||||
punpckhbw m6, m4, m0
|
||||
punpcklbw m4, m0 ; { luma, chroma }
|
||||
pmaddubsw m6, m14
|
||||
@@ -2483,6 +2550,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
packuswb m4, m6 ; pack+unpack = clip
|
||||
punpckhbw m6, m4, m2
|
||||
punpcklbw m4, m2
|
||||
%elif %2 == 0
|
||||
punpckhbw m6, m4, m2
|
||||
punpcklbw m4, m2
|
||||
%endif
|
||||
|
||||
; scaling[luma_src]
|
||||
@@ -2547,8 +2617,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
add srcq, r2mp
|
||||
; lumaq has already been incremented above
|
||||
%else
|
||||
add srcq, r10mp
|
||||
add srcq, r12mp
|
||||
%if %3
|
||||
lea lumaq, [lumaq+lstrideq*2]
|
||||
%else
|
||||
add lumaq, lstrideq
|
||||
%endif
|
||||
%endif
|
||||
add grain_lutq, 82
|
||||
dec hw
|
||||
@@ -2567,17 +2641,32 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
%else
|
||||
mov srcq, r11mp
|
||||
%endif
|
||||
lea lumaq, [luma_bakq+wq*2]
|
||||
lea lumaq, [luma_bakq+wq*(1+%2)]
|
||||
add srcq, wq
|
||||
%if ARCH_X86_32
|
||||
mov r4m, wq
|
||||
mov r9m, lumaq
|
||||
%endif
|
||||
%if %2 == 0
|
||||
xor dword r8m, 4
|
||||
; adjust top_offxyd
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+8*mmsize+1*gprsize], 16
|
||||
%else
|
||||
add r11d, 16
|
||||
%endif
|
||||
add offxyd, 16
|
||||
%endif
|
||||
|
||||
; r8m = sbym
|
||||
test dword r8m, 2
|
||||
%if %2
|
||||
jne %%loop_x_hv_overlap
|
||||
jmp %%loop_x_h_overlap
|
||||
%else
|
||||
jne %%loop_x_odd_v_overlap
|
||||
jmp %%loop_x_odd
|
||||
%endif
|
||||
|
||||
%%end:
|
||||
RET
|
||||
@@ -2612,7 +2701,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
|
||||
mov r3m, seed
|
||||
mov wq, r4m
|
||||
%if %3
|
||||
shl r10mp, 1
|
||||
%endif
|
||||
%else
|
||||
xor seed, sbyd ; (cur_seed << 16) | top_seed
|
||||
|
||||
@@ -2624,7 +2715,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
|
||||
mov lumaq, r9mp
|
||||
lea src_bakq, [srcq+wq]
|
||||
lea luma_bakq, [lumaq+wq*2]
|
||||
lea luma_bakq, [lumaq+wq*(1+%2)]
|
||||
neg wq
|
||||
sub r0mp, srcq
|
||||
%if ARCH_X86_32
|
||||
@@ -2635,7 +2726,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
|
||||
%else
|
||||
mov r11mp, src_bakq
|
||||
mov r10mp, strideq
|
||||
mov r12mp, strideq
|
||||
%endif
|
||||
|
||||
%%loop_x_v_overlap:
|
||||
@@ -2674,9 +2765,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
ror offxd, 12
|
||||
and offyd, 0xf000f
|
||||
and offxd, 0xf000f
|
||||
imul offyd, 82
|
||||
imul offyd, 164>>%3
|
||||
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
|
||||
lea offyq, [offyq+offxq+0x10001*498+16*82]
|
||||
lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
|
||||
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
|
||||
@@ -2693,12 +2784,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
|
||||
%endif
|
||||
|
||||
%%loop_x_odd_v_overlap:
|
||||
mov hd, r7m
|
||||
mov grain_lutq, grain_lutmp
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
mova m1, [base+pb_27_17]
|
||||
%else
|
||||
mova m1, [pb_27_17]
|
||||
%endif
|
||||
%%loop_y_v_overlap:
|
||||
%if ARCH_X86_32
|
||||
mov lumaq, r9mp
|
||||
%endif
|
||||
%if %2
|
||||
mova m4, [lumaq+ 0]
|
||||
mova m6, [lumaq+16]
|
||||
mova m0, [srcq]
|
||||
@@ -2716,9 +2815,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
pmaddubsw m6, m7
|
||||
pavgw m4, m2
|
||||
pavgw m6, m2
|
||||
%else
|
||||
mova m4, [lumaq]
|
||||
mova m0, [srcq]
|
||||
%if ARCH_X86_32
|
||||
add lumaq, r10mp
|
||||
mov r9mp, lumaq
|
||||
%endif
|
||||
pxor m2, m2
|
||||
%endif
|
||||
|
||||
%if %1
|
||||
%if %2
|
||||
packuswb m4, m6 ; luma
|
||||
%endif
|
||||
punpckhbw m6, m4, m0
|
||||
punpcklbw m4, m0 ; { luma, chroma }
|
||||
pmaddubsw m6, m14
|
||||
@@ -2730,6 +2840,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
packuswb m4, m6 ; pack+unpack = clip
|
||||
punpckhbw m6, m4, m2
|
||||
punpcklbw m4, m2
|
||||
%elif %2 == 0
|
||||
punpckhbw m6, m4, m2
|
||||
punpcklbw m4, m2
|
||||
%endif
|
||||
|
||||
; scaling[luma_src]
|
||||
@@ -2740,10 +2853,10 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
vpgatherdw m7, m4, scalingq, r12, r2
|
||||
vpgatherdw m5, m6, scalingq, r12, r2
|
||||
%endif
|
||||
pcmpeqw m1, m1
|
||||
psrlw m1, 8
|
||||
pand m7, m1
|
||||
pand m5, m1
|
||||
pcmpeqw m4, m4
|
||||
psrlw m4, 8
|
||||
pand m7, m4
|
||||
pand m5, m4
|
||||
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
movu m3, [grain_lutq+offxyq]
|
||||
@@ -2753,17 +2866,22 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
%else
|
||||
movu m4, [grain_lutq+top_offxyq]
|
||||
%endif
|
||||
punpckhbw m1, m4, m3
|
||||
punpckhbw m6, m4, m3
|
||||
punpcklbw m4, m3
|
||||
pmaddubsw m2, m9, m1
|
||||
%if %3
|
||||
pmaddubsw m2, m9, m6
|
||||
pmaddubsw m3, m9, m4
|
||||
%else
|
||||
pmaddubsw m2, m1, m6
|
||||
pmaddubsw m3, m1, m4
|
||||
%endif
|
||||
pmulhrsw m2, m8
|
||||
pmulhrsw m3, m8
|
||||
packsswb m3, m2
|
||||
pxor m1, m1
|
||||
pcmpgtb m1, m3
|
||||
punpcklbw m2, m3, m1
|
||||
punpckhbw m3, m1
|
||||
pxor m6, m6
|
||||
pcmpgtb m6, m3
|
||||
punpcklbw m2, m3, m6
|
||||
punpckhbw m3, m6
|
||||
|
||||
; noise = round2(scaling[luma_src] * grain, scaling_shift)
|
||||
pmullw m2, m7
|
||||
@@ -2773,7 +2891,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
|
||||
; unpack chroma_source
|
||||
pxor m4, m4
|
||||
punpckhbw m1, m0, m4
|
||||
punpckhbw m6, m0, m4
|
||||
punpcklbw m0, m4 ; m0-1: src as word
|
||||
|
||||
%if ARCH_X86_32
|
||||
@@ -2782,12 +2900,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
|
||||
; dst = clip_pixel(src, noise)
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
paddw m6, m3
|
||||
pmaxsw m0, m13
|
||||
pmaxsw m1, m13
|
||||
pmaxsw m6, m13
|
||||
pminsw m0, m12
|
||||
pminsw m1, m12
|
||||
packuswb m0, m1
|
||||
pminsw m6, m12
|
||||
packuswb m0, m6
|
||||
movifnidn dstq, dstmp
|
||||
mova [dstq+srcq], m0
|
||||
|
||||
@@ -2797,10 +2915,24 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
add srcq, r2mp
|
||||
; lumaq has already been incremented above
|
||||
%else
|
||||
add srcq, r10mp
|
||||
add srcq, r12mp
|
||||
%if %3
|
||||
lea lumaq, [lumaq+lstrideq*2]
|
||||
%else
|
||||
add lumaq, lstrideq
|
||||
%endif
|
||||
%endif
|
||||
add grain_lutq, 82
|
||||
%if %3 == 0
|
||||
btc hd, 16
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
mova m1, [base+pb_17_27]
|
||||
%else
|
||||
mova m1, [pb_17_27]
|
||||
%endif
|
||||
jnc %%loop_y_v_overlap
|
||||
%endif
|
||||
jmp %%loop_y
|
||||
|
||||
%%end_y_v_overlap:
|
||||
@@ -2817,25 +2949,40 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
%else
|
||||
mov srcq, r11mp
|
||||
%endif
|
||||
lea lumaq, [luma_bakq+wq*2]
|
||||
lea lumaq, [luma_bakq+wq*(1+%2)]
|
||||
add srcq, wq
|
||||
%if ARCH_X86_32
|
||||
mov r4m, wq
|
||||
mov r9m, lumaq
|
||||
%endif
|
||||
|
||||
%if %2
|
||||
; since fg_dataq.overlap is guaranteed to be set, we never jump
|
||||
; back to .loop_x_v_overlap, and instead always fall-through to
|
||||
; h+v overlap
|
||||
%else
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+8*mmsize+1*gprsize], 16
|
||||
%else
|
||||
add top_offxyd, 16
|
||||
%endif
|
||||
add offxyd, 16
|
||||
btc dword r8m, 2
|
||||
jnc %%loop_x_odd_v_overlap
|
||||
%endif
|
||||
|
||||
%%loop_x_hv_overlap:
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
|
||||
|
||||
mov r6, [rsp+8*mmsize+1*gprsize]
|
||||
%if %2
|
||||
lea r0, [r3d+16]
|
||||
add r6, 16
|
||||
mov [rsp+8*mmsize+0*gprsize], r0 ; left_offxy
|
||||
%else
|
||||
mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
|
||||
%endif
|
||||
mov [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy
|
||||
|
||||
DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
|
||||
@@ -2846,8 +2993,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
|
||||
tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
|
||||
|
||||
%if %2
|
||||
lea topleft_offxyq, [top_offxyq+16]
|
||||
lea left_offxyq, [offxyq+16]
|
||||
%else
|
||||
mov topleft_offxyq, top_offxyq
|
||||
mov left_offxyq, offxyq
|
||||
%endif
|
||||
|
||||
; we assume from the block above that bits 8-15 of tmpd are zero'ed
|
||||
%endif
|
||||
@@ -2881,9 +3033,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
ror offxd, 12
|
||||
and offyd, 0xf000f
|
||||
and offxd, 0xf000f
|
||||
imul offyd, 82
|
||||
imul offyd, 164>>%3
|
||||
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
|
||||
lea offyq, [offyq+offxq+0x10001*498+16*82]
|
||||
lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
|
||||
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
|
||||
@@ -2900,6 +3052,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
|
||||
mov hd, r7m
|
||||
mov grain_lutq, grain_lutmp
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
mova m3, [base+pb_27_17]
|
||||
%else
|
||||
mova m3, [pb_27_17]
|
||||
%endif
|
||||
%%loop_y_hv_overlap:
|
||||
; src
|
||||
%if ARCH_X86_32
|
||||
@@ -2907,6 +3065,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
|
||||
mov lumaq, r9mp
|
||||
%endif
|
||||
%if %2
|
||||
mova m4, [lumaq+ 0]
|
||||
mova m6, [lumaq+16]
|
||||
mova m0, [srcq]
|
||||
@@ -2924,9 +3083,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
pmaddubsw m6, m7
|
||||
pavgw m4, m2
|
||||
pavgw m6, m2
|
||||
%else
|
||||
mova m4, [lumaq]
|
||||
mova m0, [srcq]
|
||||
%if ARCH_X86_32
|
||||
add lumaq, r10mp
|
||||
mov r9mp, lumaq
|
||||
%endif
|
||||
pxor m2, m2
|
||||
%endif
|
||||
|
||||
%if %1
|
||||
%if %2
|
||||
packuswb m4, m6 ; luma
|
||||
%endif
|
||||
punpckhbw m6, m4, m0
|
||||
punpcklbw m4, m0 ; { luma, chroma }
|
||||
pmaddubsw m6, m14
|
||||
@@ -2938,6 +3108,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
packuswb m4, m6 ; pack+unpack = clip
|
||||
punpckhbw m6, m4, m2
|
||||
punpcklbw m4, m2
|
||||
%elif %2 == 0
|
||||
punpckhbw m6, m4, m2
|
||||
punpcklbw m4, m2
|
||||
%endif
|
||||
|
||||
; scaling[src]
|
||||
@@ -2946,8 +3119,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
vpgatherdw m5, m6, scalingq, r0, r5
|
||||
%else
|
||||
movd m1, [grain_lutq+topleft_offxyq]
|
||||
%if %3
|
||||
vpgatherdw m7, m4, scalingq, r2, r12
|
||||
vpgatherdw m5, m6, scalingq, r2, r12
|
||||
%else
|
||||
vpgatherdw m7, m4, scalingq, r2, r13
|
||||
vpgatherdw m5, m6, scalingq, r2, r13
|
||||
%endif
|
||||
%endif
|
||||
pcmpeqw m2, m2
|
||||
psrlw m2, 8
|
||||
@@ -2961,7 +3139,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
movd m1, [grain_lutq+r0]
|
||||
mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy
|
||||
%endif
|
||||
movu m3, [grain_lutq+offxyq]
|
||||
movu m2, [grain_lutq+offxyq]
|
||||
%if ARCH_X86_32
|
||||
movu m6, [grain_lutq+r5]
|
||||
movd m4, [grain_lutq+r0]
|
||||
@@ -2971,23 +3149,32 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
%endif
|
||||
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
|
||||
punpcklbw m1, m6
|
||||
punpcklbw m4, m3
|
||||
punpcklbw m4, m2
|
||||
%if %2
|
||||
punpcklwd m4, m1
|
||||
%else
|
||||
punpckldq m4, m1
|
||||
%endif
|
||||
pmaddubsw m1, m9, m4
|
||||
pmulhrsw m1, m8
|
||||
packsswb m1, m1
|
||||
pandn m4, m10, m3
|
||||
pandn m3, m10, m6
|
||||
psrldq m6, m1, 1
|
||||
pandn m4, m10, m2
|
||||
pandn m2, m10, m6
|
||||
psrldq m6, m1, 2-%2
|
||||
pand m1, m10
|
||||
pand m6, m10
|
||||
por m4, m1
|
||||
por m3, m6
|
||||
por m2, m6
|
||||
; followed by v interpolation (top | cur -> cur)
|
||||
punpckhbw m1, m3, m4
|
||||
punpcklbw m3, m4
|
||||
punpckhbw m1, m2, m4
|
||||
punpcklbw m2, m4
|
||||
%if %3
|
||||
pmaddubsw m4, m9, m1
|
||||
pmaddubsw m1, m9, m3
|
||||
pmaddubsw m1, m9, m2
|
||||
%else
|
||||
pmaddubsw m4, m3, m1
|
||||
pmaddubsw m1, m3, m2
|
||||
%endif
|
||||
pmulhrsw m4, m8
|
||||
pmulhrsw m1, m8
|
||||
packsswb m1, m4
|
||||
@@ -3008,17 +3195,17 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
|
||||
; unpack chroma source
|
||||
pxor m4, m4
|
||||
punpckhbw m3, m0, m4
|
||||
punpckhbw m5, m0, m4
|
||||
punpcklbw m0, m4 ; m0-1: src as word
|
||||
|
||||
; dst = clip_pixel(src, noise)
|
||||
paddw m0, m2
|
||||
paddw m3, m1
|
||||
paddw m5, m1
|
||||
pmaxsw m0, m13
|
||||
pmaxsw m3, m13
|
||||
pmaxsw m5, m13
|
||||
pminsw m0, m12
|
||||
pminsw m3, m12
|
||||
packuswb m0, m3
|
||||
pminsw m5, m12
|
||||
packuswb m0, m5
|
||||
movifnidn dstq, dstmp
|
||||
mova [dstq+srcq], m0
|
||||
|
||||
@@ -3026,12 +3213,36 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
add srcq, r2mp
|
||||
; lumaq has been adjusted above already
|
||||
%else
|
||||
add srcq, r10mp
|
||||
lea lumaq, [lumaq+lstrideq*2]
|
||||
add srcq, r12mp
|
||||
%if %3
|
||||
lea lumaq, [lumaq+lstrideq*(1+%2)]
|
||||
%else
|
||||
add lumaq, r10mp
|
||||
%endif
|
||||
%endif
|
||||
add grain_lutq, 82
|
||||
dec hw
|
||||
%if %3
|
||||
jg %%loop_y_h_overlap
|
||||
%else
|
||||
jle %%end_y_hv_overlap
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
mova m3, [base+pb_17_27]
|
||||
%else
|
||||
mova m3, [pb_17_27]
|
||||
%endif
|
||||
btc hd, 16
|
||||
jnc %%loop_y_hv_overlap
|
||||
%if ARCH_X86_64
|
||||
mov lstrideq, r10mp
|
||||
%endif
|
||||
jmp %%loop_y_h_overlap
|
||||
%%end_y_hv_overlap:
|
||||
%if ARCH_X86_64
|
||||
mov lstrideq, r10mp
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
|
||||
@@ -3046,18 +3257,44 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
%else
|
||||
mov srcq, r11mp
|
||||
%endif
|
||||
lea lumaq, [luma_bakq+wq*2]
|
||||
lea lumaq, [luma_bakq+wq*(1+%2)]
|
||||
add srcq, wq
|
||||
%if ARCH_X86_32
|
||||
mov r4m, wq
|
||||
mov r9m, lumaq
|
||||
%endif
|
||||
%if %2
|
||||
jmp %%loop_x_hv_overlap
|
||||
%else
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+8*mmsize+1*gprsize], 16
|
||||
%else
|
||||
add top_offxyd, 16
|
||||
%endif
|
||||
add offxyd, 16
|
||||
xor dword r8m, 4
|
||||
jmp %%loop_x_odd_v_overlap
|
||||
%endif
|
||||
|
||||
%%end_hv:
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
FGUV_32x32xN_LOOP 1
|
||||
%%FGUV_32x32xN_LOOP 1, %2, %3
|
||||
.csfl:
|
||||
FGUV_32x32xN_LOOP 0
|
||||
%%FGUV_32x32xN_LOOP 0, %2, %3
|
||||
%endmacro
|
||||
|
||||
FGUV_FN 420, 1, 1
|
||||
|
||||
%if STACK_ALIGNMENT < mmsize
|
||||
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
|
||||
%endif
|
||||
|
||||
FGUV_FN 422, 1, 0
|
||||
|
||||
%if STACK_ALIGNMENT < mmsize
|
||||
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
|
||||
%endif
|
||||
|
||||
FGUV_FN 444, 0, 0
|
||||
|
||||
Reference in New Issue
Block a user